tools/binary_size/analyze.py - Issue 2778963003: Revert of V2 of //tools/binary_size rewrite (diffs).

Side by Side Diff: tools/binary_size/analyze.py

Issue 2778963003: Revert of V2 of //tools/binary_size rewrite (diffs). (Closed)

Patch Set: Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright 2017 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Main Python API for analyzing binary size."""

	7

	8 import argparse

	9 import ast

	10 import distutils.spawn

	11 import gzip

	12 import logging

	13 import os

	14 import re

	15 import subprocess

	16

	17 import function_signature

	18 import helpers

	19 import mapfileparser

	20 import symbols

	21

	22

# File format version for .size files. _SaveResult() writes it as the first
# line of the file and _LoadResults() refuses to read a file whose first line
# holds a different value.
_SERIALIZATION_VERSION = 1

	25

	26

	27 def _OpenMaybeGz(path, mode=None):

	28 """Calls `gzip.open()` if \|path\| ends in ".gz", otherwise calls `open()`."""

	29 if path.endswith('.gz'):

	30 if mode and 'w' in mode:

	31 return gzip.GzipFile(path, mode, 1)

	32 return gzip.open(path, mode)

	33 return open(path, mode or 'r')

	34

	35

	36 def _EndsWithMaybeGz(path, suffix):

	37 return path.endswith(suffix) or path.endswith(suffix + '.gz')

	38

	39

	40 def _IterLines(s):

	41 prev_idx = -1

	42 while True:

	43 idx = s.find('\n', prev_idx + 1)

	44 if idx == -1:

	45 return

	46 yield s[prev_idx + 1:idx]

	47 prev_idx = idx

	48

	49

	50 def _UnmangleRemainingSymbols(symbol_group, tool_prefix):

	51 """Uses c++filt to unmangle any symbols that need it."""

	52 to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')]

	53 if not to_process:

	54 return

	55

	56 logging.info('Unmangling %d names', len(to_process))

	57 proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,

	58 stdout=subprocess.PIPE)

	59 stdout = proc.communicate('\n'.join(s.name for s in to_process))[0]

	60 assert proc.returncode == 0

	61

	62 for i, line in enumerate(_IterLines(stdout)):

	63 to_process[i].name = line

	64

	65

	66 def _NormalizeNames(symbol_group):

	67 """Ensures that all names are formatted in a useful way.

	68

	69 This includes:

	70 - Assigning of \|function_signature\| (for functions).

	71 - Stripping of return types in \|function_signature\| and \|name\|.

	72 - Stripping parameters from \|name\|.

	73 - Moving "vtable for" and the like to be suffixes rather than prefixes.

	74 """

	75 found_prefixes = set()

	76 for symbol in symbol_group:

	77 if not symbol.name or symbol.name.startswith('*'):

	78 # See comment in _RemoveDuplicatesAndCalculatePadding() about when this

	79 # can happen.

	80 continue

	81

	82 # E.g.: vtable for FOO

	83 idx = symbol.name.find(' for ', 0, 30)

	84 if idx != -1:

	85 found_prefixes.add(symbol.name[:idx + 4])

	86 symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'

	87

	88 # E.g.: virtual thunk to FOO

	89 idx = symbol.name.find(' to ', 0, 30)

	90 if idx != -1:

	91 found_prefixes.add(symbol.name[:idx + 3])

	92 symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'

	93

	94 # Strip out return type, and identify where parameter list starts.

	95 if symbol.section == 't':

	96 symbol.function_signature, symbol.name = (

	97 function_signature.Parse(symbol.name))

	98

	99 # Remove anonymous namespaces (they just harm clustering).

	100 symbol.name = symbol.name.replace('(anonymous namespace)::', '')

	101

	102 logging.debug('Found name prefixes of: %r', found_prefixes)

	103

	104

	105 def _NormalizeObjectPaths(symbol_group):

	106 """Ensures that all paths are formatted in a useful way."""

	107 for symbol in symbol_group:

	108 if symbol.path:

	109 if symbol.path.startswith('obj/'):

	110 # Convert obj/third_party/... -> third_party/...

	111 symbol.path = symbol.path[4:]

	112 elif symbol.path.startswith('../../'):

	113 # Convert ../../third_party/... -> third_party/...

	114 symbol.path = symbol.path[6:]

	115 if symbol.path.endswith(')'):

	116 # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o

	117 start_idx = symbol.path.index('(')

	118 paren_path = symbol.path[start_idx + 1:-1]

	119 symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path

	120

	121

	122 def _RemoveDuplicatesAndCalculatePadding(symbol_group):

	123 """Removes symbols at the same address and calculates the \|padding\| field.

	124

	125 Symbols must already be sorted by \|address\|.

	126 """

	127 i = 0

	128 to_remove = set()

	129 all_symbols = symbol_group.symbols

	130 for i in xrange(len(all_symbols)):

	131 prev_symbol = all_symbols[i - 1]

	132 symbol = all_symbols[i]

	133 if prev_symbol.section_name is not symbol.section_name:

	134 continue

	135 if symbol.address > 0 and prev_symbol.address > 0:

	136 # Fold symbols that are at the same address (happens in nm output).

	137 if symbol.address == prev_symbol.address:

	138 symbol.size = max(prev_symbol.size, symbol.size)

	139 to_remove.add(i)

	140 continue

	141 # Even with symbols at the same address removed, overlaps can still

	142 # happen. In this case, padding will be negative (and this is fine).

	143 padding = symbol.address - prev_symbol.end_address

	144 # These thresholds were found by manually auditing arm32 Chrome.

	145 # E.g.: Set them to 0 and see what warnings get logged.

	146 # TODO(agrieve): See if these thresholds make sense for architectures

	147 # other than arm32.

	148 if (symbol.section in 'rd' and padding >= 256 or

	149 symbol.section in 't' and padding >= 64):

	150 # For nm data, this is caused by data that has no associated symbol.

	151 # The linker map file lists them with no name, but with a file.

	152 # Example:

	153 # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o

	154 # Where as most look like:

	155 # .data.MANGLED_NAME...

	156 logging.debug('Large padding of %d between:\n A) %r\n B) %r' % (

	157 padding, prev_symbol, symbol))

	158 continue

	159 symbol.padding = padding

	160 symbol.size += padding

	161 assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol

	162 # Map files have no overlaps, so worth special-casing the no-op case.

	163 if to_remove:

	164 logging.info('Removing %d overlapping symbols', len(to_remove))

	165 symbol_group.symbols = (

	166 [s for i, s in enumerate(all_symbols) if i not in to_remove])

	167

	168

	169 def _PrintStats(result, write_func):

	170 """Prints out how accurate \|result\| is."""

	171 for section in symbols.SECTION_TO_SECTION_NAME:

	172 if section == 'd':

	173 expected_size = sum(v for k, v in result.section_sizes.iteritems()

	174 if k.startswith('.data'))

	175 else:

	176 expected_size = result.section_sizes[

	177 symbols.SECTION_TO_SECTION_NAME[section]]

	178

	179 def one_stat(group):

	180 template = ('Section %s has %.1f%% of %d bytes accounted for from '

	181 '%d symbols. %d bytes are unaccounted for. Padding '

	182 'accounts for %d bytes\n')

	183 actual_size = group.size

	184 count = len(group)

	185 padding = group.padding

	186 size_percent = 100.0 * actual_size / expected_size

	187 return (template % (section, size_percent, actual_size, count,

	188 expected_size - actual_size, padding))

	189

	190 in_section = result.symbol_group.WhereInSection(section)

	191 write_func(one_stat(in_section))

	192

	193 star_syms = in_section.WhereNameMatches(r'^\*')

	194 attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()

	195 anonymous_syms = attributed_syms.Inverted()

	196 if star_syms or anonymous_syms:

	197 missing_size = star_syms.size + anonymous_syms.size

	198 write_func(('+ Without %d merge sections and %d anonymous entries ('

	199 'accounting for %d bytes):\n') % (

	200 len(star_syms), len(anonymous_syms), missing_size))

	201 write_func('+ ' + one_stat(attributed_syms))

	202

	203

	204 def _SaveResult(result, file_obj):

	205 """Saves the result to the given file object."""

	206 # Store one bucket per line.

	207 file_obj.write('%d\n' % _SERIALIZATION_VERSION)

	208 file_obj.write('%r\n' % result.section_sizes)

	209 file_obj.write('%d\n' % len(result.symbol_group))

	210 prev_section_name = None

	211 # Store symbol fields as tab-separated.

	212 # Store only non-derived fields.

	213 for symbol in result.symbol_group:

	214 if symbol.section_name != prev_section_name:

	215 file_obj.write('%s\n' % symbol.section_name)

	216 prev_section_name = symbol.section_name

	217 # Don't write padding nor name since these are derived values.

	218 file_obj.write('%x\t%x\t%s\t%s\n' % (

	219 symbol.address, symbol.size_without_padding,

	220 symbol.function_signature or symbol.name or '',

	221 symbol.path or ''))

	222

	223

	224 def _LoadResults(file_obj):

	225 """Loads a result from the given file."""

	226 lines = iter(file_obj)

	227 actual_version = int(next(lines))

	228 assert actual_version == _SERIALIZATION_VERSION, (

	229 'Version mismatch. Need to write some upgrade code.')

	230

	231 section_sizes = ast.literal_eval(next(lines))

	232 num_syms = int(next(lines))

	233 symbol_list = [None] * num_syms

	234 section_name = None

	235 for i in xrange(num_syms):

	236 line = next(lines)[:-1]

	237 if '\t' not in line:

	238 section_name = intern(line)

	239 line = next(lines)[:-1]

	240 new_sym = symbols.Symbol.__new__(symbols.Symbol)

	241 parts = line.split('\t')

	242 new_sym.section_name = section_name

	243 new_sym.address = int(parts[0], 16)

	244 new_sym.size = int(parts[1], 16)

	245 new_sym.name = parts[2] or None

	246 new_sym.path = parts[3] or None

	247 new_sym.padding = 0 # Derived

	248 new_sym.function_signature = None # Derived

	249 symbol_list[i] = new_sym

	250

	251 # Recompute derived values (padding and function names).

	252 result = mapfileparser.ParseResult(symbol_list, section_sizes)

	253 logging.info('Calculating padding')

	254 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)

	255 logging.info('Deriving signatures')

	256 # Re-parse out function parameters.

	257 _NormalizeNames(result.symbol_group.WhereInSection('t'))

	258 return result

	259

	260

	261 def AddOptions(parser):

	262 parser.add_argument('input_file',

	263 help='Path to input file. Can be a linker .map file, an '

	264 'unstripped binary, or a saved result from '

	265 'analyze.py')

	266 parser.add_argument('--tool-prefix', default='',

	267 help='Path prefix for c++filt.')

	268 parser.add_argument('--output-directory',

	269 help='Path to the root build directory.')

	270

	271

	272 def _DetectToolPrefix(tool_prefix, input_file, output_directory=None):

	273 """Calls Analyze with values from args."""

	274 if not output_directory:

	275 abs_path = os.path.abspath(input_file)

	276 release_idx = abs_path.find('Release')

	277 if release_idx != -1:

	278 output_directory = abs_path[:release_idx] + 'Release'

	279 output_directory = os.path.relpath(abs_path[:release_idx] + '/Release')

	280 logging.debug('Detected --output-directory=%s', output_directory)

	281

	282 if not tool_prefix and output_directory:

	283 # Auto-detect from build_vars.txt

	284 build_vars_path = os.path.join(output_directory, 'build_vars.txt')

	285 if os.path.exists(build_vars_path):

	286 with open(build_vars_path) as f:

	287 build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l)

	288 logging.debug('Found --tool-prefix from build_vars.txt')

	289 tool_prefix = build_vars['android_tool_prefix']

	290

	291 if os.path.sep not in tool_prefix:

	292 full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt')

	293 else:

	294 full_path = tool_prefix + 'c++filt'

	295

	296 if not os.path.isfile(full_path):

	297 raise Exception('Bad --tool-prefix. Path not found: %s' % full_path)

	298 logging.info('Using --tool-prefix=%s', tool_prefix)

	299 return tool_prefix

	300

	301

	302 def AnalyzeWithArgs(args):

	303 return Analyze(args.input_file, args.output_directory, args.tool_prefix)

	304

	305

	306 def Analyze(path, output_directory=None, tool_prefix=''):

	307 if _EndsWithMaybeGz(path, '.size'):

	308 logging.info('Loading cached results.')

	309 with _OpenMaybeGz(path) as f:

	310 result = _LoadResults(f)

	311 elif not _EndsWithMaybeGz(path, '.map'):

	312 raise Exception('Expected input to be a .map or a .size')

	313 else:

	314 # Verify tool_prefix early.

	315 tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

	316

	317 with _OpenMaybeGz(path) as map_file:

	318 result = mapfileparser.MapFileParser().Parse(map_file)

	319

	320 # Map file for some reason doesn't unmangle all names.

	321 logging.info('Calculating padding')

	322 _RemoveDuplicatesAndCalculatePadding(result.symbol_group)

	323 # Unmangle prints its own log statement.

	324 _UnmangleRemainingSymbols(result.symbol_group, tool_prefix)

	325 # Resolve paths prints its own log statement.

	326 logging.info('Normalizing names')

	327 _NormalizeNames(result.symbol_group)

	328 logging.info('Normalizing paths')

	329 _NormalizeObjectPaths(result.symbol_group)

	330

	331 if logging.getLogger().isEnabledFor(logging.INFO):

	332 _PrintStats(result, lambda l: logging.info(l.rstrip()))

	333 logging.info('Finished analyzing %d symbols', len(result.symbol_group))

	334 return result

	335

	336

	337 def main():

	338 parser = argparse.ArgumentParser()

	339 parser.add_argument('--output', required=True,

	340 help='Path to store results. Must end in .size or '

	341 '.size.gz')

	342 AddOptions(parser)

	343 args = helpers.AddCommonOptionsAndParseArgs(parser)

	344 if not _EndsWithMaybeGz(args.output, '.size'):

	345 raise Exception('--output must end with .size or .size.gz')

	346

	347 result = AnalyzeWithArgs(args)

	348 logging.info('Saving result to %s', args.output)

	349 with _OpenMaybeGz(args.output, 'wb') as f:

	350 _SaveResult(result, f)

	351

	352 logging.info('Done')

	353

	354

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()

OLD	NEW

« no previous file with comments | « tools/binary_size/README.md ('k') | tools/binary_size/binary_size_utils.py » ('j') | no next file with comments »