Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/env python | |
| 2 # Copyright 2017 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 """Main Python API for analyzing binary size.""" | |
| 7 | |
| 8 import argparse | |
| 9 import ast | |
| 10 import distutils.spawn | |
| 11 import gzip | |
| 12 import logging | |
| 13 import os | |
| 14 import re | |
| 15 import subprocess | |
| 16 | |
| 17 import parsers | |
| 18 import helpers | |
| 19 import symbols | |
| 20 | |
| 21 | |
# File format version for .size files. _SaveResult() writes it as the first
# line of the file and _LoadResults() refuses to read any other value, so it
# must be bumped whenever the serialized layout changes.
_SERIALIZATION_VERSION = 1
| 24 | |
| 25 | |
| 26 def _OpenMaybeGz(path, mode=None): | |
| 27 """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`.""" | |
| 28 if path.endswith('.gz'): | |
| 29 if mode and 'w' in mode: | |
| 30 return gzip.GzipFile(path, mode, 1) | |
| 31 return gzip.open(path, mode) | |
| 32 return open(path, mode or 'r') | |
| 33 | |
| 34 | |
| 35 def _EndsWithMaybeGz(path, suffix): | |
| 36 return path.endswith(suffix) or path.endswith(suffix + '.gz') | |
| 37 | |
| 38 | |
| 39 def _IterLines(s): | |
| 40 prev_idx = -1 | |
| 41 while True: | |
| 42 idx = s.find('\n', prev_idx + 1) | |
| 43 if idx == -1: | |
| 44 return | |
| 45 yield s[prev_idx + 1:idx] | |
| 46 prev_idx = idx | |
| 47 | |
| 48 | |
| 49 def _UnmangleRemainingSymbols(symbol_group, tool_prefix): | |
| 50 """Uses c++filt to unmangle any symbols that need it.""" | |
| 51 to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')] | |
| 52 if not to_process: | |
| 53 return | |
| 54 | |
| 55 logging.info('Unmangling %d names', len(to_process)) | |
| 56 proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE, | |
| 57 stdout=subprocess.PIPE) | |
| 58 stdout = proc.communicate('\n'.join(s.name for s in to_process))[0] | |
| 59 assert proc.returncode == 0 | |
| 60 | |
| 61 for i, line in enumerate(_IterLines(stdout)): | |
| 62 to_process[i].name = line | |
| 63 | |
| 64 | |
| 65 def _FindParameterListParen(name): | |
| 66 """Finds index of the "(" that denotes the start of a paremeter list.""" | |
| 67 # This loops from left-to-right, but the only reason (I think) that this | |
| 68 # is necessary (rather than reusing _FindLastCharOutsideOfBrackets), is | |
| 69 # to capture the outer-most function in the case where classes are nested. | |
| 70 start_idx = 0 | |
| 71 while True: | |
| 72 template_balance_count = 0 | |
| 73 paren_balance_count = 0 | |
| 74 while True: | |
| 75 idx = name.find('(', start_idx) | |
| 76 if idx == -1: | |
| 77 return -1 | |
| 78 template_balance_count += ( | |
| 79 name.count('<', start_idx, idx) - name.count('>', start_idx, idx)) | |
| 80 # Special: operators with angle brackets. | |
| 81 operator_idx = name.find('operator<', start_idx, idx) | |
| 82 if operator_idx != -1: | |
| 83 if name[operator_idx + 9] == '<': | |
| 84 template_balance_count -= 2 | |
| 85 else: | |
| 86 template_balance_count -= 1 | |
| 87 else: | |
| 88 operator_idx = name.find('operator>', start_idx, idx) | |
| 89 if operator_idx != -1: | |
| 90 if name[operator_idx + 9] == '>': | |
| 91 template_balance_count += 2 | |
| 92 else: | |
| 93 template_balance_count += 1 | |
| 94 | |
| 95 paren_balance_count += ( | |
| 96 name.count('(', start_idx, idx) - name.count(')', start_idx, idx)) | |
| 97 if template_balance_count == 0 and paren_balance_count == 0: | |
| 98 # Special case: skip "(anonymous namespace)". | |
| 99 if -1 != name.find('(anonymous namespace)', idx, idx + 21): | |
| 100 start_idx = idx + 21 | |
| 101 continue | |
| 102 # Special case: skip "decltype (...)" | |
| 103 if name[idx - 1] != ' ': | |
| 104 return idx | |
| 105 start_idx = idx + 1 | |
| 106 paren_balance_count += 1 | |
| 107 | |
| 108 | |
| 109 def _FindLastCharOutsideOfBrackets(name, target_char, prev_idx=None): | |
| 110 paren_balance_count = 0 | |
| 111 template_balance_count = 0 | |
| 112 while True: | |
| 113 idx = name.rfind(target_char, 0, prev_idx) | |
| 114 if idx == -1: | |
| 115 return -1 | |
| 116 # It is much faster to use.find() and.count() than to loop over each | |
| 117 # character. | |
| 118 template_balance_count += ( | |
| 119 name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx)) | |
| 120 paren_balance_count += ( | |
| 121 name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx)) | |
| 122 if template_balance_count == 0 and paren_balance_count == 0: | |
| 123 return idx | |
| 124 prev_idx = idx | |
| 125 | |
| 126 | |
def _ParseFunctionSignature(name):
  """Extracts a function name from a function signature.

  See unit tests for example signatures.

  Args:
    name: A (demangled) function signature.

  Returns:
    A tuple of (name_without_return_type, name_without_return_type_and_params).
    Both entries are |name| itself when no parameter list is found.
  """
  params_start = _FindParameterListParen(name)
  if params_start <= 0:
    return name, name

  # Special case: const cast operators (see tests).
  if name.find(' const', params_start - 6, params_start) != -1:
    boundary = params_start - 6
  else:
    boundary = params_start

  # Walk left to the space that separates the return type from the name.
  while True:
    boundary = _FindLastCharOutsideOfBrackets(name, ' ', boundary)
    if boundary == -1:
      break
    # Special case: "operator new", and "operator<< <template>". A space
    # that directly follows one of these belongs to the name, so keep going.
    follows_operator = (
        name.find('operator', boundary - 8, boundary) != -1 or
        name.find('operator<<', boundary - 10, boundary) != -1)
    if not follows_operator:
      break
    boundary -= 8

  name_start = boundary + 1
  return name[name_start:], name[name_start:params_start]
| 152 | |
| 153 | |
| 154 def _NormalizeNames(symbol_group): | |
| 155 """Ensures that all names are formatted in a useful way. | |
| 156 | |
| 157 This include: | |
| 158 - Assigning of |function_signature| (for functions). | |
| 159 - Stripping of return types in |function_signature| and |name|. | |
| 160 - Stripping parameters from |name|. | |
| 161 - Moving "vtable for" and the like to be suffixes rather than prefixes. | |
| 162 """ | |
| 163 found_prefixes = set() | |
| 164 for symbol in symbol_group: | |
| 165 if not symbol.name or symbol.name.startswith('*'): | |
| 166 # See comment in _RemoveDuplicatesAndCalculatePadding() about when this | |
| 167 # can happen. | |
| 168 continue | |
| 169 | |
| 170 # E.g.: vtable for FOO | |
| 171 idx = symbol.name.find(' for ', 0, 30) | |
| 172 if idx != -1: | |
| 173 found_prefixes.add(symbol.name[:idx + 4]) | |
| 174 symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']' | |
| 175 | |
| 176 # E.g.: virtual thunk to FOO | |
| 177 idx = symbol.name.find(' to ', 0, 30) | |
| 178 if idx != -1: | |
| 179 found_prefixes.add(symbol.name[:idx + 3]) | |
| 180 symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']' | |
| 181 | |
| 182 # Strip out return type, and identify where parameter list starts. | |
| 183 if symbol.section == 't': | |
| 184 symbol.function_signature, symbol.name = ( | |
| 185 _ParseFunctionSignature(symbol.name)) | |
| 186 | |
| 187 # Remove anonymous namespaces (they just harm clustering). | |
| 188 symbol.name = symbol.name.replace('(anonymous namespace)::', '') | |
| 189 | |
| 190 logging.debug('Found name prefixes of: %r', found_prefixes) | |
| 191 | |
| 192 | |
| 193 def _NormalizeObjectPaths(symbol_group): | |
| 194 """Ensures that all paths are formatted in a useful way.""" | |
| 195 for symbol in symbol_group: | |
| 196 if symbol.path: | |
| 197 if symbol.path.startswith('obj/'): | |
| 198 # Convert obj/third_party/... -> third_party/... | |
| 199 symbol.path = symbol.path[4:] | |
| 200 elif symbol.path.startswith('../../'): | |
| 201 # Convert ../../third_party/... -> third_party/... | |
| 202 symbol.path = symbol.path[6:] | |
| 203 if symbol.path.endswith(')'): | |
| 204 # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o | |
| 205 start_idx = symbol.path.index('(') | |
| 206 paren_path = symbol.path[start_idx + 1:-1] | |
| 207 symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path | |
| 208 | |
| 209 | |
| 210 def _RemoveDuplicatesAndCalculatePadding(symbol_group): | |
| 211 """Removes symbols at the same address and calculates the |padding| field. | |
| 212 | |
| 213 Symbols must already be sorted by |address|. | |
| 214 """ | |
| 215 i = 0 | |
| 216 to_remove = set() | |
| 217 all_symbols = symbol_group.symbols | |
| 218 for i in xrange(len(all_symbols)): | |
| 219 prev_symbol = all_symbols[i - 1] | |
| 220 symbol = all_symbols[i] | |
| 221 if prev_symbol.section_name is not symbol.section_name: | |
| 222 continue | |
| 223 if symbol.address > 0 and prev_symbol.address > 0: | |
| 224 # Fold symbols that are at the same address (happens in nm output). | |
| 225 if symbol.address == prev_symbol.address: | |
| 226 symbol.size = max(prev_symbol.size, symbol.size) | |
| 227 to_remove.add(i) | |
| 228 continue | |
| 229 # Even with symbols at the same address removed, overlaps can still | |
| 230 # happen. In this case, padding will be negative (and this is fine). | |
| 231 padding = symbol.address - prev_symbol.end_address | |
| 232 if (symbol.section in 'rd' and padding >= 256 or | |
| 233 symbol.section in 't' and padding >= 64): | |
| 234 # For nm data, this is caused by data that has no associated symbol. | |
| 235 # The linker map file lists them with no name, but with a file. | |
| 236 # Example: | |
| 237 # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o | |
| 238 # Where as most look like: | |
| 239 # .data.MANGLED_NAME... | |
| 240 logging.debug('Large padding of %d between:\n A) %r\n B) %r' % ( | |
| 241 padding, prev_symbol, symbol)) | |
| 242 continue | |
| 243 symbol.padding = padding | |
| 244 symbol.size += padding | |
| 245 assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol | |
| 246 # Map files have no overlaps, so worth special-casing the no-op case. | |
| 247 if to_remove: | |
| 248 logging.info('Removing %d overlapping symbols', len(to_remove)) | |
| 249 symbol_group.symbols = ( | |
| 250 [s for i, s in enumerate(all_symbols) if i not in to_remove]) | |
| 251 | |
| 252 | |
def _PrintStats(result, write_func):
  """Prints out how accurate |result| is.

  For every section, reports how many of the section's bytes are covered by
  symbols. When the section contains "*"-prefixed entries or entries without
  attribution, a second report is written that covers only the remaining
  symbols.

  Args:
    result: Object providing |section_sizes| and |symbol_group|.
    write_func: Callable invoked with each (newline-terminated) message.
  """
  stat_template = ('Section %s has %.1f%% of %d bytes accounted for from '
                   '%d symbols. %d bytes are unaccounted for. Padding '
                   'accounts for %d bytes\n')
  excluded_template = ('Without %d merge sections and %d anonymous entries ('
                       'accounting for %d bytes):\n')

  for section in symbols.SECTION_TO_SECTION_NAME:
    if section == 'd':
      # The data section is spread over every section named ".data*".
      expected_size = sum(
          size for section_name, size in result.section_sizes.iteritems()
          if section_name.startswith('.data'))
    else:
      section_name = symbols.SECTION_TO_SECTION_NAME[section]
      expected_size = result.section_sizes[section_name]

    def show_one_stat(group):
      actual_size = group.size
      count = len(group)
      padding = group.padding
      size_percent = 100.0 * actual_size / expected_size
      unaccounted_size = expected_size - actual_size
      write_func(stat_template % (section, size_percent, actual_size, count,
                                  unaccounted_size, padding))

    in_section = result.symbol_group.WhereInSection(section)
    show_one_stat(in_section)

    star_syms = in_section.WhereNameMatches(r'^\*')
    attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()
    anonymous_syms = attributed_syms.Inverted()
    if not (star_syms or anonymous_syms):
      continue
    missing_size = star_syms.size + anonymous_syms.size
    write_func(excluded_template % (
        len(star_syms), len(anonymous_syms), missing_size))
    show_one_stat(attributed_syms)
|
estevenson
2017/03/20 14:13:03
It's a little hard to see just by looking at the o
agrieve
2017/03/20 19:58:09
Good idea! Done. Looks like:
I 3711 Section r h
| |
| 286 | |
| 287 | |
def _SaveResult(result, file_obj):
  """Saves the result to the given file object.

  Layout, one entry per line: the serialization version, the repr of
  |section_sizes|, the number of symbols, and then the symbols themselves.
  A line holding the section name is written before the first symbol and
  whenever the section name changes from one symbol to the next.

  Args:
    result: Object providing |section_sizes| and |symbol_group|.
    file_obj: Writable file-like object.
  """
  write = file_obj.write
  write('%d\n' % _SERIALIZATION_VERSION)
  write('%r\n' % result.section_sizes)
  write('%d\n' % len(result.symbol_group))

  # Store symbol fields as tab-separated.
  # Store only non-derived fields.
  current_section_name = None
  for sym in result.symbol_group:
    section_name = sym.section_name
    if section_name != current_section_name:
      write('%s\n' % section_name)
      current_section_name = section_name
    # Don't write padding nor name since these are derived values.
    fields = (
        sym.address,
        sym.size_without_padding,
        sym.function_signature or sym.name or '',
        sym.path or '',
    )
    write('%x\t%x\t%s\t%s\n' % fields)
| 306 | |
| 307 | |
def _LoadResults(file_obj):
  """Loads a result from the given file.

  Args:
    file_obj: Iterable of lines in the format written by _SaveResult().

  Returns:
    A parsers.ParseResult whose derived fields (padding, function signatures)
    have been recomputed.
  """
  line_iter = iter(file_obj)
  version = int(next(line_iter))
  assert version == _SERIALIZATION_VERSION, (
      'Version mismatch. Need to write some upgrade code.')

  section_sizes = ast.literal_eval(next(line_iter))
  symbol_count = int(next(line_iter))
  loaded_symbols = [None] * symbol_count
  current_section_name = None
  for index in xrange(symbol_count):
    # [:-1] drops the trailing newline.
    record = next(line_iter)[:-1]
    if '\t' not in record:
      # A line without tabs names the section of the symbols that follow.
      current_section_name = intern(record)
      record = next(line_iter)[:-1]
    fields = record.split('\t')
    # Bypass __init__ and fill in the attributes directly.
    sym = symbols.Symbol.__new__(symbols.Symbol)
    sym.section_name = current_section_name
    sym.address = int(fields[0], 16)
    sym.size = int(fields[1], 16)
    sym.name = fields[2] or None
    sym.path = fields[3] or None
    sym.padding = 0  # Derived
    sym.function_signature = None  # Derived
    loaded_symbols[index] = sym

  # Recompute derived values (padding and function names).
  result = parsers.ParseResult(loaded_symbols, section_sizes)
  logging.info('Calculating padding')
  _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
  logging.info('Deriving signatures')
  # Re-parse out function parameters.
  text_symbols = result.symbol_group.WhereInSection('t')
  _NormalizeNames(text_symbols)
  return result
| 343 | |
| 344 | |
def AddOptions(parser):
  """Registers the arguments read by AnalyzeWithArgs() on |parser|.

  Args:
    parser: An argparse.ArgumentParser (or compatible) instance.
  """
  argument_specs = (
      (('input_file',),
       {'help': 'Path to input file. Can be a linker .map file, an '
                'unstripped binary, or a saved result from '
                'analyze.py'}),
      (('--tool-prefix',),
       {'default': '', 'help': 'Path prefix for c++filt.'}),
      (('--output-directory',),
       {'help': 'Path to the root build directory.'}),
  )
  for flags, options in argument_specs:
    parser.add_argument(*flags, **options)
| 354 | |
| 355 | |
| 356 def _DetectToolPrefix(tool_prefix, input_file, output_directory=None): | |
| 357 """Calls Analyze with values from args.""" | |
| 358 if not output_directory: | |
| 359 abs_path = os.path.abspath(input_file) | |
| 360 release_idx = abs_path.find('Release') | |
| 361 if release_idx != -1: | |
| 362 output_directory = os.path.relpath(abs_path[:release_idx], | |
| 363 helpers.SRC_ROOT) + '/Release' | |
| 364 logging.debug('Detected --output-directory=%s', output_directory) | |
| 365 | |
| 366 if not tool_prefix and output_directory: | |
| 367 # Auto-detect from build_vars.txt | |
| 368 build_vars_path = os.path.join(output_directory, 'build_vars.txt') | |
| 369 if os.path.exists(build_vars_path): | |
| 370 with open(build_vars_path) as f: | |
| 371 build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l) | |
| 372 logging.debug('Found --tool-prefix from build_vars.txt') | |
| 373 tool_prefix = build_vars['android_tool_prefix'] | |
| 374 | |
| 375 if os.path.sep not in tool_prefix: | |
| 376 full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt') | |
| 377 else: | |
| 378 full_path = tool_prefix + 'c++filt' | |
| 379 | |
| 380 if not os.path.isfile(full_path): | |
| 381 raise Exception('Bad --tool-prefix. Path not found: %s' % full_path) | |
| 382 return tool_prefix | |
| 383 | |
| 384 | |
def AnalyzeWithArgs(args):
  """Calls Analyze() with the values registered by AddOptions().

  Args:
    args: Parsed arguments providing |input_file|, |output_directory| and
        |tool_prefix|.

  Returns:
    Whatever Analyze() returns.
  """
  return Analyze(args.input_file,
                 output_directory=args.output_directory,
                 tool_prefix=args.tool_prefix)
| 387 | |
| 388 | |
def Analyze(path, output_directory=None, tool_prefix=''):
  """Produces an analysis result for the file at |path|.

  A ".size" file (optionally gzipped) is loaded as a previously saved result.
  A ".map" file (optionally gzipped) is parsed and then post-processed:
  padding is calculated, remaining mangled names are unmangled, and names and
  object paths are normalized.

  Args:
    path: Path to a .map, .map.gz, .size or .size.gz file.
    output_directory: Optional build output directory, used when detecting
        the tool prefix for .map inputs.
    tool_prefix: Optional prefix for c++filt, used for .map inputs.

  Returns:
    The loaded or freshly computed result.

  Raises:
    Exception: If |path| is neither a .map nor a .size file.
  """
  is_saved_result = _EndsWithMaybeGz(path, '.size')
  if not is_saved_result and not _EndsWithMaybeGz(path, '.map'):
    raise Exception('Expected input to be a .map or a .size')

  if is_saved_result:
    logging.info('Loading cached results.')
    with _OpenMaybeGz(path) as f:
      result = _LoadResults(f)
  else:
    # Verify tool_prefix early, before the potentially slow parse.
    tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

    with _OpenMaybeGz(path) as map_file:
      result = parsers.MapFileParser().Parse(map_file)
    symbol_group = result.symbol_group

    logging.info('Calculating padding')
    _RemoveDuplicatesAndCalculatePadding(symbol_group)
    # Map file for some reason doesn't unmangle all names.
    # Unmangle prints its own log statement.
    _UnmangleRemainingSymbols(symbol_group, tool_prefix)
    logging.info('Normalizing names')
    _NormalizeNames(symbol_group)
    logging.info('Normalizing paths')
    _NormalizeObjectPaths(symbol_group)

  # Only compute the statistics when they would actually be logged.
  if logging.getLogger().isEnabledFor(logging.INFO):
    _PrintStats(result, lambda l: logging.info(l.rstrip()))
  logging.info('Finished analyzing %d symbols', len(result.symbol_group))
  return result
| 418 | |
| 419 | |
def main():
  """Command-line entry point: analyzes the input and saves a .size file."""
  output_help = ('Path to store results. Must end in .size or '
                 '.size.gz')
  parser = argparse.ArgumentParser()
  parser.add_argument('--output', required=True, help=output_help)
  AddOptions(parser)
  helpers.AddCommonOptions(parser)
  args = parser.parse_args()

  output_path = args.output
  if not _EndsWithMaybeGz(output_path, '.size'):
    raise Exception('--output must end with .size or .size.gz')
  helpers.HandleCommonOptions(args)

  result = AnalyzeWithArgs(args)
  logging.info('Saving result to %s', output_path)
  with _OpenMaybeGz(output_path, 'wb') as output_file:
    _SaveResult(result, output_file)

  logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())


if __name__ == '__main__':
  main()
| OLD | NEW |