Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(72)

Side by Side Diff: tools/binary_size/analyze.py

Issue 2724253002: V1 of //tools/binary_size rewrite (Closed)
Patch Set: Add repl to query.py Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2017 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Main Python API for analyzing binary size."""
7
8 import argparse
9 import ast
10 import distutils.spawn
11 import gzip
12 import logging
13 import os
14 import re
15 import subprocess
16
17 import parsers
18 import helpers
19 import symbols
20
21
# Version number written as the first line of .size files; compared against
# the value read back when a .size file is loaded.
_SERIALIZATION_VERSION = 1

# Literal text that appears in names in place of an unnamed namespace. Its
# length is precomputed because it is needed on every name that is parsed.
_ANONYMOUS_NAMESPACE = '(anonymous namespace)'
_LEN_ANONYMOUS_NAMESPACE = len(_ANONYMOUS_NAMESPACE)
# Matches names whose leading run of non-space characters reaches "operator",
# either at the very start or right after a ":" (e.g. "Foo::operator==").
_STARTS_WITH_OPERATOR_PATTERN = re.compile(r'\S*(?::|^)operator')
28
29
def _OpenMaybeGz(path, mode=None):
  """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`.

  Args:
    path: Path of the file to open.
    mode: Mode string. When omitted, ".gz" files are opened with 'rb' and all
        other files with 'r'.

  Returns:
    The opened file object.
  """
  if not path.endswith('.gz'):
    return open(path, mode or 'r')
  if mode and 'w' in mode:
    # Third positional argument of GzipFile is the compression level.
    return gzip.GzipFile(path, mode, 1)
  # Never hand None to gzip.open(): Python 3 evaluates `"t" in mode` and
  # raises TypeError. 'rb' is gzip's own default read mode.
  return gzip.open(path, mode or 'rb')
37
38
def _EndsWithMaybeGz(path, suffix):
  """Returns whether |path| ends with |suffix| or with |suffix| + ".gz"."""
  # str.endswith() accepts a tuple of candidates, so one call covers both.
  return path.endswith((suffix, suffix + '.gz'))
41
42
def _IterLines(s):
  """Yields each line of |s| with its newline character removed.

  Lines are located with str.find() and yielded one at a time, so no list of
  all lines is built. A final segment that is not newline-terminated is
  yielded as well (unless it is empty), so the last line is never dropped.

  Args:
    s: String to split on newline characters.
  """
  start = 0
  while True:
    end = s.find('\n', start)
    if end == -1:
      break
    yield s[start:end]
    start = end + 1
  # Text after the last newline: previously lost, now yielded when non-empty.
  if start < len(s):
    yield s[start:]
51
52
def _UnmangleRemainingSymbols(symbol_group, tool_prefix):
  """Uses c++filt to unmangle any symbols that need it.

  Args:
    symbol_group: Iterable of symbols. Each symbol whose |name| starts with
        "_Z" has |name| replaced by the corresponding line of c++filt output.
    tool_prefix: String prepended to "c++filt" to form the command to run.

  Raises:
    Exception: If c++filt exits with a non-zero status.
  """
  to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')]
  if not to_process:
    return

  logging.info('Unmangling %d names', len(to_process))
  proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,
                          stdout=subprocess.PIPE)
  # Terminate every name, including the last one, with a newline so that the
  # output is a sequence of complete lines (assumes c++filt mirrors the
  # newlines of its input). Without it the final symbol could stay mangled.
  stdin_data = ''.join(s.name + '\n' for s in to_process)
  stdout = proc.communicate(stdin_data)[0]
  # Explicit check rather than assert, which is stripped under `python -O`.
  if proc.returncode != 0:
    raise Exception('c++filt failed with exit code %d' % proc.returncode)

  for i, line in enumerate(_IterLines(stdout)):
    to_process[i].name = line
67
68
def _FindParameterListParen(name):
  """Finds index of the "(" that denotes the start of a parameter list.

  Args:
    name: The name (e.g. a function signature) to search.

  Returns:
    Index of the first "(" that is outside of any <> or () nesting, does not
    begin "(anonymous namespace)" and is not preceded by a space. -1 if no
    such "(" exists.
  """
  # It is much faster to use .find() and .count() than to loop over each
  # character.
  start_idx = 0
  template_balance_count = 0
  paren_balance_count = 0
  while True:
    idx = name.find('(', start_idx)
    if idx == -1:
      return -1
    # Update the nesting depth with the brackets seen since the last stop.
    template_balance_count += (
        name.count('<', start_idx, idx) - name.count('>', start_idx, idx))
    paren_balance_count += (
        name.count('(', start_idx, idx) - name.count(')', start_idx, idx))
    if template_balance_count == 0 and paren_balance_count == 0:
      # Special case: skip "(anonymous namespace)".
      if name.startswith(_ANONYMOUS_NAMESPACE, idx):
        start_idx = idx + _LEN_ANONYMOUS_NAMESPACE
        continue
      # Special case: skip "decltype (...)". The idx == 0 check keeps
      # name[idx - 1] from wrapping around to the last character.
      if idx == 0 or name[idx - 1] != ' ':
        return idx
    # Step past this "(" and record that it is now open.
    start_idx = idx + 1
    paren_balance_count += 1
96
97
def _FindLastSpaceOutsideOfBrackets(name, prev_idx=None):
  """Finds the right-most space that is not nested inside <> or ().

  Args:
    name: String to search.
    prev_idx: Only name[:prev_idx] is searched; None searches all of |name|.

  Returns:
    Index of the space, or -1 if there is no such space.
  """
  angle_depth = 0
  paren_depth = 0
  end = prev_idx
  space_idx = name.rfind(' ', 0, end)
  while space_idx != -1:
    # Account for the brackets between this space and the previous stop.
    angle_depth += name.count('<', space_idx, end)
    angle_depth -= name.count('>', space_idx, end)
    paren_depth += name.count('(', space_idx, end)
    paren_depth -= name.count(')', space_idx, end)
    if not (angle_depth or paren_depth):
      return space_idx
    end = space_idx
    space_idx = name.rfind(' ', 0, end)
  return -1
112
113
def _ParseFunctionSignature(name):
  """Extracts a function name from a function signature.

  See unit tests for example signatures.

  Args:
    name: The signature to parse.

  Returns:
    A tuple of (name_without_return_type, name_without_return_type_and_params).
    Both entries are |name| unchanged when no parameter list is found, or
    when |name| starts with the parameter list.
  """
  paren_start = _FindParameterListParen(name)
  if paren_start < 0:
    return name, name
  if paren_start == 0:
    logging.warning('Found an odd name %s', name)
    return name, name

  # Special case: Some operators have odd syntax (see tests).
  if _STARTS_WITH_OPERATOR_PATTERN.match(name):
    name_start = 0
  else:
    # Everything up to the last top-level space is dropped as return type.
    name_start = _FindLastSpaceOutsideOfBrackets(name, paren_start) + 1
  return name[name_start:], name[name_start:paren_start]
134
135
def _NormalizeNames(symbol_group):
  """Ensures that all names are formatted in a useful way.

  This includes:
    - Assigning of |function_signature| (for functions).
    - Stripping of return types in |function_signature| and |name|.
    - Stripping parameters from |name|.
    - Moving "vtable for" and the like to be suffixes rather than prefixes.

  Args:
    symbol_group: Iterable of symbols; they are modified in place.
  """
  found_prefixes = set()
  for symbol in symbol_group:
    if not symbol.name or symbol.name.startswith('*'):
      # See comment in _RemoveDuplicatesAndCalculatePadding() about when this
      # can happen.
      continue

    # E.g.: "vtable for FOO" and then "virtual thunk to FOO". Only a marker
    # found within the first 30 characters is treated as a prefix.
    for marker in (' for ', ' to '):
      idx = symbol.name.find(marker, 0, 30)
      if idx == -1:
        continue
      marker_end = idx + len(marker)
      # Recorded without the marker's trailing space, e.g. "vtable for".
      found_prefixes.add(symbol.name[:marker_end - 1])
      symbol.name = '%s [%s]' % (symbol.name[marker_end:], symbol.name[:idx])

    # TODO(agrieve): Store mangled names instead (smaller).
    if symbol.section == 't':
      symbol.function_signature, symbol.name = (
          _ParseFunctionSignature(symbol.name))
  logging.debug('Found name prefixes of: %r', found_prefixes)
169
170
def _NormalizeObjectPaths(symbol_group):
  """Ensures that all paths are formatted in a useful way.

  Args:
    symbol_group: Iterable of symbols; |path| is rewritten in place. Symbols
        with an empty or None |path| are left untouched.
  """
  for symbol in symbol_group:
    path = symbol.path
    if not path:
      continue
    if path.startswith('obj/'):
      # Convert obj/third_party/... -> third_party/...
      path = path[4:]
    elif path.startswith('../../'):
      # Convert ../../third_party/... -> third_party/...
      path = path[6:]
    if path.endswith(')'):
      # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o
      # A path ending in ")" with no "(" is left as-is instead of raising.
      start_idx = path.find('(')
      if start_idx != -1:
        path = path[:start_idx] + os.path.sep + path[start_idx + 1:-1]
    symbol.path = path
186
187
def _RemoveDuplicatesAndCalculatePadding(symbol_group):
  """Removes symbols at the same address and calculates the |padding| field.

  Symbols must already be sorted by |address|.

  Args:
    symbol_group: Object with a |symbols| list. Symbols are updated in place
        (|padding|, |size|) and |symbols| is replaced by a filtered list when
        any symbols are folded away.
  """
  to_remove = set()
  all_symbols = symbol_group.symbols
  for i, symbol in enumerate(all_symbols):
    if i == 0:
      # The first symbol has no predecessor. Indexing [i - 1] here would wrap
      # around and compare it against the last symbol (or against itself).
      continue
    prev_symbol = all_symbols[i - 1]
    # Compare by value: equal section names need not be the same object.
    if prev_symbol.section_name != symbol.section_name:
      continue
    if symbol.address > 0 and prev_symbol.address > 0:
      # Fold symbols that are at the same address (happens in nm output).
      if symbol.address == prev_symbol.address:
        symbol.size = max(prev_symbol.size, symbol.size)
        to_remove.add(i)
        continue
      # Even with symbols at the same address removed, overlaps can still
      # happen. In this case, padding will be negative (and this is fine).
      padding = symbol.address - prev_symbol.end_address
      # NOTE(review): the 256 / 64 byte cut-offs look empirically chosen;
      # confirm and document where they come from.
      if (symbol.section in 'rd' and padding >= 256 or
          symbol.section in 't' and padding >= 64):
        # For nm data, this is caused by data that has no associated symbol.
        # The linker map file lists them with no name, but with a file.
        # Example:
        #   .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o
        # Where as most look like:
        #   .data.MANGLED_NAME ...
        logging.debug('Large padding of %d between:\n  A) %r\n  B) %r',
                      padding, prev_symbol, symbol)
        continue
      symbol.padding = padding
      symbol.size += padding
      assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol
  # Map files have no overlaps, so worth special-casing the no-op case.
  if to_remove:
    logging.info('Removing %d overlapping symbols', len(to_remove))
    symbol_group.symbols = (
        [s for i, s in enumerate(all_symbols) if i not in to_remove])
229
230
def _PrintStats(result, write_func):
  """Prints out how accurate |result| is.

  For every section listed in symbols.SECTION_TO_SECTION_NAME, reports how
  many bytes of the section's size are covered by symbols. When the section
  contains "*"-prefixed or unattributed symbols, a second report is made for
  the attributed symbols only.

  Args:
    result: Object providing |section_sizes| and |symbol_group|.
    write_func: Called with each newline-terminated message.
  """
  stat_template = ('Section %s has %.1f%% of %d bytes accounted for from '
                   '%d symbols. %d bytes are unaccounted for. Padding '
                   'accounts for %d bytes\n')
  for section in symbols.SECTION_TO_SECTION_NAME:
    if section != 'd':
      expected_size = result.section_sizes[
          symbols.SECTION_TO_SECTION_NAME[section]]
    else:
      # 'd' sums every section whose name begins with ".data".
      expected_size = sum(
          size for name, size in result.section_sizes.iteritems()
          if name.startswith('.data'))

    def report(group):
      covered = group.size
      num_symbols = len(group)
      padding = group.padding
      percent = 100.0 * covered / expected_size
      write_func(stat_template % (section, percent, covered, num_symbols,
                                  expected_size - covered, padding))

    in_section = result.symbol_group.WhereInSection(section)
    report(in_section)

    star_syms = in_section.WhereNameMatches(r'^\*')
    attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()
    anonymous_syms = attributed_syms.Inverted()
    if not (star_syms or anonymous_syms):
      continue
    missing_size = star_syms.size + anonymous_syms.size
    write_func(('Without %d merge sections and %d anonymous entries ('
                'accounting for %d bytes):\n') % (
                    len(star_syms), len(anonymous_syms), missing_size))
    report(attributed_syms)
264
265
def _SaveResult(result, file_obj):
  """Saves the result to the given file object.

  Layout, one item per line:
    1. The serialization version.
    2. repr() of |result.section_sizes|.
    3. The number of symbols.
    4. For each symbol: a section-name line whenever the section changes,
       then "address<TAB>size<TAB>name<TAB>path" with address and size in hex.

  Args:
    result: Object providing |section_sizes| and |symbol_group|.
    file_obj: Object with a write() method.
  """
  write = file_obj.write
  write('%d\n' % _SERIALIZATION_VERSION)
  write('%r\n' % result.section_sizes)
  write('%d\n' % len(result.symbol_group))

  # Only non-derived fields are stored. Padding is recomputed on load, and
  # the signature is stored in place of the name whenever there is one.
  current_section = None
  for symbol in result.symbol_group:
    if symbol.section_name != current_section:
      write('%s\n' % symbol.section_name)
      current_section = symbol.section_name
    stored_name = symbol.function_signature or symbol.name or ''
    stored_path = symbol.path or ''
    write('%x\t%x\t%s\t%s\n' % (
        symbol.address, symbol.size_without_padding, stored_name,
        stored_path))
284
285
def _LoadResults(file_obj):
  """Loads a result from the given file.

  Args:
    file_obj: Iterable of lines in the layout written by _SaveResult().

  Returns:
    A parsers.ParseResult with padding and function signatures recomputed.
  """
  line_iter = iter(file_obj)
  actual_version = int(next(line_iter))
  assert actual_version == _SERIALIZATION_VERSION, (
      'Version mismatch. Need to write some upgrade code.')

  section_sizes = ast.literal_eval(next(line_iter))
  num_syms = int(next(line_iter))
  symbol_list = []
  section_name = None
  for _ in xrange(num_syms):
    # [:-1] drops the trailing newline of each line.
    line = next(line_iter)[:-1]
    if '\t' not in line:
      # A line without tabs names the section of the symbols that follow.
      section_name = intern(line)
      line = next(line_iter)[:-1]
    fields = line.split('\t')
    # Create the instance without running Symbol.__init__.
    new_sym = symbols.Symbol.__new__(symbols.Symbol)
    new_sym.section_name = section_name
    new_sym.address = int(fields[0], 16)
    new_sym.size = int(fields[1], 16)
    new_sym.name = fields[2] or None
    new_sym.path = fields[3] or None
    new_sym.padding = 0  # Derived
    new_sym.function_signature = None  # Derived
    symbol_list.append(new_sym)

  # Recompute derived values (padding and function names).
  result = parsers.ParseResult(symbol_list, section_sizes)
  logging.info('Calculating padding')
  _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
  logging.info('Deriving signatures')
  # Re-parse out function parameters.
  _NormalizeNames(result.symbol_group.WhereInSection('t'))
  return result
321
322
def AddOptions(parser):
  """Adds the arguments that AnalyzeWithArgs() reads to |parser|.

  Args:
    parser: Parser object providing add_argument().
  """
  input_file_help = ('Path to input file. Can be a linker .map file, an '
                     'unstripped binary, or a saved result from '
                     'analyze.py')
  parser.add_argument('input_file', help=input_file_help)
  parser.add_argument(
      '--tool-prefix', default='', help='Path prefix for c++filt.')
  parser.add_argument(
      '--output-directory', help='Path to the root build directory.')
332
333
def _DetectToolPrefix(tool_prefix, input_file, output_directory=None):
  """Detects (when not given) and validates the prefix used to run c++filt.

  Args:
    tool_prefix: Prefix to validate. When empty, it is read from the
        "android_tool_prefix" entry of <output_directory>/build_vars.txt, if
        that file exists.
    input_file: Path of the input file. When |output_directory| is not given
        and this path contains "Release", the output directory is derived
        from it (relative to helpers.SRC_ROOT).
    output_directory: Optional path to the build output directory.

  Returns:
    The tool prefix that was given or detected.

  Raises:
    Exception: If <tool_prefix>c++filt cannot be found.
  """
  if not output_directory:
    abs_path = os.path.abspath(input_file)
    release_idx = abs_path.find('Release')
    if release_idx != -1:
      output_directory = os.path.relpath(abs_path[:release_idx],
                                         helpers.SRC_ROOT) + '/Release'
      logging.debug('Detected --output-directory=%s', output_directory)

  if not tool_prefix and output_directory:
    # Auto-detect from build_vars.txt
    build_vars_path = os.path.join(output_directory, 'build_vars.txt')
    if os.path.exists(build_vars_path):
      with open(build_vars_path) as f:
        build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l)
      logging.debug('Found --tool-prefix from build_vars.txt')
      tool_prefix = build_vars['android_tool_prefix']

  tool_name = tool_prefix + 'c++filt'
  if os.path.sep not in tool_prefix:
    # A prefix without a directory part is looked up on PATH.
    full_path = distutils.spawn.find_executable(tool_name)
  else:
    full_path = tool_name

  # find_executable() returns None when nothing is found; check for that
  # before os.path.isfile(), which raises TypeError when given None.
  if not full_path or not os.path.isfile(full_path):
    raise Exception('Bad --tool-prefix. Path not found: %s' %
                    (full_path or tool_name))
  return tool_prefix
361
362
def AnalyzeWithArgs(args):
  """Calls Analyze() with the values stored on |args|.

  Args:
    args: Object with |input_file|, |output_directory| and |tool_prefix|
        attributes (the arguments registered by AddOptions()).

  Returns:
    Whatever Analyze() returns.
  """
  return Analyze(
      args.input_file,
      output_directory=args.output_directory,
      tool_prefix=args.tool_prefix)
365
366
def Analyze(path, output_directory=None, tool_prefix=''):
  """Produces an analysis result from a .size or .map file.

  A ".size" (or ".size.gz") file is loaded as a previously saved result. A
  ".map" (or ".map.gz") file is parsed, then post-processed: padding is
  calculated, remaining mangled names are unmangled, and names and object
  paths are normalized.

  Args:
    path: Path to the input file.
    output_directory: Optional build output directory, used when detecting
        the tool prefix.
    tool_prefix: Optional prefix for c++filt; detected when empty.

  Returns:
    The loaded or parsed result.

  Raises:
    Exception: If |path| is neither a .size nor a .map file.
  """
  if _EndsWithMaybeGz(path, '.size'):
    logging.info('Loading cached results.')
    with _OpenMaybeGz(path) as f:
      result = _LoadResults(f)
  else:
    if not _EndsWithMaybeGz(path, '.map'):
      raise Exception('Expected input to be a .map or a .size')

    # Verify tool_prefix early.
    tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

    with _OpenMaybeGz(path) as map_file:
      result = parsers.MapFileParser().Parse(map_file)
    parsed_symbols = result.symbol_group

    logging.info('Calculating padding')
    _RemoveDuplicatesAndCalculatePadding(parsed_symbols)
    # Map file for some reason doesn't unmangle all names.
    # Unmangle prints its own log statement.
    _UnmangleRemainingSymbols(parsed_symbols, tool_prefix)
    logging.info('Normalizing names')
    _NormalizeNames(parsed_symbols)
    logging.info('Normalizing paths')
    _NormalizeObjectPaths(parsed_symbols)

  # Stats are only computed when INFO logging would actually show them.
  if logging.getLogger().isEnabledFor(logging.INFO):
    def log_stat(line):
      logging.info(line.rstrip())
    _PrintStats(result, log_stat)
  logging.info('Finished analyzing %d symbols', len(result.symbol_group))
  return result
396
397
def main():
  """Command-line entry point: analyzes the input and saves a .size file."""
  output_help = 'Path to store results. Must end in .size or .size.gz'
  parser = argparse.ArgumentParser()
  parser.add_argument('--output', required=True, help=output_help)
  AddOptions(parser)
  helpers.AddCommonOptions(parser)
  args = parser.parse_args()
  output_path = args.output
  if not _EndsWithMaybeGz(output_path, '.size'):
    raise Exception('--output must end with .size or .size.gz')
  helpers.HandleCommonOptions(args)

  result = AnalyzeWithArgs(args)
  logging.info('Saving result to %s', output_path)
  with _OpenMaybeGz(output_path, 'wb') as f:
    _SaveResult(result, f)

  logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())


# Run main() only when executed as a script, not when imported.
if __name__ == '__main__':
  main()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698