Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(752)

Side by Side Diff: tools/binary_size/analyze.py

Issue 2724253002: V1 of //tools/binary_size rewrite (Closed)
Patch Set: README tweaks, more cases for function parsing Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2017 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Main Python API for analyzing binary size."""
7
8 import argparse
9 import ast
10 import distutils.spawn
11 import gzip
12 import logging
13 import os
14 import re
15 import subprocess
16
17 import parsers
18 import helpers
19 import symbols
20
21
# File format version for .size files. _SaveResult() writes it as the first
# line of the file and _LoadResults() asserts that the stored value matches.
_SERIALIZATION_VERSION = 1
24
25
26 def _OpenMaybeGz(path, mode=None):
27 """Calls `gzip.open()` if |path| ends in ".gz", otherwise calls `open()`."""
28 if path.endswith('.gz'):
29 if mode and 'w' in mode:
30 return gzip.GzipFile(path, mode, 1)
31 return gzip.open(path, mode)
32 return open(path, mode or 'r')
33
34
35 def _EndsWithMaybeGz(path, suffix):
36 return path.endswith(suffix) or path.endswith(suffix + '.gz')
37
38
39 def _IterLines(s):
40 prev_idx = -1
41 while True:
42 idx = s.find('\n', prev_idx + 1)
43 if idx == -1:
44 return
45 yield s[prev_idx + 1:idx]
46 prev_idx = idx
47
48
49 def _UnmangleRemainingSymbols(symbol_group, tool_prefix):
50 """Uses c++filt to unmangle any symbols that need it."""
51 to_process = [s for s in symbol_group if s.name and s.name.startswith('_Z')]
52 if not to_process:
53 return
54
55 logging.info('Unmangling %d names', len(to_process))
56 proc = subprocess.Popen([tool_prefix + 'c++filt'], stdin=subprocess.PIPE,
57 stdout=subprocess.PIPE)
58 stdout = proc.communicate('\n'.join(s.name for s in to_process))[0]
59 assert proc.returncode == 0
60
61 for i, line in enumerate(_IterLines(stdout)):
62 to_process[i].name = line
63
64
65 def _FindParameterListParen(name):
66 """Finds index of the "(" that denotes the start of a paremeter list."""
67 # This loops from left-to-right, but the only reason (I think) that this
68 # is necessary (rather than reusing _FindLastCharOutsideOfBrackets), is
69 # to capture the outer-most function in the case where classes are nested.
70 start_idx = 0
71 while True:
72 template_balance_count = 0
73 paren_balance_count = 0
74 while True:
75 idx = name.find('(', start_idx)
76 if idx == -1:
77 return -1
78 template_balance_count += (
79 name.count('<', start_idx, idx) - name.count('>', start_idx, idx))
80 # Special: operators with angle brackets.
81 operator_idx = name.find('operator<', start_idx, idx)
82 if operator_idx != -1:
83 if name[operator_idx + 9] == '<':
84 template_balance_count -= 2
85 else:
86 template_balance_count -= 1
87 else:
88 operator_idx = name.find('operator>', start_idx, idx)
89 if operator_idx != -1:
90 if name[operator_idx + 9] == '>':
91 template_balance_count += 2
92 else:
93 template_balance_count += 1
94
95 paren_balance_count += (
96 name.count('(', start_idx, idx) - name.count(')', start_idx, idx))
97 if template_balance_count == 0 and paren_balance_count == 0:
98 # Special case: skip "(anonymous namespace)".
99 if -1 != name.find('(anonymous namespace)', idx, idx + 21):
100 start_idx = idx + 21
101 continue
102 # Special case: skip "decltype (...)"
103 if name[idx - 1] != ' ':
104 return idx
105 start_idx = idx + 1
106 paren_balance_count += 1
107
108
109 def _FindLastCharOutsideOfBrackets(name, target_char, prev_idx=None):
110 paren_balance_count = 0
111 template_balance_count = 0
112 while True:
113 idx = name.rfind(target_char, 0, prev_idx)
114 if idx == -1:
115 return -1
116 # It is much faster to use.find() and.count() than to loop over each
117 # character.
118 template_balance_count += (
119 name.count('<', idx, prev_idx) - name.count('>', idx, prev_idx))
120 paren_balance_count += (
121 name.count('(', idx, prev_idx) - name.count(')', idx, prev_idx))
122 if template_balance_count == 0 and paren_balance_count == 0:
123 return idx
124 prev_idx = idx
125
126
def _ParseFunctionSignature(name):
  """Extracts a function name from a function signature.

  See unit tests for example signatures.

  Args:
    name: The full signature, possibly including a return type.

  Returns:
    A tuple of (name_without_return_type, name_without_return_type_and_params).
    Both entries are |name| itself when no parameter list is found past
    index 0.
  """
  params_start = _FindParameterListParen(name)
  if params_start <= 0:
    return name, name

  # Special case: const cast operators (see tests).
  has_const = name.find(' const', params_start - 6, params_start) != -1
  search_end = params_start - 6 if has_const else params_start
  while True:
    search_end = _FindLastCharOutsideOfBrackets(name, ' ', search_end)
    if search_end == -1:
      break
    # Special case: "operator new", and "operator<< <template>". The space
    # found belongs to the operator name, so keep searching to its left.
    follows_operator = (
        name.find('operator', search_end - 8, search_end) != -1 or
        name.find('operator<<', search_end - 10, search_end) != -1)
    if not follows_operator:
      break
    search_end -= 8

  name_start = search_end + 1
  return (name[name_start:], name[name_start:params_start])
152
153
154 def _NormalizeNames(symbol_group):
155 """Ensures that all names are formatted in a useful way.
156
157 This include:
158 - Assigning of |function_signature| (for functions).
159 - Stripping of return types in |function_signature| and |name|.
160 - Stripping parameters from |name|.
161 - Moving "vtable for" and the like to be suffixes rather than prefixes.
162 """
163 found_prefixes = set()
164 for symbol in symbol_group:
165 if not symbol.name or symbol.name.startswith('*'):
166 # See comment in _RemoveDuplicatesAndCalculatePadding() about when this
167 # can happen.
168 continue
169
170 # E.g.: vtable for FOO
171 idx = symbol.name.find(' for ', 0, 30)
172 if idx != -1:
173 found_prefixes.add(symbol.name[:idx + 4])
174 symbol.name = symbol.name[idx + 5:] + ' [' + symbol.name[:idx] + ']'
175
176 # E.g.: virtual thunk to FOO
177 idx = symbol.name.find(' to ', 0, 30)
178 if idx != -1:
179 found_prefixes.add(symbol.name[:idx + 3])
180 symbol.name = symbol.name[idx + 4:] + ' [' + symbol.name[:idx] + ']'
181
182 # Strip out return type, and identify where parameter list starts.
183 if symbol.section == 't':
184 symbol.function_signature, symbol.name = (
185 _ParseFunctionSignature(symbol.name))
186
187 # Remove anonymous namespaces (they just harm clustering).
188 symbol.name = symbol.name.replace('(anonymous namespace)::', '')
189
190 logging.debug('Found name prefixes of: %r', found_prefixes)
191
192
193 def _NormalizeObjectPaths(symbol_group):
194 """Ensures that all paths are formatted in a useful way."""
195 for symbol in symbol_group:
196 if symbol.path:
197 if symbol.path.startswith('obj/'):
198 # Convert obj/third_party/... -> third_party/...
199 symbol.path = symbol.path[4:]
200 elif symbol.path.startswith('../../'):
201 # Convert ../../third_party/... -> third_party/...
202 symbol.path = symbol.path[6:]
203 if symbol.path.endswith(')'):
204 # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o
205 start_idx = symbol.path.index('(')
206 paren_path = symbol.path[start_idx + 1:-1]
207 symbol.path = symbol.path[:start_idx] + os.path.sep + paren_path
208
209
210 def _RemoveDuplicatesAndCalculatePadding(symbol_group):
211 """Removes symbols at the same address and calculates the |padding| field.
212
213 Symbols must already be sorted by |address|.
214 """
215 i = 0
216 to_remove = set()
217 all_symbols = symbol_group.symbols
218 for i in xrange(len(all_symbols)):
219 prev_symbol = all_symbols[i - 1]
220 symbol = all_symbols[i]
221 if prev_symbol.section_name is not symbol.section_name:
222 continue
223 if symbol.address > 0 and prev_symbol.address > 0:
224 # Fold symbols that are at the same address (happens in nm output).
225 if symbol.address == prev_symbol.address:
226 symbol.size = max(prev_symbol.size, symbol.size)
227 to_remove.add(i)
228 continue
229 # Even with symbols at the same address removed, overlaps can still
230 # happen. In this case, padding will be negative (and this is fine).
231 padding = symbol.address - prev_symbol.end_address
232 if (symbol.section in 'rd' and padding >= 256 or
233 symbol.section in 't' and padding >= 64):
234 # For nm data, this is caused by data that has no associated symbol.
235 # The linker map file lists them with no name, but with a file.
236 # Example:
237 # .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o
238 # Where as most look like:
239 # .data.MANGLED_NAME...
240 logging.debug('Large padding of %d between:\n A) %r\n B) %r' % (
241 padding, prev_symbol, symbol))
242 continue
243 symbol.padding = padding
244 symbol.size += padding
245 assert symbol.size >= 0, 'Symbol has negative size: %r' % symbol
246 # Map files have no overlaps, so worth special-casing the no-op case.
247 if to_remove:
248 logging.info('Removing %d overlapping symbols', len(to_remove))
249 symbol_group.symbols = (
250 [s for i, s in enumerate(all_symbols) if i not in to_remove])
251
252
def _PrintStats(result, write_func):
  """Reports, per section, how much of the section the symbols account for.

  Args:
    result: Object providing |section_sizes| (section name -> size) and
        |symbol_group|.
    write_func: Callable invoked with each formatted, newline-terminated
        message.
  """
  stat_template = ('Section %s has %.1f%% of %d bytes accounted for from '
                   '%d symbols. %d bytes are unaccounted for. Padding '
                   'accounts for %d bytes\n')
  for section in symbols.SECTION_TO_SECTION_NAME:
    if section != 'd':
      section_name = symbols.SECTION_TO_SECTION_NAME[section]
      expected_size = result.section_sizes[section_name]
    else:
      # Section 'd' is measured against every section named ".data*".
      expected_size = sum(
          size for name, size in result.section_sizes.iteritems()
          if name.startswith('.data'))

    def report(group):
      accounted = group.size
      num_symbols = len(group)
      padding = group.padding
      percent = 100.0 * accounted / expected_size
      write_func(stat_template % (
          section, percent, accounted, num_symbols,
          expected_size - accounted, padding))

    in_section = result.symbol_group.WhereInSection(section)
    report(in_section)

    # Symbols whose name starts with "*" and symbols without attribution are
    # reported separately so that the second stat covers only the rest.
    star_syms = in_section.WhereNameMatches(r'^\*')
    attributed_syms = star_syms.Inverted().WhereHasAnyAttribution()
    anonymous_syms = attributed_syms.Inverted()
    if not (star_syms or anonymous_syms):
      continue
    missing_size = star_syms.size + anonymous_syms.size
    header = ('Without %d merge sections and %d anonymous entries ('
              'accounting for %d bytes):\n')
    write_func(header % (len(star_syms), len(anonymous_syms), missing_size))
    report(attributed_syms)
estevenson 2017/03/20 14:13:03 It's a little hard to see just by looking at the o
agrieve 2017/03/20 19:58:09 Good idea! Done. Looks like: I 3711 Section r h
286
287
def _SaveResult(result, file_obj):
  """Saves the result to the given file object.

  Layout: the serialization version, the repr() of |section_sizes|, and the
  symbol count, each on its own line, followed by one tab-separated line per
  symbol (hex address, hex size without padding, signature or name, path). A
  line holding only the section name is written whenever the section name
  differs from that of the previous symbol.

  Args:
    result: Object providing |section_sizes| and |symbol_group|.
    file_obj: Writable file object.
  """
  emit = file_obj.write
  emit('%d\n' % _SERIALIZATION_VERSION)
  emit('%r\n' % result.section_sizes)
  emit('%d\n' % len(result.symbol_group))

  current_section = None
  for sym in result.symbol_group:
    section_name = sym.section_name
    if section_name != current_section:
      current_section = section_name
      emit('%s\n' % section_name)
    # Only non-derived fields are stored: padding is left out, and the full
    # signature is preferred over the shortened name when there is one.
    label = sym.function_signature or sym.name or ''
    emit('%x\t%x\t%s\t%s\n' % (
        sym.address, sym.size_without_padding, label, sym.path or ''))
306
307
def _LoadResults(file_obj):
  """Loads a result from the given file.

  Reads the layout written by _SaveResult() and then recomputes the derived
  fields (padding, and names/signatures of symbols in section 't').

  Args:
    file_obj: Iterable of lines, each ending in a newline.

  Returns:
    A parsers.ParseResult built from the loaded symbols and section sizes.
  """
  line_iter = iter(file_obj)
  actual_version = int(next(line_iter))
  assert actual_version == _SERIALIZATION_VERSION, (
      'Version mismatch. Need to write some upgrade code.')

  section_sizes = ast.literal_eval(next(line_iter))
  num_syms = int(next(line_iter))

  symbol_list = []
  section_name = None
  for _ in xrange(num_syms):
    line = next(line_iter)[:-1]
    if '\t' not in line:
      # A line without tabs names the section of the symbols that follow.
      section_name = intern(line)
      line = next(line_iter)[:-1]
    fields = line.split('\t')
    # Skip Symbol.__init__ and fill in the attributes directly.
    sym = symbols.Symbol.__new__(symbols.Symbol)
    sym.section_name = section_name
    sym.address = int(fields[0], 16)
    sym.size = int(fields[1], 16)
    sym.name = fields[2] or None
    sym.path = fields[3] or None
    sym.padding = 0  # Derived
    sym.function_signature = None  # Derived
    symbol_list.append(sym)

  # Recompute derived values (padding and function names).
  result = parsers.ParseResult(symbol_list, section_sizes)
  logging.info('Calculating padding')
  _RemoveDuplicatesAndCalculatePadding(result.symbol_group)
  logging.info('Deriving signatures')
  # Re-parse out function parameters.
  _NormalizeNames(result.symbol_group.WhereInSection('t'))
  return result
343
344
def AddOptions(parser):
  """Registers the arguments read by AnalyzeWithArgs() on |parser|.

  Args:
    parser: An argparse.ArgumentParser (or compatible) to add arguments to.
  """
  option_specs = (
      (('input_file',),
       {'help': 'Path to input file. Can be a linker .map file, an '
                'unstripped binary, or a saved result from '
                'analyze.py'}),
      (('--tool-prefix',),
       {'default': '', 'help': 'Path prefix for c++filt.'}),
      (('--output-directory',),
       {'help': 'Path to the root build directory.'}),
  )
  for flags, kwargs in option_specs:
    parser.add_argument(*flags, **kwargs)
354
355
356 def _DetectToolPrefix(tool_prefix, input_file, output_directory=None):
357 """Calls Analyze with values from args."""
358 if not output_directory:
359 abs_path = os.path.abspath(input_file)
360 release_idx = abs_path.find('Release')
361 if release_idx != -1:
362 output_directory = os.path.relpath(abs_path[:release_idx],
363 helpers.SRC_ROOT) + '/Release'
364 logging.debug('Detected --output-directory=%s', output_directory)
365
366 if not tool_prefix and output_directory:
367 # Auto-detect from build_vars.txt
368 build_vars_path = os.path.join(output_directory, 'build_vars.txt')
369 if os.path.exists(build_vars_path):
370 with open(build_vars_path) as f:
371 build_vars = dict(l.rstrip().split('=', 1) for l in f if '=' in l)
372 logging.debug('Found --tool-prefix from build_vars.txt')
373 tool_prefix = build_vars['android_tool_prefix']
374
375 if os.path.sep not in tool_prefix:
376 full_path = distutils.spawn.find_executable(tool_prefix + 'c++filt')
377 else:
378 full_path = tool_prefix + 'c++filt'
379
380 if not os.path.isfile(full_path):
381 raise Exception('Bad --tool-prefix. Path not found: %s' % full_path)
382 return tool_prefix
383
384
def AnalyzeWithArgs(args):
  """Calls Analyze() with the values of the options added by AddOptions().

  Args:
    args: Object with |input_file|, |output_directory| and |tool_prefix|.

  Returns:
    Whatever Analyze() returns.
  """
  return Analyze(args.input_file,
                 output_directory=args.output_directory,
                 tool_prefix=args.tool_prefix)
387
388
def Analyze(path, output_directory=None, tool_prefix=''):
  """Produces a result from a linker .map file or a saved .size file.

  Args:
    path: Input path ending in ".size" or ".map", optionally with ".gz".
    output_directory: Optional path to the root build directory; used only to
        detect the tool prefix for .map inputs.
    tool_prefix: Optional path prefix for c++filt; used only for .map inputs.

  Returns:
    The loaded or parsed result.

  Raises:
    Exception: If |path| is neither a .map nor a .size file.
  """
  if _EndsWithMaybeGz(path, '.size'):
    logging.info('Loading cached results.')
    with _OpenMaybeGz(path) as size_file:
      result = _LoadResults(size_file)
  elif _EndsWithMaybeGz(path, '.map'):
    # Verify tool_prefix early, before the map file is parsed.
    tool_prefix = _DetectToolPrefix(tool_prefix, path, output_directory)

    with _OpenMaybeGz(path) as map_file:
      result = parsers.MapFileParser().Parse(map_file)

    symbol_group = result.symbol_group
    logging.info('Calculating padding')
    _RemoveDuplicatesAndCalculatePadding(symbol_group)
    # Map file for some reason doesn't unmangle all names. The unmangle step
    # prints its own log statement.
    _UnmangleRemainingSymbols(symbol_group, tool_prefix)
    logging.info('Normalizing names')
    _NormalizeNames(symbol_group)
    logging.info('Normalizing paths')
    _NormalizeObjectPaths(symbol_group)
  else:
    raise Exception('Expected input to be a .map or a .size')

  # Skip computing the stats when INFO messages would be discarded anyway.
  if logging.getLogger().isEnabledFor(logging.INFO):
    _PrintStats(result, lambda msg: logging.info(msg.rstrip()))
  logging.info('Finished analyzing %d symbols', len(result.symbol_group))
  return result
418
419
def main():
  """Command-line entry point: analyzes the input and saves a .size file.

  Raises:
    Exception: If --output does not end in .size or .size.gz.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument(
      '--output', required=True,
      help='Path to store results. Must end in .size or .size.gz')
  AddOptions(arg_parser)
  helpers.AddCommonOptions(arg_parser)
  options = arg_parser.parse_args()
  # Reject a bad output path before doing any of the analysis work.
  if not _EndsWithMaybeGz(options.output, '.size'):
    raise Exception('--output must end with .size or .size.gz')
  helpers.HandleCommonOptions(options)

  analysis = AnalyzeWithArgs(options)
  logging.info('Saving result to %s', options.output)
  with _OpenMaybeGz(options.output, 'wb') as out_file:
    _SaveResult(analysis, out_file)

  logging.info('Done. Peak RAM usage was %d MB.', helpers.GetPeakRamUsage())
438
439
# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698