Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(298)

Side by Side Diff: tools/binary_size/run_binary_size_analysis.py

Issue 119083006: Add tool to help analyze binary size (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Marcus' comments Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/python
2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Generate a spatial analysis against an arbitrary library.
7
8 To use, build the 'binary_size_tool' target. Then run this tool, passing
9 in the location of the library to be analyzed along with any other options
10 you desire.
11 """
12
13 import collections
14 import fileinput
15 import json
16 import optparse
17 import os
18 import pprint
19 import re
20 import shutil
21 import subprocess
22 import sys
23 import tempfile
24
25
def FormatBytes(bytes):
  """Pretty-print a number of bytes.

  Values above 1e6 are shown in megabytes ('1.2m'), values above 1e3 in
  kilobytes ('3.4k'), and anything else as a plain decimal string.
  """
  for threshold, divisor, suffix in ((1e6, 1.0e6, 'm'), (1e3, 1.0e3, 'k')):
    if bytes > threshold:
      return '%.1f%s' % (bytes / divisor, suffix)
  return str(bytes)
35
36
def SymbolTypeToHuman(type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for any type not in the known set.
  """
  human_names = {
      'b': 'bss',
      'd': 'data',
      'r': 'read-only data',
      't': 'code',
      'w': 'weak symbol',
      'v': 'weak symbol',
  }
  return human_names[type]
45
46
def ParseNm(input):
  """Parse nm output.

  Argument: an iterable over lines of nm output.

  Yields: (symbol name, symbol type, symbol size, source file path).
  Path may be None if nm couldn't figure out the source file.
  """

  # Match lines with size, symbol, optional location, optional discriminator.
  # Addresses and sizes are 8 hex digits on 32-bit binaries and 16 on 64-bit,
  # so accept either width.
  sym_re = re.compile(r'^[0-9a-f]{8,16} '   # address
                      r'([0-9a-f]{8,16}) '  # size
                      r'(.) '               # symbol type, one character
                      r'([^\t]+)'           # symbol name, separated from next by tab
                      r'(?:\t(.*):[\d\?]+)?.*$')  # location
  # Match lines with addr but no size.
  addr_re = re.compile(r'^[0-9a-f]{8,16} (.) ([^\t]+)(?:\t.*)?$')
  # Match lines that don't have an address at all -- typically external symbols.
  noaddr_re = re.compile(r'^ {8,16} (.) (.*)$')

  for line in input:
    line = line.rstrip()
    match = sym_re.match(line)
    if match:
      size, type, sym = match.groups()[0:3]
      size = int(size, 16)
      type = type.lower()
      if type == 'v':
        type = 'w'  # just call them all weak
      if type == 'b':
        continue  # skip all BSS for now
      path = match.group(4)
      yield sym, type, size, path
      continue
    match = addr_re.match(line)
    if match:
      # No size == we don't care.
      continue
    match = noaddr_re.match(line)
    if match:
      type, sym = match.groups()
      if type in ('U', 'w'):
        # external or weak symbol
        continue

    # Anything that falls through is unexpected; report it for debugging.
    # sys.stderr.write works identically on Python 2 and 3 (the original
    # used the Python 2-only 'print >>' statement).
    sys.stderr.write('unparsed: %r\n' % line)
94
95
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of all
  leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      # Make the path relative so it splits into clean components below.
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        type = type.lower()
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif 'r' == type:
          tree['sizes']['[rodata]'] += size
        elif 'd' == type:
          tree['sizes']['[data]'] += size
        elif 'b' == type:
          tree['sizes']['[bss]'] += size
        elif 't' == type:
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif 'w' == type:
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except Exception:
        # BUG FIX: the original referenced an undefined name 'key' here,
        # which raised NameError and masked the real failure. Print the
        # variables that actually exist in this branch, then re-raise.
        sys.stderr.write('%s %s %s\n' % (sym, parts, file_key))
        raise
    else:
      # Symbols with no source location are lumped together under one node,
      # bucketed by a best-effort guess at what kind of symbol they are.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
          sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by the outermost namespace/class, e.g. 'base::'.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs
189
190
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names.

  Args:
    tree: a node dict as produced by TreeifySymbols.
    name: display name for this node.
  Returns:
    A dict in webtreemap-compatible JSON form.
  """
  children = []
  # Maps size-bucket names to CSS classes used by the treemap for coloring.
  css_class_map = {
      '[vtable]': 'vtable',
      '[rodata]': 'read-only_data',
      '[data]': 'data',
      '[bss]': 'bss',
      '[code]': 'code',
      '[weak]': 'weak_symbol',
  }
  if 'children' in tree:
    # Non-leaf node. Recurse.
    # .items() (not the Python 2-only .iteritems()) keeps this portable.
    for child_name, child in tree['children'].items():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf node; dump per-file stats as entries in the treemap.
    for kind, size in tree['sizes'].items():
      child_json = {'name': kind + ' (' + FormatBytes(size) + ')',
                    'data': {'$area': size}}
      css_class = css_class_map.get(kind)
      if css_class is not None:
        child_json['data']['$symbol'] = css_class
      children.append(child_json)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': {'$area': tree['size']},
          'children': children}
226
227
def DumpTreemap(symbols, outfile):
  """Write a webtreemap JS data file ('var kTree = ...') for the symbols.

  Args:
    symbols: iterable of (symbol, type, size, path) tuples.
    outfile: path of the JavaScript file to write (overwritten).
  """
  dirs = TreeifySymbols(symbols)
  # 'with' guarantees the file is flushed and closed, replacing the original
  # manual try/finally flush/close sequence.
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
236
237
def DumpLargestSymbols(symbols, outfile, n):
  """Write a JS file listing up to n largest symbols, largest first.

  BSS and weak symbols are skipped. Output is 'var largestSymbols = [...];'.
  """
  # a list of (sym, type, size, path); sort by size, descending.
  symbols = sorted(symbols, key=lambda x: -x[2])
  dumped = 0
  # 'with' replaces the original manual try/finally flush/close; 'break'
  # replaces a 'return' inside the try so the closing '];' is still written.
  with open(outfile, 'w') as out:
    out.write('var largestSymbols = [\n')
    for sym, type, size, path in symbols:
      if type in ('b', 'w'):
        continue  # skip bss and weak symbols
      if path is None:
        path = ''
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(type),
               'location': path}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
263
264
def MakeSourceMap(symbols):
  """Aggregate symbols by source path.

  Returns a dict keyed by normalized path (or '[no path]' for symbols with
  no source location); each value records the original 'path', the summed
  'size' in bytes, and the 'symbol_count' of symbols at that path.
  """
  sources = {}
  for sym, type, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    record = sources.setdefault(
        key, {'path': path, 'symbol_count': 0, 'size': 0})
    record['size'] += size
    record['symbol_count'] += 1
  return sources
279
280
def DumpLargestSources(symbols, outfile, n):
  """Write a JS file listing up to n largest source files by total size.

  Output is 'var largestSources = [...];'.
  """
  # Renamed from 'map' to avoid shadowing the builtin.
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  dumped = 0
  # 'with' replaces the original manual try/finally flush/close; 'break'
  # replaces a 'return' inside the try so the closing '];' is still written.
  with open(outfile, 'w') as out:
    out.write('var largestSources = [\n')
    for record in sources:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
301
302
def DumpLargestVTables(symbols, outfile, n):
  """Write a JS file listing up to n largest vtables, largest first.

  Output is 'var largestVTables = [...];'.
  """
  vtables = [{'symbol': symbol, 'path': path, 'size': size}
             for symbol, type, size, path in symbols
             if 'vtable for ' in symbol]
  vtables.sort(key=lambda x: -x['size'])
  dumped = 0
  # 'with' replaces the original manual try/finally flush/close; 'break'
  # replaces a 'return' inside the try so the closing '];' is still written.
  with open(outfile, 'w') as out:
    out.write('var largestVTables = [\n')
    for record in vtables:
      entry = {'size': FormatBytes(record['size']),
               'symbol': record['symbol'],
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        break
    out.write('];\n')
326
327
def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):
  """Run a parallel addr2line processing engine to dump and resolve symbols.

  Args:
    outfile: path where the resolved nm output is written.
    library: path of the library to analyze.
    arch: target architecture; selects which nm/addr2line binaries to use.
    jobs: number of worker threads (string, passed through to the tool).
    verbose: if True, pass --verbose and echo the command line.
  Raises:
    RuntimeError: if the Java tool exits with a nonzero status.
  """
  out_dir = os.getenv('CHROMIUM_OUT_DIR', 'out')
  build_type = os.getenv('BUILDTYPE', 'Release')
  classpath = os.path.join(out_dir, build_type, 'lib.java',
                           'binary_size_java.jar')
  cmd = ['java',
         '-classpath', classpath,
         'org.chromium.tools.binary_size.ParallelAddress2Line',
         '--disambiguate',
         '--outfile', outfile,
         '--library', library,
         '--threads', jobs]
  if verbose is True:
    cmd.append('--verbose')

  # Per-arch NDK toolchain directory and tool-name prefix, collapsing the
  # original copy-pasted if/elif branches into one table. Any other arch
  # (e.g. 'host-native') uses whatever nm/addr2line are on the PATH
  # (no --nm or --addr2line is passed).
  toolchains = {
      'android-arm': ('arm-linux-androideabi-4.7', 'arm-linux-androideabi-'),
      'android-mips': ('mipsel-linux-android-4.7', 'mipsel-linux-android-'),
      'android-x86': ('x86-4.7', 'i686-linux-android-'),
  }
  if arch in toolchains:
    toolchain_dir, tool_prefix = toolchains[arch]
    prefix = os.path.join('third_party', 'android_tools', 'ndk', 'toolchains',
                          toolchain_dir, 'prebuilt', 'linux-x86_64', 'bin',
                          tool_prefix)
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])

  if verbose:
    print(cmd)  # parenthesized form behaves identically on Python 2 and 3

  return_code = subprocess.call(cmd)
  if return_code:
    raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +
                       str(return_code))
365
366
def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):
  """Return the parsed symbol list, generating nm output first if needed.

  If infile is None, symbols are produced by running the parallel addr2line
  tool over 'library' (dumping to outfile, or a temp file if outfile is
  None). Otherwise infile is parsed directly and library/arch/jobs are
  unused.

  Returns: a list of (symbol name, type, size, path) tuples from ParseNm.
  """
  if infile is None:
    if outfile is None:
      # Keep the temp file around (delete=False) so it can be reopened below.
      infile = tempfile.NamedTemporaryFile(delete=False).name
    else:
      infile = outfile

    if verbose:
      print('Running parallel addr2line, dumping symbols to ' + infile)
    RunParallelAddress2Line(outfile=infile, library=library, arch=arch,
                            jobs=jobs, verbose=verbose)
  elif verbose:
    print('Using nm input from ' + infile)

  # open() instead of the Python 2-only file() builtin; 'with' closes the
  # handle, and a distinct name avoids shadowing the 'infile' path.
  with open(infile, 'r') as nm_input:
    return list(ParseNm(nm_input))
382
383
def main():
  """Parse options, run the analysis, and emit the HTML report."""
  usage = """%prog [options]

  Runs a spatial analysis on a given library, looking up the source locations
  of its symbols and calculating how much space each directory, source file,
  and so on is taking. The result is a report that can be used to pinpoint
  sources of large portions of the binary, etceteras.

  Under normal circumstances, you only need to pass two arguments, thusly:

    %prog --library /path/to/library --destdir /path/to/output

  In this mode, the program will dump the symbols from the specified library
  and map those symbols back to source locations, producing a web-based
  report in the specified output directory.

  Other options are available via '--help'.
  """
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--nm-in', metavar='PATH',
                    help='if specified, use nm input from <path> instead of '
                    'generating it. Note that source locations should be '
                    'present in the file; i.e., no addr2line symbol lookups '
                    'will be performed when this option is specified. '
                    'Mutually exclusive with --library.')
  parser.add_option('--destdir', metavar='PATH',
                    help='write output to the specified directory. An HTML '
                    'report is generated here along with supporting files; '
                    'any existing report will be overwritten.')
  parser.add_option('--library', metavar='PATH',
                    help='if specified, process symbols in the library at '
                    'the specified path. Mutually exclusive with --nm-in.')
  parser.add_option('--arch',
                    help='the architecture that the library is targeted to. '
                    'Determines which nm/addr2line binaries are used. When '
                    '\'host-native\' is chosen, the program will use whichever '
                    'nm/addr2line binaries are on the PATH. This is '
                    'appropriate when you are analyzing a binary by and for '
                    'your computer. '
                    'This argument is only valid when using --library. '
                    'Default is \'host-native\'.',
                    choices=['host-native', 'android-arm',
                             'android-mips', 'android-x86'])
  parser.add_option('--jobs',
                    help='number of jobs to use for the parallel '
                    'addr2line processing pool; defaults to 1. More '
                    'jobs greatly improve throughput but eat RAM like '
                    'popcorn, and take several gigabytes each. Start low '
                    'and ramp this number up until your machine begins to '
                    'struggle with RAM. '
                    'This argument is only valid when using --library.')
  parser.add_option('-v', dest='verbose', action='store_true',
                    help='be verbose, printing lots of status information.')
  parser.add_option('--nm-out', metavar='PATH',
                    help='keep the nm output file, and store it at the '
                    'specified path. This is useful if you want to see the '
                    'fully processed nm output after the symbols have been '
                    'mapped to source locations. By default, a tempfile is '
                    'used and is deleted when the program terminates.'
                    'This argument is only valid when using --library.')
  opts, args = parser.parse_args()

  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
    parser.error('exactly one of --library or --nm-in is required')
  if opts.nm_in:
    # --jobs and --arch only affect symbol generation, which --nm-in skips.
    # sys.stderr.write works on both Python 2 and 3 (the original used the
    # Python 2-only 'print >>' statement).
    if opts.jobs:
      sys.stderr.write('WARNING: --jobs has no effect '
                       'when used with --nm-in\n')
    if opts.arch:
      sys.stderr.write('WARNING: --arch has no effect '
                       'when used with --nm-in\n')
  if not opts.destdir:
    parser.error('--destdir is required argument')
  if not opts.jobs:
    opts.jobs = '1'
  if not opts.arch:
    opts.arch = 'host-native'

  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,
                         opts.jobs, opts.verbose is True)
  if not os.path.exists(opts.destdir):
    # 0o755 (valid in Python 2.6+ and 3) replaces the old-style 0755 literal,
    # which is a syntax error under Python 3; same permission bits.
    os.makedirs(opts.destdir, 0o755)

  DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
  DumpLargestSymbols(symbols,
                     os.path.join(opts.destdir, 'largest-symbols.js'), 100)
  DumpLargestSources(symbols,
                     os.path.join(opts.destdir, 'largest-sources.js'), 100)
  DumpLargestVTables(symbols,
                     os.path.join(opts.destdir, 'largest-vtables.js'), 100)

  # TODO(andrewhayden): Switch to D3 for greater flexibility
  treemap_out = os.path.join(opts.destdir, 'webtreemap')
  if not os.path.exists(treemap_out):
    os.makedirs(treemap_out, 0o755)
  treemap_src = os.path.join('third_party', 'webtreemap', 'src',
                             'webtreemap-gh-pages')
  shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
  shutil.copy(os.path.join('tools', 'binary_size', 'template', 'index.html'),
              opts.destdir)
  if opts.verbose:
    print('Report saved to ' + opts.destdir + '/index.html')


if __name__ == '__main__':
  sys.exit(main())
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698