Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(164)

Side by Side Diff: tools/binary_size/run_binary_size_analysis.py

Issue 119083006: Add tool to help analyze binary size (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: <ol> -> <ul> for README.txt shortcomings list Created 6 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/python
2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 """Generate a spatial analysis against an arbitrary library.
7
8 To use, build the 'binary_size_tool' target. Then run this tool, passing
9 in the location of the library to be analyzed along with any other options
10 you desire.
11 """
12
13 import collections
14 import fileinput
15 import json
16 import optparse
17 import os
18 import pprint
19 import re
20 import shutil
21 import subprocess
22 import sys
23 import tempfile
24
25
def FormatBytes(bytes):
  """Render a byte count as a short human-readable string.

  Counts above 1e6 are shown in megabytes with an 'm' suffix, counts
  above 1e3 in kilobytes with a 'k' suffix, and anything else verbatim.
  """
  for threshold, divisor, suffix in ((1e6, 1.0e6, 'm'), (1e3, 1.0e3, 'k')):
    if bytes > threshold:
      return '%.1f%s' % (bytes / divisor, suffix)
  return str(bytes)
35
36
def SymbolTypeToHuman(type):
  """Convert a symbol type as printed by nm into a human-readable name.

  Raises KeyError for any type character outside the known set.
  """
  names = {
      'b': 'bss',
      'd': 'data',
      'r': 'read-only data',
      't': 'code',
      'w': 'weak symbol',
      'v': 'weak symbol',
  }
  return names[type]
45
46
def ParseNm(input):
  """Parse nm output.

  Argument: an iterable over lines of nm output.

  Yields: (symbol name, symbol type, symbol size, source file path).
  Path may be None if nm couldn't figure out the source file.
  """

  # Match lines with size, symbol, optional location, optional discriminator.
  # Address and size are fixed-width 8 hex digits (i.e. 32-bit nm output).
  sym_re = re.compile(r'^[0-9a-f]{8} '          # address (8 hex digits)
                      '([0-9a-f]{8}) '           # size (8 hex digits)
                      '(.) '                     # symbol type, one character
                      '([^\t]+)'                 # symbol name, separated from next by tab
                      '(?:\t(.*):[\d\?]+)?.*$')  # location
  # Match lines with addr but no size.
  addr_re = re.compile(r'^[0-9a-f]{8} (.) ([^\t]+)(?:\t.*)?$')
  # Match lines that don't have an address at all -- typically external symbols.
  noaddr_re = re.compile(r'^ {8} (.) (.*)$')

  for line in input:
    line = line.rstrip()
    match = sym_re.match(line)
    if match:
      size, type, sym = match.groups()[0:3]
      size = int(size, 16)  # nm prints sizes in hex
      type = type.lower()
      if type == 'v':
        type = 'w'  # just call them all weak
      if type == 'b':
        continue  # skip all BSS for now
      path = match.group(4)  # None when nm printed no source location
      yield sym, type, size, path
      continue
    match = addr_re.match(line)
    if match:
      type, sym = match.groups()[0:2]
      # No size == we don't care.
      continue
    match = noaddr_re.match(line)
    if match:
      type, sym = match.groups()
      if type in ('U', 'w'):
        # external or weak symbol
        continue
      # NOTE(review): a no-address line with any other type falls through
      # to the 'unparsed' warning below -- presumably intentional; confirm.

    print >>sys.stderr, 'unparsed:', repr(line)
94
95
def TreeifySymbols(symbols):
  """Convert symbols into a path-based tree, calculating size information
  along the way.

  The result is a dictionary that contains two kinds of nodes:
  1. Leaf nodes, representing source code locations (e.g., c++ files)
     These nodes have the following dictionary entries:
       sizes: a dictionary whose keys are categories (such as code, data,
              vtable, etceteras) and whose values are the size, in bytes, of
              those categories;
       size:  the total size, in bytes, of all the entries in the sizes dict
  2. Non-leaf nodes, representing directories
     These nodes have the following dictionary entries:
       children: a dictionary whose keys are names (path entries; either
                 directory or file names) and whose values are other nodes;
       size:     the total size, in bytes, of all the leaf nodes that are
                 contained within the children dict (recursively expanded)

  The result object is itself a dictionary that represents the common ancestor
  of all child nodes, e.g. a path to which all other nodes beneath it are
  relative. The 'size' attribute of this dict yields the sum of the size of
  all leaf nodes within the data structure.
  """
  dirs = {'children': {}, 'size': 0}
  for sym, type, size, path in symbols:
    dirs['size'] += size
    if path:
      path = os.path.normpath(path)
      # Drop a leading '/' so every symbol hangs off a single root node.
      if path.startswith('/'):
        path = path[1:]

    parts = None
    if path:
      parts = path.split('/')

    if parts:
      assert path
      file_key = parts.pop()
      tree = dirs
      try:
        # Traverse the tree to the parent of the file node, creating as needed
        for part in parts:
          assert part != ''
          if part not in tree['children']:
            tree['children'][part] = {'children': {}, 'size': 0}
          tree = tree['children'][part]
          tree['size'] += size

        # Get (creating if necessary) the node for the file
        # This node doesn't have a 'children' attribute
        if file_key not in tree['children']:
          tree['children'][file_key] = {'sizes': collections.defaultdict(int),
                                        'size': 0}
        tree = tree['children'][file_key]
        tree['size'] += size

        # Accumulate size into a bucket within the file
        if 'vtable for ' in sym:
          tree['sizes']['[vtable]'] += size
        elif type in ('r', 'R'):
          tree['sizes']['[rodata]'] += size
        elif type in ('d', 'D'):
          tree['sizes']['[data]'] += size
        elif type in ('b', 'B'):
          tree['sizes']['[bss]'] += size
        elif type in ('t', 'T'):
          # 'text' in binary parlance means 'code'.
          tree['sizes']['[code]'] += size
        elif type in ('w', 'W'):
          tree['sizes']['[weak]'] += size
        else:
          tree['sizes']['[other]'] += size
      except Exception:
        # BUG FIX: the original handler printed the undefined name 'key'
        # here, raising a NameError that masked the real exception. Log
        # the failing symbol and path parts, then re-raise.
        sys.stderr.write('%s %s %s\n' % (sym, parts, file_key))
        raise
    else:
      # Symbols with no path at all are grouped under one pseudo-leaf,
      # bucketed by a few well-known symbol-name patterns.
      key = 'symbols without paths'
      if key not in dirs['children']:
        dirs['children'][key] = {'sizes': collections.defaultdict(int),
                                 'size': 0}
      tree = dirs['children'][key]
      subkey = 'misc'
      if (sym.endswith('::__FUNCTION__') or
          sym.endswith('::__PRETTY_FUNCTION__')):
        subkey = '__FUNCTION__'
      elif sym.startswith('CSWTCH.'):
        subkey = 'CSWTCH'
      elif '::' in sym:
        # Group by the outermost namespace/class prefix, e.g. 'base::'.
        subkey = sym[0:sym.find('::') + 2]
      tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size
      tree['size'] += size
  return dirs
188
189
def JsonifyTree(tree, name):
  """Convert TreeifySymbols output to a JSON treemap.

  The format is very similar, with the notable exceptions being
  lists of children instead of maps and some different attribute names."""
  # CSS classes applied to the well-known size buckets in the treemap.
  kind_to_css = {
      '[vtable]': 'vtable',
      '[rodata]': 'read-only_data',
      '[data]': 'data',
      '[bss]': 'bss',
      '[code]': 'code',
      '[weak]': 'weak_symbol',
  }

  children = []
  if 'children' in tree:
    # Non-leaf (directory) node: recurse into each child.
    for child_name, child in tree['children'].items():
      children.append(JsonifyTree(child, child_name))
  else:
    # Leaf (file) node: each per-category size becomes a treemap entry.
    for kind, size in tree['sizes'].items():
      entry = {'name': kind + ' (' + FormatBytes(size) + ')',
               'data': {'$area': size}}
      css_class = kind_to_css.get(kind)
      if css_class is not None:
        entry['data']['$symbol'] = css_class
      children.append(entry)
  # Sort children by size, largest to smallest.
  children.sort(key=lambda child: -child['data']['$area'])

  # For leaf nodes, the 'size' attribute is the size of the leaf;
  # Non-leaf nodes don't really have a size, but their 'size' attribute is
  # the sum of the sizes of all their children.
  return {'name': name + ' (' + FormatBytes(tree['size']) + ')',
          'data': {'$area': tree['size']},
          'children': children}
224
225
def DumpTreemap(symbols, outfile):
  """Write the symbol tree to outfile as a webtreemap JS data file ('kTree')."""
  dirs = TreeifySymbols(symbols)
  # 'with' guarantees the file is flushed and closed even on error.
  with open(outfile, 'w') as out:
    out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))
234
235
def DumpLargestSymbols(symbols, outfile, n):
  """Write a JS file listing the n largest non-bss, non-weak symbols."""
  # Rank (sym, type, size, path) tuples by size, descending.
  ranked = sorted(symbols, key=lambda record: -record[2])
  count = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSymbols = [\n')
    for sym, type, size, path in ranked:
      if type in ('b', 'w'):
        continue  # skip bss and weak symbols
      entry = {'size': FormatBytes(size),
               'symbol': sym,
               'type': SymbolTypeToHuman(type),
               'location': path if path is not None else ''}
      out.write(json.dumps(entry))
      out.write(',\n')
      count += 1
      if count >= n:
        break
  finally:
    # Always terminate the JS array, even if we stopped early or failed.
    out.write('];\n')
    out.flush()
    out.close()
261
262
def MakeSourceMap(symbols):
  """Aggregate symbols by source path.

  Returns a dict keyed by normalized path ('[no path]' for symbols without
  one); each value holds the first path seen for that key plus the total
  'size' and 'symbol_count' accumulated for it.
  """
  sources = {}
  for sym, type, size, path in symbols:
    key = os.path.normpath(path) if path else '[no path]'
    record = sources.setdefault(key,
                                {'path': path, 'symbol_count': 0, 'size': 0})
    record['size'] += size
    record['symbol_count'] += 1
  return sources
277
278
def DumpLargestSources(symbols, outfile, n):
  """Write a JS file listing the n largest source files by total size."""
  # Renamed from 'map' -- the original shadowed the builtin of that name.
  source_map = MakeSourceMap(symbols)
  sources = sorted(source_map.values(), key=lambda x: -x['size'])
  dumped = 0
  out = open(outfile, 'w')
  try:
    out.write('var largestSources = [\n')
    for record in sources:
      entry = {'size': FormatBytes(record['size']),
               'symbol_count': str(record['symbol_count']),
               'location': record['path']}
      out.write(json.dumps(entry))
      out.write(',\n')
      dumped += 1
      if dumped >= n:
        return
  finally:
    # Always terminate the JS array, even on early return or error.
    out.write('];\n')
    out.flush()
    out.close()
299
300
def DumpLargestVTables(symbols, outfile, n):
  """Write a JS file listing the n largest vtables."""
  # Collect only vtable symbols, then rank them by size, descending.
  vtables = [{'symbol': symbol, 'path': path, 'size': size}
             for symbol, type, size, path in symbols
             if 'vtable for ' in symbol]
  vtables.sort(key=lambda x: -x['size'])
  out = open(outfile, 'w')
  try:
    out.write('var largestVTables = [\n')
    for rank, record in enumerate(vtables):
      if rank >= n:
        break
      out.write(json.dumps({'size': FormatBytes(record['size']),
                            'symbol': record['symbol'],
                            'location': record['path']}))
      out.write(',\n')
  finally:
    # Always terminate the JS array, even if we stopped early or failed.
    out.write('];\n')
    out.flush()
    out.close()
324
325
def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):
  """Run a parallel addr2line processing engine to dump and resolve symbols."""
  # The engine is a Java tool; its jar is located under the standard
  # Chromium output directory (overridable via environment variables).
  out_dir = os.getenv('CHROMIUM_OUT_DIR', 'out')
  build_type = os.getenv('BUILDTYPE', 'Release')
  classpath = os.path.join(out_dir, build_type, 'lib.java',
                           'binary_size_java.jar')
  cmd = ['java',
         '-classpath', classpath,
         'org.chromium.tools.binary_size.ParallelAddress2Line',
         '--disambiguate',
         '--outfile', outfile,
         '--library', library,
         '--threads', jobs]  # jobs is passed through as a string
  if verbose is True:
    cmd.append('--verbose')
  # For Android targets, pick nm/addr2line binaries matching the target
  # architecture out of the NDK; otherwise rely on whatever is on PATH.
  prefix = os.path.join('third_party', 'android_tools', 'ndk', 'toolchains')
  if arch == 'android-arm':
    prefix = os.path.join(prefix, 'arm-linux-androideabi-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'arm-linux-androideabi-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  elif arch == 'android-mips':
    prefix = os.path.join(prefix, 'mipsel-linux-android-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'mipsel-linux-android-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  elif arch == 'android-x86':
    prefix = os.path.join(prefix, 'x86-4.7', 'prebuilt',
                          'linux-x86_64', 'bin', 'i686-linux-android-')
    cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])
  # else, use whatever is in PATH (don't pass --nm or --addr2line)

  if verbose:
    print cmd

  # Blocks until the Java tool finishes; stdout/stderr are inherited.
  return_code = subprocess.call(cmd)
  if return_code:
    raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +
                       str(return_code))
363
364
def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):
  """Return a list of parsed symbol tuples (sym, type, size, path).

  If infile is None, symbols are first dumped from 'library' by the parallel
  addr2line engine into 'outfile' (or a temp file when outfile is None) and
  then parsed; otherwise 'infile' is assumed to already hold nm output.
  """
  if infile is None:
    if outfile is None:
      # NOTE(review): delete=False means this temp file is never removed,
      # despite the --nm-out help text claiming deletion -- confirm.
      infile = tempfile.NamedTemporaryFile(delete=False).name
    else:
      infile = outfile

    if verbose:
      print 'Running parallel addr2line, dumping symbols to ' + infile;
    RunParallelAddress2Line(outfile=infile, library=library, arch=arch,
                            jobs=jobs, verbose=verbose)
  elif verbose:
    print 'Using nm input from ' + infile
  # 'infile' is rebound here from the path string to the open file object.
  with file(infile, 'r') as infile:
    return list(ParseNm(infile))
380
381
def main():
  # Command-line driver: validate options, obtain symbols (either by dumping
  # the library or from pre-made nm output), then emit the report files into
  # --destdir.
  usage="""%prog [options]

  Runs a spatial analysis on a given library, looking up the source locations
  of its symbols and calculating how much space each directory, source file,
  and so on is taking. The result is a report that can be used to pinpoint
  sources of large portions of the binary, etceteras.

  Under normal circumstances, you only need to pass two arguments, thusly:

      %prog --library /path/to/library --destdir /path/to/output

  In this mode, the program will dump the symbols from the specified library
  and map those symbols back to source locations, producing a web-based
  report in the specified output directory.

  Other options are available via '--help'.
  """
  parser = optparse.OptionParser(usage=usage)
  parser.add_option('--nm-in', metavar='PATH',
                    help='if specified, use nm input from <path> instead of '
                    'generating it. Note that source locations should be '
                    'present in the file; i.e., no addr2line symbol lookups '
                    'will be performed when this option is specified. '
                    'Mutually exclusive with --library.')
  parser.add_option('--destdir', metavar='PATH',
                    help='write output to the specified directory. An HTML '
                    'report is generated here along with supporting files; '
                    'any existing report will be overwritten.')
  parser.add_option('--library', metavar='PATH',
                    help='if specified, process symbols in the library at '
                    'the specified path. Mutually exclusive with --nm-in.')
  parser.add_option('--arch',
                    help='the architecture that the library is targeted to. '
                    'Determines which nm/addr2line binaries are used. When '
                    '\'host-native\' is chosen, the program will use whichever '
                    'nm/addr2line binaries are on the PATH. This is '
                    'appropriate when you are analyzing a binary by and for '
                    'your computer. '
                    'This argument is only valid when using --library. '
                    'Default is \'host-native\'.',
                    choices=['host-native', 'android-arm',
                             'android-mips', 'android-x86'],)
  parser.add_option('--jobs',
                    help='number of jobs to use for the parallel '
                    'addr2line processing pool; defaults to 1. More '
                    'jobs greatly improve throughput but eat RAM like '
                    'popcorn, and take several gigabytes each. Start low '
                    'and ramp this number up until your machine begins to '
                    'struggle with RAM. '
                    'This argument is only valid when using --library.')
  parser.add_option('-v', dest='verbose', action='store_true',
                    help='be verbose, printing lots of status information.')
  parser.add_option('--nm-out', metavar='PATH',
                    help='keep the nm output file, and store it at the '
                    'specified path. This is useful if you want to see the '
                    'fully processed nm output after the symbols have been '
                    'mapped to source locations. By default, a tempfile is '
                    'used and is deleted when the program terminates.'
                    'This argument is only valid when using --library.')
  opts, args = parser.parse_args()

  # --library and --nm-in are the two mutually-exclusive symbol sources.
  if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):
    parser.error('exactly one of --library or --nm-in is required')
  if (opts.nm_in):
    # Warn about options that only apply to the --library path.
    if opts.jobs:
      print >> sys.stderr, ('WARNING: --jobs has no effect '
                            'when used with --nm-in')
    if opts.arch:
      print >> sys.stderr, ('WARNING: --arch has no effect '
                            'when used with --nm-in')
  if not opts.destdir:
    parser.error('--destdir is required argument')
  if not opts.jobs:
    # Jobs is kept as a string; it is passed straight to the Java tool.
    opts.jobs = '1'
  if not opts.arch:
    opts.arch = 'host-native'

  symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,
                         opts.jobs, opts.verbose is True)
  if not os.path.exists(opts.destdir):
    os.makedirs(opts.destdir, 0755)

  # Emit the report data files (each is a small JS file defining one var).
  DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))
  DumpLargestSymbols(symbols,
                     os.path.join(opts.destdir, 'largest-symbols.js'), 100)
  DumpLargestSources(symbols,
                     os.path.join(opts.destdir, 'largest-sources.js'), 100)
  DumpLargestVTables(symbols,
                     os.path.join(opts.destdir, 'largest-vtables.js'), 100)

  # Copy the static webtreemap viewer assets next to the generated data.
  # TODO(andrewhayden): Switch to D3 for greater flexibility
  treemap_out = os.path.join(opts.destdir, 'webtreemap')
  if not os.path.exists(treemap_out):
    os.makedirs(treemap_out, 0755)
  # NOTE(review): these paths are relative, so this presumably must be run
  # from the chromium src root -- confirm.
  treemap_src = os.path.join('third_party', 'webtreemap', 'src',
                             'webtreemap-gh-pages')
  shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)
  shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)
  shutil.copy(os.path.join('tools', 'binary_size', 'template', 'index.html'),
              opts.destdir)
  if opts.verbose:
    print 'Report saved to ' + opts.destdir + '/index.html'
486
487
if __name__ == '__main__':
  # main() returns None on success; sys.exit(None) exits with status 0.
  sys.exit(main())
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698