tools/binary_size/run_binary_size_analysis.py - Issue 119083006: Add tool to help analyze binary size

Side by Side Diff: tools/binary_size/run_binary_size_analysis.py

Issue 119083006: Add tool to help analyze binary size (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Trivial text update to README.txt Created 6 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/python

	2 # Copyright 2014 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 """Generate a spatial analysis against an arbitrary library.

	7

	8 To use, build the 'binary_size_tool' target. Then run this tool, passing

	9 in the location of the library to be analyzed along with any other options

	10 you desire.

	11 """

	12

	13 import collections

	14 import fileinput

	15 import json

	16 import optparse

	17 import os

	18 import pprint

	19 import re

	20 import shutil

	21 import subprocess

	22 import sys

	23 import tempfile

	24

	25

	26 def FormatBytes(bytes):

	27 """Pretty-print a number of bytes."""

	28 if bytes > 1e6:

	29 bytes = bytes / 1.0e6

	30 return '%.1fm' % bytes

	31 if bytes > 1e3:

	32 bytes = bytes / 1.0e3

	33 return '%.1fk' % bytes

	34 return str(bytes)

	35

	36

	37 def SymbolTypeToHuman(type):

	38 """Convert a symbol type as printed by nm into a human-readable name."""

	39 return {'b': 'bss',

	40 'd': 'data',

	41 'r': 'read-only data',

	42 't': 'code',

	43 'w': 'weak symbol',

	44 'v': 'weak symbol'}[type]

	45

	46

	47 def ParseNm(input):

	48 """Parse nm output.

	49

	50 Argument: an iterable over lines of nm output.

	51

	52 Yields: (symbol name, symbol type, symbol size, source file path).

	53 Path may be None if nm couldn't figure out the source file.

	54 """

	55

	56 # Match lines with size, symbol, optional location, optional discriminator

	57 sym_re = re.compile(r'^[0-9a-f]{8} ' # address (8 hex digits)

	58 '([0-9a-f]{8}) ' # size (8 hex digits)

	59 '(.) ' # symbol type, one character

	60 '([^\t]+)' # symbol name, separated from next by tab

	61 '(?:\t(.):[\d\?]+)?.$') # location

	62 # Match lines with addr but no size.

	63 addr_re = re.compile(r'^[0-9a-f]{8} (.) ([^\t]+)(?:\t.*)?$')

	64 # Match lines that don't have an address at all -- typically external symbols.

	65 noaddr_re = re.compile(r'^ {8} (.) (.*)$')

	66

	67 for line in input:

	68 line = line.rstrip()

	69 match = sym_re.match(line)

	70 if match:

	71 size, type, sym = match.groups()[0:3]

	72 size = int(size, 16)

	73 type = type.lower()

	74 if type == 'v':

	75 type = 'w' # just call them all weak

	76 if type == 'b':

	77 continue # skip all BSS for now

	78 path = match.group(4)

	79 yield sym, type, size, path

	80 continue

	81 match = addr_re.match(line)

	82 if match:

	83 type, sym = match.groups()[0:2]

	84 # No size == we don't care.

	85 continue

	86 match = noaddr_re.match(line)

	87 if match:

	88 type, sym = match.groups()

	89 if type in ('U', 'w'):

	90 # external or weak symbol

	91 continue

	92

	93 print >>sys.stderr, 'unparsed:', repr(line)

	94

	95

	96 def TreeifySymbols(symbols):

	97 """Convert symbols into a path-based tree, calculating size information

	98 along the way.

	99

	100 The result is a dictionary that contains two kinds of nodes:

	101 1. Leaf nodes, representing source code locations (e.g., c++ files)

	102 These nodes have the following dictionary entries:

	103 sizes: a dictionary whose keys are categories (such as code, data,

	104 vtable, etceteras) and whose values are the size, in bytes, of

	105 those categories;

	106 size: the total size, in bytes, of all the entries in the sizes dict

	107 2. Non-leaf nodes, representing directories

	108 These nodes have the following dictionary entries:

	109 children: a dictionary whose keys are names (path entries; either

	110 directory or file names) and whose values are other nodes;

	111 size: the total size, in bytes, of all the leaf nodes that are

	112 contained within the children dict (recursively expanded)

	113

	114 The result object is itself a dictionary that represents the common ancestor

	115 of all child nodes, e.g. a path to which all other nodes beneath it are

	116 relative. The 'size' attribute of this dict yields the sum of the size of all

	117 leaf nodes within the data structure.

	118 """

	119 dirs = {'children': {}, 'size': 0}

	120 for sym, type, size, path in symbols:

	121 dirs['size'] += size

	122 if path:

	123 path = os.path.normpath(path)

	124 if path.startswith('/'):

	125 path = path[1:]

	126

	127 parts = None

	128 if path:

	129 parts = path.split('/')

	130

	131 if parts:

	132 assert path

	133 file_key = parts.pop()

	134 tree = dirs

	135 try:

	136 # Traverse the tree to the parent of the file node, creating as needed

	137 for part in parts:

	138 assert part != ''

	139 if part not in tree['children']:

	140 tree['children'][part] = {'children': {}, 'size': 0}

	141 tree = tree['children'][part]

	142 tree['size'] += size

	143

	144 # Get (creating if necessary) the node for the file

	145 # This node doesn't have a 'children' attribute

	146 if file_key not in tree['children']:

	147 tree['children'][file_key] = {'sizes': collections.defaultdict(int),

	148 'size': 0}

	149 tree = tree['children'][file_key]

	150 tree['size'] += size

	151

	152 # Accumulate size into a bucket within the file

	153 if 'vtable for ' in sym:

	154 tree['sizes']['[vtable]'] += size

	155 elif 'r' == type or 'R' == type:
	bulach 2014/01/16 15:01:59 ok, let's leave the map for a v2, but it'd be simp ok, let's leave the map for a v2, but it'd be simpler to just have: type = type.lower() before the if chain and check for just one of them. Andrew Hayden (chromium.org) 2014/01/16 15:13:00 Ha, uh duh, yes. Sorry :) Show quoted text On 2014/01/16 15:01:59, bulach wrote: > ok, let's leave the map for a v2, but it'd be simpler to just have: > type = type.lower() before the if chain and check for just one of them. Ha, uh duh, yes. Sorry :)
	156 tree['sizes']['[rodata]'] += size

	157 elif 'd' == type or 'D' == type:

	158 tree['sizes']['[data]'] += size

	159 elif 'b' == type or 'B' == type:

	160 tree['sizes']['[bss]'] += size

	161 elif 't' == type or 'T' == type:

	162 # 'text' in binary parlance means 'code'.

	163 tree['sizes']['[code]'] += size

	164 elif 'w' == type or 'W' == type:

	165 tree['sizes']['[weak]'] += size

	166 else:

	167 tree['sizes']['[other]'] += size

	168 except:

	169 print >>sys.stderr, sym, parts, key

	170 raise

	171 else:

	172 key = 'symbols without paths'

	173 if key not in dirs['children']:

	174 dirs['children'][key] = {'sizes': collections.defaultdict(int),

	175 'size': 0}

	176 tree = dirs['children'][key]

	177 subkey = 'misc'

	178 if (sym.endswith('::__FUNCTION__') or

	179 sym.endswith('::__PRETTY_FUNCTION__')):

	180 subkey = '__FUNCTION__'

	181 elif sym.startswith('CSWTCH.'):

	182 subkey = 'CSWTCH'

	183 elif '::' in sym:

	184 subkey = sym[0:sym.find('::') + 2]

	185 tree['sizes'][subkey] = tree['sizes'].get(subkey, 0) + size

	186 tree['size'] += size

	187 return dirs

	188

	189

	190 def JsonifyTree(tree, name):

	191 """Convert TreeifySymbols output to a JSON treemap.

	192

	193 The format is very similar, with the notable exceptions being
	bulach 2014/01/16 15:01:59 nit: needs to be aligned with the """, i.e., inden nit: needs to be aligned with the """, i.e., indented by 2 Andrew Hayden (chromium.org) 2014/01/16 15:13:00 Done. Show quoted text On 2014/01/16 15:01:59, bulach wrote: > nit: needs to be aligned with the """, i.e., indented by 2 Done.
	194 lists of children instead of maps and some different attribute names."""

	195 children = []

	196 if 'children' in tree:

	197 # Non-leaf node. Recurse.

	198 for child_name, child in tree['children'].iteritems():

	199 children.append(JsonifyTree(child, child_name))

	200 else:

	201 # Leaf node; dump per-file stats as entries in the treemap

	202 for kind, size in tree['sizes'].iteritems():

	203 child_json = {'name': kind + ' (' + FormatBytes(size) + ')',

	204 'data': { '$area': size }}

	205 css_class = {
	bulach 2014/01/16 15:01:59 nit: may want to put this as a constant at the top nit: may want to put this as a constant at the top rather than inside the loop Andrew Hayden (chromium.org) 2014/01/16 15:13:00 Done. Show quoted text On 2014/01/16 15:01:59, bulach wrote: > nit: may want to put this as a constant at the top rather than inside the loop Done.
	206 '[vtable]': 'vtable',

	207 '[rodata]': 'read-only_data',

	208 '[data]': 'data',

	209 '[bss]': 'bss',

	210 '[code]': 'code',

	211 '[weak]': 'weak_symbol'

	212 }.get(kind)

	213 if css_class is not None: child_json['data']['$symbol'] = css_class

	214 children.append(child_json)

	215 # Sort children by size, largest to smallest.

	216 children.sort(key=lambda child: -child['data']['$area'])

	217

	218 # For leaf nodes, the 'size' attribute is the size of the leaf;

	219 # Non-leaf nodes don't really have a size, but their 'size' attribute is

	220 # the sum of the sizes of all their children.

	221 return {'name': name + ' (' + FormatBytes(tree['size']) + ')',

	222 'data': { '$area': tree['size'] },

	223 'children': children }

	224

	225

	226 def DumpTreemap(symbols, outfile):

	227 dirs = TreeifySymbols(symbols)

	228 out = open(outfile, 'w')

	229 try:

	230 out.write('var kTree = ' + json.dumps(JsonifyTree(dirs, '/')))

	231 finally:

	232 out.flush()

	233 out.close()

	234

	235

	236 def DumpLargestSymbols(symbols, outfile, n):

	237 # a list of (sym, type, size, path); sort by size.

	238 symbols = sorted(symbols, key=lambda x: -x[2])

	239 dumped = 0

	240 out = open(outfile, 'w')

	241 try:

	242 out.write('var largestSymbols = [\n')

	243 for sym, type, size, path in symbols:

	244 if type in ('b', 'w'):

	245 continue # skip bss and weak symbols

	246 if path is None:

	247 path = ''

	248 entry = {'size': FormatBytes(size),

	249 'symbol': sym,

	250 'type': SymbolTypeToHuman(type),

	251 'location': path }

	252 out.write(json.dumps(entry))

	253 out.write(',\n')

	254 dumped += 1

	255 if dumped >= n:

	256 return

	257 finally:

	258 out.write('];\n')

	259 out.flush()

	260 out.close()

	261

	262

	263 def MakeSourceMap(symbols):

	264 sources = {}

	265 for sym, type, size, path in symbols:

	266 key = None

	267 if path:

	268 key = os.path.normpath(path)

	269 else:

	270 key = '[no path]'

	271 if key not in sources:

	272 sources[key] = {'path': path, 'symbol_count': 0, 'size': 0}

	273 record = sources[key]

	274 record['size'] += size

	275 record['symbol_count'] += 1

	276 return sources

	277

	278

	279 def DumpLargestSources(symbols, outfile, n):

	280 map = MakeSourceMap(symbols)

	281 sources = sorted(map.values(), key=lambda x: -x['size'])

	282 dumped = 0

	283 out = open(outfile, 'w')

	284 try:

	285 out.write('var largestSources = [\n')

	286 for record in sources:

	287 entry = {'size': FormatBytes(record['size']),

	288 'symbol_count': str(record['symbol_count']),

	289 'location': record['path']}

	290 out.write(json.dumps(entry))

	291 out.write(',\n')

	292 dumped += 1

	293 if dumped >= n:

	294 return

	295 finally:

	296 out.write('];\n')

	297 out.flush()

	298 out.close()

	299

	300

	301 def DumpLargestVTables(symbols, outfile, n):

	302 vtables = []

	303 for symbol, type, size, path in symbols:

	304 if 'vtable for ' in symbol:

	305 vtables.append({'symbol': symbol, 'path': path, 'size': size})

	306 vtables = sorted(vtables, key=lambda x: -x['size'])

	307 dumped = 0

	308 out = open(outfile, 'w')

	309 try:

	310 out.write('var largestVTables = [\n')

	311 for record in vtables:

	312 entry = {'size': FormatBytes(record['size']),

	313 'symbol': record['symbol'],

	314 'location': record['path']}

	315 out.write(json.dumps(entry))

	316 out.write(',\n')

	317 dumped += 1

	318 if dumped >= n:

	319 return

	320 finally:

	321 out.write('];\n')

	322 out.flush()

	323 out.close()

	324

	325

	326 def RunParallelAddress2Line(outfile, library, arch, jobs, verbose):

	327 """Run a parallel addr2line processing engine to dump and resolve symbols."""

	328 out_dir = os.getenv('CHROMIUM_OUT_DIR', 'out')

	329 build_type = os.getenv('BUILDTYPE', 'Release')

	330 classpath = os.path.join(out_dir, build_type, 'lib.java',

	331 'binary_size_java.jar')

	332 cmd = ['java',

	333 '-classpath', classpath,

	334 'org.chromium.tools.binary_size.ParallelAddress2Line',

	335 '--disambiguate',

	336 '--outfile', outfile,

	337 '--library', library,

	338 '--threads', jobs]

	339 if verbose is True:

	340 cmd.append('--verbose')

	341 prefix = os.path.join('third_party', 'android_tools', 'ndk', 'toolchains')

	342 if arch == 'android-arm':

	343 prefix = os.path.join(prefix, 'arm-linux-androideabi-4.7', 'prebuilt',

	344 'linux-x86_64', 'bin', 'arm-linux-androideabi-')

	345 cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])

	346 elif arch == 'android-mips':

	347 prefix = os.path.join(prefix, 'mipsel-linux-android-4.7', 'prebuilt',

	348 'linux-x86_64', 'bin', 'mipsel-linux-android-')

	349 cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])

	350 elif arch == 'android-x86':

	351 prefix = os.path.join(prefix, 'x86-4.7', 'prebuilt',

	352 'linux-x86_64', 'bin', 'i686-linux-android-')

	353 cmd.extend(['--nm', prefix + 'nm', '--addr2line', prefix + 'addr2line'])

	354 # else, use whatever is in PATH (don't pass --nm or --addr2line)

	355

	356 if verbose:

	357 print cmd

	358

	359 return_code = subprocess.call(cmd)

	360 if return_code:

	361 raise RuntimeError('Failed to run ParallelAddress2Line: returned ' +

	362 str(return_code))

	363

	364

	365 def GetNmSymbols(infile, outfile, library, arch, jobs, verbose):

	366 if infile is None:

	367 if outfile is None:

	368 infile = tempfile.NamedTemporaryFile(delete=False).name

	369 else:

	370 infile = outfile

	371

	372 if verbose:

	373 print 'Running parallel addr2line, dumping symbols to ' + infile;

	374 RunParallelAddress2Line(outfile=infile, library=library, arch=arch,

	375 jobs=jobs, verbose=verbose)

	376 elif verbose:

	377 print 'Using nm input from ' + infile

	378 with file(infile, 'r') as infile:

	379 return list(ParseNm(infile))

	380

	381

	382 def main():

	383 usage="""%prog [options]

	384

	385 Runs a spatial analysis on a given library, looking up the source locations

	386 of its symbols and calculating how much space each directory, source file,

	387 and so on is taking. The result is a report that can be used to pinpoint

	388 sources of large portions of the binary, etceteras.

	389

	390 Under normal circumstances, you only need to pass two arguments, thusly:

	391

	392 %prog --library /path/to/library --destdir /path/to/output

	393

	394 In this mode, the program will dump the symbols from the specified library

	395 and map those symbols back to source locations, producing a web-based

	396 report in the specified output directory.

	397

	398 Other options are available via '--help'.

	399 """

	400 parser = optparse.OptionParser(usage=usage)

	401 parser.add_option('--nm-in', metavar='PATH',

	402 help='if specified, use nm input from <path> instead of '

	403 'generating it. Note that source locations should be '

	404 'present in the file; i.e., no addr2line symbol lookups '

	405 'will be performed when this option is specified. '

	406 'Mutually exclusive with --library.')

	407 parser.add_option('--destdir', metavar='PATH',

	408 help='write output to the specified directory. An HTML '

	409 'report is generated here along with supporting files; '

	410 'any existing report will be overwritten.')

	411 parser.add_option('--library', metavar='PATH',

	412 help='if specified, process symbols in the library at '

	413 'the specified path. Mutually exclusive with --nm-in.')

	414 parser.add_option('--arch',

	415 help='the architecture that the library is targeted to. '

	416 'Determines which nm/addr2line binaries are used. When '

	417 '\'host-native\' is chosen, the program will use whichever '

	418 'nm/addr2line binaries are on the PATH. This is '

	419 'appropriate when you are analyzing a binary by and for '

	420 'your computer. '

	421 'This argument is only valid when using --library. '

	422 'Default is \'host-native\'.',

	423 choices=['host-native', 'android-arm',

	424 'android-mips', 'android-x86'],)

	425 parser.add_option('--jobs',

	426 help='number of jobs to use for the parallel '

	427 'addr2line processing pool; defaults to 1. More '

	428 'jobs greatly improve throughput but eat RAM like '

	429 'popcorn, and take several gigabytes each. Start low '

	430 'and ramp this number up until your machine begins to '

	431 'struggle with RAM. '

	432 'This argument is only valid when using --library.')

	433 parser.add_option('-v', dest='verbose', action='store_true',

	434 help='be verbose, printing lots of status information.')

	435 parser.add_option('--nm-out', metavar='PATH',

	436 help='keep the nm output file, and store it at the '

	437 'specified path. This is useful if you want to see the '

	438 'fully processed nm output after the symbols have been '

	439 'mapped to source locations. By default, a tempfile is '

	440 'used and is deleted when the program terminates.'

	441 'This argument is only valid when using --library.')

	442 opts, args = parser.parse_args()

	443

	444 if ((not opts.library) and (not opts.nm_in)) or (opts.library and opts.nm_in):

	445 parser.error('exactly one of --library or --nm-in is required')

	446 if (opts.nm_in):

	447 if opts.jobs:

	448 print >> sys.stderr, ('WARNING: --jobs has no effect '

	449 'when used with --nm-in')

	450 if opts.arch:

	451 print >> sys.stderr, ('WARNING: --arch has no effect '

	452 'when used with --nm-in')

	453 if not opts.destdir:

	454 parser.error('--destdir is required argument')

	455 if not opts.jobs:

	456 opts.jobs = '1'

	457 if not opts.arch:

	458 opts.arch = 'host-native'

	459

	460 symbols = GetNmSymbols(opts.nm_in, opts.nm_out, opts.library, opts.arch,

	461 opts.jobs, opts.verbose is True)

	462 if not os.path.exists(opts.destdir):

	463 os.makedirs(opts.destdir, 0755)

	464

	465 DumpTreemap(symbols, os.path.join(opts.destdir, 'treemap-dump.js'))

	466 DumpLargestSymbols(symbols,

	467 os.path.join(opts.destdir, 'largest-symbols.js'), 100)

	468 DumpLargestSources(symbols,

	469 os.path.join(opts.destdir, 'largest-sources.js'), 100)

	470 DumpLargestVTables(symbols,

	471 os.path.join(opts.destdir, 'largest-vtables.js'), 100)

	472

	473 # TODO(andrewhayden): Switch to D3 for greater flexibility

	474 treemap_out = os.path.join(opts.destdir, 'webtreemap')

	475 if not os.path.exists(treemap_out):

	476 os.makedirs(treemap_out, 0755)

	477 treemap_src = os.path.join('third_party', 'webtreemap', 'src',

	478 'webtreemap-gh-pages')

	479 shutil.copy(os.path.join(treemap_src, 'COPYING'), treemap_out)

	480 shutil.copy(os.path.join(treemap_src, 'webtreemap.js'), treemap_out)

	481 shutil.copy(os.path.join(treemap_src, 'webtreemap.css'), treemap_out)

	482 shutil.copy(os.path.join('tools', 'binary_size', 'template', 'index.html'),

	483 opts.destdir)

	484 if opts.verbose:

	485 print 'Report saved to ' + opts.destdir + '/index.html'

	486

	487

	488 if __name__ == '__main__':

	489 sys.exit(main())

OLD	NEW

« tools/binary_size/README.txt ('K') | « tools/binary_size/java/src/org/chromium/tools/binary_size/Record.java ('k') | tools/binary_size/template/.gitignore » ('j') | no next file with comments »