Index: tools/binary_size/libsupersize/archive.py
diff --git a/tools/binary_size/libsupersize/archive.py b/tools/binary_size/libsupersize/archive.py
index 13441d070c3190038cde06df0a087843acc67a2f..68b1cf5fbfe447067f51aa70aebdda54c05f0893 100644
--- a/tools/binary_size/libsupersize/archive.py
+++ b/tools/binary_size/libsupersize/archive.py
@@ -18,13 +18,14 @@ import sys
 import tempfile
 import zipfile
 
+import concurrent
 import describe
 import file_format
 import function_signature
-import helpers
 import linker_map_parser
 import models
 import ninja_parser
+import nm
 import paths
@@ -127,21 +128,18 @@ def _NormalizeNames(symbols):
   logging.debug('Found name prefixes of: %r', found_prefixes)
 
 
-def _NormalizeObjectPaths(symbols):
-  """Ensures that all paths are formatted in a useful way."""
-  for symbol in symbols:
-    path = symbol.object_path
-    if path.startswith('obj/'):
-      # Convert obj/third_party/... -> third_party/...
-      path = path[4:]
-    elif path.startswith('../../'):
-      # Convert ../../third_party/... -> third_party/...
-      path = path[6:]
-    if path.endswith(')'):
-      # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o
-      start_idx = path.index('(')
-      path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
-    symbol.object_path = path
+def _NormalizeObjectPath(path):
+  if path.startswith('obj/'):
+    # Convert obj/third_party/... -> third_party/...
+    path = path[4:]
+  elif path.startswith('../../'):
+    # Convert ../../third_party/... -> third_party/...
+    path = path[6:]
+  if path.endswith(')'):
+    # Convert foo/bar.a(baz.o) -> foo/bar.a/baz.o
+    start_idx = path.index('(')
+    path = os.path.join(path[:start_idx], path[start_idx + 1:-1])
+  return path
 
 
 def _NormalizeSourcePath(path):
@@ -154,19 +152,111 @@ def _NormalizeSourcePath(path):
   return path
 
 
+def _SourcePathForObjectPath(object_path, source_mapper):
+  # We don't have source info for prebuilt .a files.
+  if not os.path.isabs(object_path) and not object_path.startswith('..'):
+    source_path = source_mapper.FindSourceForPath(object_path)
+    if source_path:
+      return _NormalizeSourcePath(source_path)
+  return ''
+
+
 def _ExtractSourcePaths(symbols, source_mapper):
-  """Fills in the .source_path attribute of all symbols."""
+  """Fills in the |source_path| attribute."""
   logging.debug('Parsed %d .ninja files.', source_mapper.parsed_file_count)
-
   for symbol in symbols:
     object_path = symbol.object_path
-    if symbol.source_path or not object_path:
+    if object_path and not symbol.source_path:
+      symbol.source_path = _SourcePathForObjectPath(object_path, source_mapper)
+
+
+def _ComputeAncestorPath(path_list):
+  """Returns the common ancestor of the given paths."""
+  # Ignore missing paths.
+  path_list = [p for p in path_list if p]
+  prefix = os.path.commonprefix(path_list)
+  # Put the path count as a subdirectory to allow for better grouping when
+  # viewing path-based breakdowns.
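+  # E.g.: ['a/b/c.o', 'a/b/d.o'] -> 'a/b/{shared}/2'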
+  if not prefix:
+    if len(path_list) < 2:
+      return ''
+    return os.path.join('{shared}', str(len(path_list)))
+  if prefix == path_list[0]:
+    return prefix
+  assert len(path_list) > 1, 'path_list: ' + repr(path_list)
+  return os.path.join(os.path.dirname(prefix), '{shared}', str(len(path_list)))
+
+
+# This must normalize object paths at the same time because normalization
+# needs to occur before finding the common ancestor.
+def _ComputeAncestorPathsAndNormalizeObjectPaths(
+    symbols, object_paths_by_name, source_mapper):
+  num_found_paths = 0
+  num_unknown_names = 0
+  num_path_mismatches = 0
+  num_unmatched_aliases = 0
+  for symbol in symbols:
+    name = symbol.name
+    if (symbol.IsBss() or
+        not name or
+        name[0] in '*.' or  # e.g. ** merge symbols, .Lswitch.table
+        name == 'startup'):
+      symbol.object_path = _NormalizeObjectPath(symbol.object_path)
       continue
-    # We don't have source info for prebuilt .a files.
-    if not os.path.isabs(object_path) and not object_path.startswith('..'):
-      source_path = source_mapper.FindSourceForPath(object_path)
-      if source_path:
-        symbol.source_path = _NormalizeSourcePath(source_path)
+
+    object_paths = object_paths_by_name.get(name)
+    if object_paths:
+      num_found_paths += 1
+    else:
+      if not symbol.object_path and symbol.aliases:
+        # Happens when aliases are from object files where all symbols were
+        # pruned or de-duped as aliases. Since we are only scanning .o files
+        # referenced by included symbols, such files are missed.
+        # TODO(agrieve): This could be fixed by retrieving linker inputs from
+        #     build.ninja, or by looking for paths within the .map file's
+        #     discarded sections.
+        num_unmatched_aliases += 1
+        continue
+      if num_unknown_names < 10:
+        logging.warning('Symbol not found in any .o files: %r', symbol)
+      num_unknown_names += 1
+      symbol.object_path = _NormalizeObjectPath(symbol.object_path)
+      continue
+
+    if symbol.object_path and symbol.object_path not in object_paths:
+      if num_path_mismatches < 10:
+        logging.warning('Symbol path reported by .map not found by nm.')
+        logging.warning('sym=%r', symbol)
+        logging.warning('paths=%r', object_paths)
+      num_path_mismatches += 1
+
+    if source_mapper:
+      source_paths = [
+          _SourcePathForObjectPath(p, source_mapper) for p in object_paths]
+      symbol.source_path = _ComputeAncestorPath(source_paths)
+
+    object_paths = [_NormalizeObjectPath(p) for p in object_paths]
+    symbol.object_path = _ComputeAncestorPath(object_paths)
+
+  logging.debug('Cross-referenced %d symbols with nm output. '
+                'num_unknown_names=%d num_path_mismatches=%d '
+                'num_unmatched_aliases=%d', num_found_paths, num_unknown_names,
+                num_path_mismatches, num_unmatched_aliases)
+
+
+def _DiscoverMissedObjectPaths(symbols, elf_object_paths):
+  # Missing object paths are caused by .a files added by -l flags, which are
+  # not listed as explicit inputs within .ninja rules.
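+  # E.g.: a library linked via "-lfoo" shows up in the .map file but not in
+  # any .ninja build statement.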
+  parsed_inputs = set(elf_object_paths)
+  missed_inputs = set()
+  for symbol in symbols:
+    path = symbol.object_path
+    if path.endswith(')'):
+      # Convert foo/bar.a(baz.o) -> foo/bar.a
+      path = path[:path.index('(')]
+    if path and path not in parsed_inputs:
+      missed_inputs.add(path)
+  return missed_inputs
 
 
 def _CalculatePadding(symbols):
@@ -184,29 +274,27 @@ def _CalculatePadding(symbols):
       continue
     if symbol.address <= 0 or prev_symbol.address <= 0:
       continue
-    # Padding-only symbols happen for ** symbol gaps.
-    prev_is_padding_only = prev_symbol.size_without_padding == 0
-    if symbol.address == prev_symbol.address and not prev_is_padding_only:
-      assert False, 'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol)
-    # Even with symbols at the same address removed, overlaps can still
-    # happen. In this case, padding will be negative (and this is fine).
+
+    if symbol.address == prev_symbol.address:
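+      # Aliases share a single address; propagate the padding/size computed
+      # for the first alias in the group.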
+      if symbol.aliases and symbol.aliases is prev_symbol.aliases:
+        symbol.padding = prev_symbol.padding
+        symbol.size = prev_symbol.size
+        continue
+      # Padding-only symbols happen for ** symbol gaps.
+      assert prev_symbol.size_without_padding == 0, (
+          'Found duplicate symbols:\n%r\n%r' % (prev_symbol, symbol))
+
     padding = symbol.address - prev_symbol.end_address
-    # These thresholds were found by manually auditing arm32 Chrome.
-    # E.g.: Set them to 0 and see what warnings get logged.
+    # These thresholds were found by experimenting with arm32 Chrome.
+    # E.g.: Set them to 0, see what warnings get logged, then take the max
+    # value.
     # TODO(agrieve): See if these thresholds make sense for architectures
     # other than arm32.
     if not symbol.name.startswith('*') and (
         symbol.section in 'rd' and padding >= 256 or
         symbol.section in 't' and padding >= 64):
-      # For nm data, this is caused by data that has no associated symbol.
-      # The linker map file lists them with no name, but with a file.
-      # Example:
-      #   .data 0x02d42764 0x120 .../V8SharedWorkerGlobalScope.o
-      # Where as most look like:
-      #   .data.MANGLED_NAME...
-      logging.debug('Large padding of %d between:\n  A) %r\n  B) %r' % (
-          padding, prev_symbol, symbol))
-      continue
+      # Should not happen.
+      logging.warning('Large padding of %d between:\n  A) %r\n  B) %r' % (
+          padding, prev_symbol, symbol))
     symbol.padding = padding
     symbol.size += padding
     assert symbol.size >= 0, (
@@ -317,6 +405,51 @@ def _ClusterSymbols(symbols):
   return grouped_symbols
 
 
+def _AddSymbolAliases(symbols, aliases_by_address):
+  # Step 1: Create a list of (index_of_symbol, name_list).
+  logging.debug('Creating alias list')
+  replacements = []
+  num_new_symbols = 0
+  for i, s in enumerate(symbols):
+    # Don't alias padding-only symbols (e.g. ** symbol gap).
+    if s.size_without_padding == 0:
+      continue
+    name_list = aliases_by_address.get(s.address)
+    if name_list:
+      if s.name not in name_list:
+        logging.warning('Name missing from aliases: %s %s', s.name, name_list)
+        continue
+      replacements.append((i, name_list))
+      num_new_symbols += len(name_list) - 1
+
+  # Step 2: Create new symbols as siblings to each existing one.
+  logging.debug('Creating %d aliases', num_new_symbols)
+  src_cursor_end = len(symbols)
+  symbols += [None] * num_new_symbols
+  dst_cursor_end = len(symbols)
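+  # Process |replacements| in reverse order so that each shift of trailing
+  # symbols toward the end of the enlarged list never overwrites entries that
+  # have yet to be copied.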
+  for src_index, name_list in reversed(replacements):
+    # Copy over symbols that come after the current one.
+    chunk_size = src_cursor_end - src_index - 1
+    dst_cursor_end -= chunk_size
+    src_cursor_end -= chunk_size
+    symbols[dst_cursor_end:dst_cursor_end + chunk_size] = (
+        symbols[src_cursor_end:src_cursor_end + chunk_size])
+    sym = symbols[src_index]
+    src_cursor_end -= 1
+
+    # Create aliases (does not bother reusing the existing symbol).
+    aliases = [None] * len(name_list)
+    for i, name in enumerate(name_list):
+      aliases[i] = models.Symbol(
+          sym.section_name, sym.size, address=sym.address, name=name,
+          aliases=aliases)
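+    # Note: Aliases in a group all share the same |aliases| list instance,
+    # which is how _CalculatePadding() recognizes them as one group.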
+
+    dst_cursor_end -= len(aliases)
+    symbols[dst_cursor_end:dst_cursor_end + len(aliases)] = aliases
+
+  assert dst_cursor_end == src_cursor_end
+
+
 def LoadAndPostProcessSizeInfo(path):
   """Returns a SizeInfo for the given |path|."""
   logging.debug('Loading results from: %s', path)
@@ -336,24 +469,102 @@ def _PostProcessSizeInfo(size_info):
   logging.info('Processed %d symbols', len(size_info.raw_symbols))
-def CreateSizeInfo(map_path, lazy_paths=None, no_source_paths=False,
+def CreateMetadata(map_path, elf_path, apk_path, tool_prefix, output_directory):
+  metadata = None
+  if elf_path:
+    logging.debug('Constructing metadata')
+    git_rev = _DetectGitRevision(os.path.dirname(elf_path))
+    architecture = _ArchFromElf(elf_path, tool_prefix)
+    build_id = BuildIdFromElf(elf_path, tool_prefix)
+    timestamp_obj = datetime.datetime.utcfromtimestamp(os.path.getmtime(
+        elf_path))
+    timestamp = calendar.timegm(timestamp_obj.timetuple())
+
+    metadata = {
+        models.METADATA_GIT_REVISION: git_rev,
+        models.METADATA_ELF_ARCHITECTURE: architecture,
+        models.METADATA_ELF_MTIME: timestamp,
+        models.METADATA_ELF_BUILD_ID: build_id,
+    }
+
+    if output_directory:
+      relative_to_out = lambda path: os.path.relpath(path, output_directory)
+      gn_args = _ParseGnArgs(os.path.join(output_directory, 'args.gn'))
+      metadata[models.METADATA_MAP_FILENAME] = relative_to_out(map_path)
+      metadata[models.METADATA_ELF_FILENAME] = relative_to_out(elf_path)
+      metadata[models.METADATA_GN_ARGS] = gn_args
+
+      if apk_path:
+        metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path)
+  return metadata
+
+
+def CreateSizeInfo(map_path, elf_path, tool_prefix, output_directory,
                    raw_only=False):
-  """Creates a SizeInfo from the given map file."""
-  # tool_prefix needed for c++filt.
-  lazy_paths.VerifyToolPrefix()
+  """Creates a SizeInfo.
+
+  Args:
+    map_path: Path to the linker .map(.gz) file to parse.
+    elf_path: Path to the corresponding unstripped ELF file. Used to find
+        symbol aliases and inlined functions. Can be None.
+    tool_prefix: Prefix for c++filt & nm (required).
+    output_directory: Build output directory. If None, source_paths and symbol
+        alias information will not be recorded.
+    raw_only: Fill in just the information required for creating a .size file.
+  """
+  source_mapper = None
+  if output_directory:
+    # Start by finding the elf_object_paths, so that nm can run on them while
+    # the linker .map is being parsed.
+    logging.info('Parsing ninja files.')
+    source_mapper, elf_object_paths = ninja_parser.Parse(
+        output_directory, elf_path)
+    assert not elf_path or elf_object_paths, (
+        'Failed to find link command in ninja files for ' +
+        os.path.relpath(elf_path, output_directory))
 
-  if not no_source_paths:
-    # Parse .ninja files at the same time as parsing the .map file.
-    source_mapper_result = helpers.ForkAndCall(
-        ninja_parser.Parse, lazy_paths.VerifyOutputDirectory())
+  if elf_path:
+    # Run nm on the elf file to retrieve the list of symbol names per-address.
+    # This list is required because the .map file contains only a single name
+    # for each address, yet multiple symbols are often coalesced when they are
+    # identical. This coalescing happens mainly for small symbols and for C++
+    # templates. Such symbols make up ~500kb of libchrome.so on Android.
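+    # E.g.: identical instantiations of a small template are commonly folded
+    # to one address, leaving a single arbitrary name in the .map file.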
+    elf_nm_result = nm.CollectAliasesByAddressAsync(elf_path, tool_prefix)
+
+    # Run nm on all .o/.a files to retrieve the symbol names within them.
+    # The list is used to detect when multiple .o files contain the same
+    # symbol (e.g. inline functions), and to update the object_path /
+    # source_path fields accordingly.
+    # Looking in object files is required because the .map file chooses a
+    # single path for these symbols.
+    # Rather than record all paths for each symbol, set the paths to be the
+    # common ancestor of all paths.
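+    # E.g.: a symbol found in both 'a/b/x.o' and 'a/b/y.o' is assigned the
+    # object_path 'a/b/{shared}/2'.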
+    if output_directory:
+      bulk_analyzer = nm.BulkObjectFileAnalyzer(tool_prefix, output_directory)
+      bulk_analyzer.AnalyzePaths(elf_object_paths)
 
   with _OpenMaybeGz(map_path) as map_file:
     section_sizes, raw_symbols = (
         linker_map_parser.MapFileParser().Parse(map_file))
 
-  if not no_source_paths:
-    logging.info('Extracting source paths from .ninja files')
-    source_mapper = source_mapper_result.get()
+  if elf_path:
+    logging.debug('Validating section sizes')
+    elf_section_sizes = _SectionSizesFromElf(elf_path, tool_prefix)
+    for k, v in elf_section_sizes.iteritems():
+      if v != section_sizes.get(k):
+        logging.error('ELF file and .map file do not agree on section sizes.')
+        logging.error('.map file: %r', section_sizes)
+        logging.error('readelf: %r', elf_section_sizes)
+        sys.exit(1)
+
+  if elf_path and output_directory:
+    missed_object_paths = _DiscoverMissedObjectPaths(
+        raw_symbols, elf_object_paths)
+    bulk_analyzer.AnalyzePaths(missed_object_paths)
+    bulk_analyzer.Close()
+
+  if source_mapper:
+    logging.info('Looking up source paths from ninja files')
     _ExtractSourcePaths(raw_symbols, source_mapper)
     assert source_mapper.unmatched_paths_count == 0, (
         'One or more source file paths could not be found. Likely caused by '
@@ -363,9 +574,31 @@ def CreateSizeInfo(map_path, lazy_paths=None, no_source_paths=False,
   _StripLinkerAddedSymbolPrefixes(raw_symbols)
   # Map file for some reason doesn't unmangle all names.
   # Unmangle prints its own log statement.
-  _UnmangleRemainingSymbols(raw_symbols, lazy_paths.tool_prefix)
-  logging.info('Normalizing object paths')
-  _NormalizeObjectPaths(raw_symbols)
+  _UnmangleRemainingSymbols(raw_symbols, tool_prefix)
+
+  if elf_path:
+    logging.info('Adding aliased symbols, as reported by nm')
+    # This normally does not block (it's finished by this time).
+    aliases_by_address = elf_nm_result.get()
+    _AddSymbolAliases(raw_symbols, aliases_by_address)
+
+    if output_directory:
+      # For aliases, this provides path information where there wasn't any.
+      logging.info('Computing ancestor paths for inline functions and '
+                   'normalizing object paths')
+
+      object_paths_by_name = bulk_analyzer.Get()
+      logging.debug('Fetched path information for %d symbols from %d files',
+                    len(object_paths_by_name),
+                    len(elf_object_paths) + len(missed_object_paths))
+      _ComputeAncestorPathsAndNormalizeObjectPaths(
+          raw_symbols, object_paths_by_name, source_mapper)
+
+  if not elf_path or not output_directory:
+    logging.info('Normalizing object paths.')
+    for symbol in raw_symbols:
+      symbol.object_path = _NormalizeObjectPath(symbol.object_path)
+
   size_info = models.SizeInfo(section_sizes, raw_symbols)
   # Name normalization not strictly required, but makes for smaller files.
@@ -488,7 +721,7 @@ def Run(args, parser):
       # secondary architectures.
       apk_so_path = max(lib_infos, key=lambda x:x.file_size).filename
       logging.debug('Sub-apk path=%s', apk_so_path)
-      if not elf_path:
+      if not elf_path and lazy_paths.output_directory:
         elf_path = os.path.join(
             lazy_paths.output_directory, 'lib.unstripped',
             os.path.basename(apk_so_path.replace('crazy.', '')))
@@ -504,56 +737,34 @@ def Run(args, parser): |
if not os.path.exists(map_path): |
parser.error('Could not find .map(.gz)? file. Use --map-file.') |
- metadata = None |
- if elf_path: |
- logging.debug('Constructing metadata') |
- git_rev = _DetectGitRevision(os.path.dirname(elf_path)) |
- architecture = _ArchFromElf(elf_path, lazy_paths.tool_prefix) |
- build_id = BuildIdFromElf(elf_path, lazy_paths.tool_prefix) |
- timestamp_obj = datetime.datetime.utcfromtimestamp(os.path.getmtime( |
- elf_path)) |
- timestamp = calendar.timegm(timestamp_obj.timetuple()) |
- gn_args = _ParseGnArgs(os.path.join(lazy_paths.output_directory, 'args.gn')) |
- |
- def relative_to_out(path): |
- return os.path.relpath(path, lazy_paths.VerifyOutputDirectory()) |
- |
- metadata = { |
- models.METADATA_GIT_REVISION: git_rev, |
- models.METADATA_MAP_FILENAME: relative_to_out(map_path), |
- models.METADATA_ELF_ARCHITECTURE: architecture, |
- models.METADATA_ELF_FILENAME: relative_to_out(elf_path), |
- models.METADATA_ELF_MTIME: timestamp, |
- models.METADATA_ELF_BUILD_ID: build_id, |
- models.METADATA_GN_ARGS: gn_args, |
- } |
+ tool_prefix = lazy_paths.VerifyToolPrefix() |
+ output_directory = None |
+ if not args.no_source_paths: |
+ output_directory = lazy_paths.VerifyOutputDirectory() |
- if apk_path: |
- metadata[models.METADATA_APK_FILENAME] = relative_to_out(apk_path) |
- # Extraction takes around 1 second, so do it in parallel. |
- apk_elf_result = helpers.ForkAndCall( |
- _ElfInfoFromApk, apk_path, apk_so_path, lazy_paths.tool_prefix) |
+ metadata = CreateMetadata(map_path, elf_path, apk_path, tool_prefix, |
+ output_directory) |
+ if apk_path and elf_path: |
+ # Extraction takes around 1 second, so do it in parallel. |
+ apk_elf_result = concurrent.ForkAndCall( |
+ _ElfInfoFromApk, (apk_path, apk_so_path, tool_prefix)) |
size_info = CreateSizeInfo( |
- map_path, lazy_paths, no_source_paths=args.no_source_paths, raw_only=True) |
+ map_path, elf_path, tool_prefix, output_directory, raw_only=True) |
if metadata: |
size_info.metadata = metadata |
- logging.debug('Validating section sizes') |
- elf_section_sizes = _SectionSizesFromElf(elf_path, lazy_paths.tool_prefix) |
- for k, v in elf_section_sizes.iteritems(): |
- assert v == size_info.section_sizes.get(k), ( |
- 'ELF file and .map file do not match.') |
if apk_path: |
logging.debug('Extracting section sizes from .so within .apk') |
unstripped_section_sizes = size_info.section_sizes |
apk_build_id, size_info.section_sizes = apk_elf_result.get() |
- assert apk_build_id == build_id, ( |
+ assert apk_build_id == metadata[models.METADATA_ELF_BUILD_ID], ( |
'BuildID for %s within %s did not match the one at %s' % |
(apk_so_path, apk_path, elf_path)) |
packed_section_name = None |
+ architecture = metadata[models.METADATA_ELF_ARCHITECTURE] |
if architecture == 'ARM': |
packed_section_name = '.rel.dyn' |
elif architecture == 'AArch64': |