Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(323)

Side by Side Diff: tools/cygprofile/symbolize.py

Issue 874683004: Refactor the symbolize step of the orderfile generation. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Naming and typos. Created 5 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | tools/cygprofile/symbolize_unittest.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/python 1 #!/usr/bin/python
2 # Copyright 2013 The Chromium Authors. All rights reserved. 2 # Copyright 2013 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 """Symbolize log file produced by cypgofile instrumentation. 6 """Symbolize log file produced by cypgofile instrumentation.
pasko 2015/01/30 13:34:48 nit: cypgofile instrumentation (mergetraces.py)
Benoit L 2015/01/30 16:22:38 Done.
7 7
8 Given a log file and the binary being profiled (e.g. executable, shared 8 Given a log file and the binary being profiled, create an order file.
pasko 2015/01/30 13:34:48 it's a misleading name for the file, since symboli
Benoit L 2015/01/30 16:22:38 Done.
9 library), the script can produce three different outputs: 1) symbols for the
10 addresses, 2) function and line numbers for the addresses, or 3) an order file.
11 """ 9 """
12 10
11 import logging
12 import multiprocessing
13 import optparse 13 import optparse
14 import os 14 import os
15 import string 15 import string
16 import subprocess
17 import sys 16 import sys
18 17
18 import symbol_extractor
19 19
20 def ParseLogLines(log_file_lines): 20
21 """Parse a log file produced by the profiled run of clank. 21 def _ParseLogLines(log_file_lines):
22 """Parse a log file produced by the profiled run of chrome.
pasko 2015/01/30 13:34:48 nit: maybe enhance the comment with: (i.e. "merge
Benoit L 2015/01/30 16:22:38 Done.
22 23
23 Args: 24 Args:
24 log_file_lines: array of lines in log file produced by profiled run 25 log_file_lines: array of lines in log file produced by profiled run
25 lib_name: library or executable containing symbols 26 lib_name: library or executable containing symbols
26 27
27 Below is an example of a small log file: 28 Below is an example of a small log file:
28 5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so 29 5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so
29 secs usecs pid:threadid func 30 secs usecs pid:threadid func
30 START 31 START
31 1314897086 795828 3587:1074648168 0x509e105c 32 1314897086 795828 3587:1074648168 0x509e105c
32 1314897086 795874 3587:1074648168 0x509e0eb4 33 1314897086 795874 3587:1074648168 0x509e0eb4
33 1314897086 796326 3587:1074648168 0x509e0e3c 34 1314897086 796326 3587:1074648168 0x509e0e3c
34 1314897086 796552 3587:1074648168 0x509e07bc 35 1314897086 796552 3587:1074648168 0x509e07bc
35 END 36 END
36 37
37 Returns: 38 Returns:
38 call_info list with list of tuples of the format (sec, usec, call id, 39 call_info list with list of tuples of the format (sec, usec, call id,
pasko 2015/01/30 13:34:48 s/call_info list with//
Benoit L 2015/01/30 16:22:38 Done.
39 function address called) 40 function address called)
40 """ 41 """
41 call_lines = [] 42 call_lines = []
42 vm_start = 0 43 vm_start = 0
43 line = log_file_lines[0] 44 line = log_file_lines[0]
44 assert("r-xp" in line) 45 assert "r-xp" in line
pasko 2015/01/30 13:34:48 s/"/'/
Benoit L 2015/01/30 16:22:38 Done.
45 end_index = line.find('-') 46 end_index = line.find('-')
46 vm_start = int(line[:end_index], 16) 47 vm_start = int(line[:end_index], 16)
47 for line in log_file_lines[2:]: 48 for line in log_file_lines[2:]:
48 # print hex(vm_start)
49 fields = line.split() 49 fields = line.split()
50 if len(fields) == 4: 50 if len(fields) == 4:
51 call_lines.append(fields) 51 call_lines.append(fields)
52 52
53 # Convert strings to int in fields. 53 # Convert strings to int in fields.
54 call_info = [] 54 call_info = []
55 for call_line in call_lines: 55 for call_line in call_lines:
56 (sec_timestamp, usec_timestamp) = map(int, call_line[0:2]) 56 (sec_timestamp, usec_timestamp) = map(int, call_line[0:2])
57 callee_id = call_line[2] 57 callee_id = call_line[2]
58 addr = int(call_line[3], 16) 58 addr = int(call_line[3], 16)
59 if vm_start < addr: 59 if vm_start < addr:
60 addr -= vm_start 60 addr -= vm_start
61 call_info.append((sec_timestamp, usec_timestamp, callee_id, addr)) 61 call_info.append((sec_timestamp, usec_timestamp, callee_id, addr))
62 62
63 return call_info 63 return call_info
64 64
65 def GetStdOutputLines(cmd):
66 p = subprocess.Popen(cmd, stdout=subprocess.PIPE)
67 output = p.communicate()[0]
68 return output.split('\n')
69 65
70 def ParseLibSymbols(lib_file): 66 def _GroupLibrarySymbolInfosByOffset(lib_filename):
71 """Get output from running nm and greping for text symbols. 67 """Return a dict {offset: [SymbolInfo]} from a library."""
68 symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename)
69 return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos)
72 70
73 Args:
74 lib_file: the library or executable that contains the profiled code
75
76 Returns:
77 list of sorted unique addresses and corresponding size of function symbols
78 in lib_file and map of addresses to all symbols at a particular address
79 """
80 cmd = ['nm', '-S', '-n', lib_file]
81 nm_lines = GetStdOutputLines(cmd)
82
83 nm_symbols = []
84 for nm_line in nm_lines:
85 if any(str in nm_line for str in (' t ', ' W ', ' T ')):
86 nm_symbols.append(nm_line)
87
88 nm_index = 0
89 unique_addrs = []
90 address_map = {}
91 while nm_index < len(nm_symbols):
92
93 # If the length of the split line is not 4, then it does not contain all the
94 # information needed to symbolize (i.e. address, size and symbol name).
95 if len(nm_symbols[nm_index].split()) == 4:
96 (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]]
97
98 # Multiple symbols may be at the same address. This is do to aliasing
99 # done by the compiler. Since there is no way to be sure which one was
100 # called in profiled run, we will symbolize to include all symbol names at
101 # a particular address.
102 fnames = []
103 while (nm_index < len(nm_symbols) and
104 addr == int(nm_symbols[nm_index].split()[0], 16)):
105 if len(nm_symbols[nm_index].split()) == 4:
106 fnames.append(nm_symbols[nm_index].split()[3])
107 nm_index += 1
108 address_map[addr] = fnames
109 unique_addrs.append((addr, size))
110 else:
111 nm_index += 1
112
113 return (unique_addrs, address_map)
114 71
115 class SymbolNotFoundException(Exception): 72 class SymbolNotFoundException(Exception):
116 def __init__(self,value): 73 def __init__(self, value):
117 super(SymbolNotFoundException,self).__init__(value) 74 super(SymbolNotFoundException, self).__init__(value)
118 self.value = value 75 self.value = value
76
119 def __str__(self): 77 def __str__(self):
120 return repr(self.value) 78 return repr(self.value)
121 79
122 def BinarySearchAddresses(addr, start, end, arr):
123 """Find starting address of a symbol at a particular address.
124 80
125 The reason we can not directly use the address provided by the log file is 81 def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset):
126 that the log file may give an address after the start of the symbol. The 82 """Find all SymbolInfo at a given offset.
127 logged address is often one byte after the start. By using this search
128 function rather than just subtracting one from the logged address allows
129 the logging instrumentation to log any address in a function.
130 83
131 Args: 84 Args:
132 addr: the address being searched for 85 offset_to_symbol_infos: {offset: [SymbolInfo]}
133 start: the starting index for the binary search 86 offset: offset to look the symbols at
134 end: the ending index for the binary search
135 arr: the list being searched containing tuple of address and size
136 87
137 Returns: 88 Returns:
138 the starting address of the symbol at address addr 89 The list of SymbolInfo at the given offset
139 90
140 Raises: 91 Raises:
141 Exception: if address not found. Functions expects all logged addresses 92 SymbolNotFoundException if the offset doesn't match any symbol.
142 to be found
143 """ 93 """
144 # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end) 94 if offset in offset_to_symbol_infos:
145 if start >= end or start == end - 1: 95 return offset_to_symbol_infos[offset]
146 # arr[i] is a tuple of address and size. Check if addr inside range 96 elif offset % 2 and (offset - 1) in offset_to_symbol_infos:
147 if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]: 97 # On ARM, odd addresses are used to signal thumb instruction. They are
148 return arr[start][0] 98 # generated by setting the LSB to 1 (see
149 elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]: 99 # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfj hia.html).
150 return arr[end][0] 100 # TODO(lizeb): Make sure this hack doesn't propagate to other archs.
151 else: 101 return offset_to_symbol_infos[offset - 1]
152 raise SymbolNotFoundException(addr)
153 else: 102 else:
154 halfway = (start + end) / 2 103 raise SymbolNotFoundException(offset)
pasko 2015/01/30 13:34:48 If we raise an exception like this, it will be ign
Benoit L 2015/01/30 16:22:38 Done.
155 (nm_addr, size) = arr[halfway]
156 # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway)
157 if addr >= nm_addr and addr < nm_addr + size:
158 return nm_addr
159 elif addr < nm_addr:
160 return BinarySearchAddresses(addr, start, halfway-1, arr)
161 else:
162 # Condition (addr >= nm_addr + size) must be true.
163 return BinarySearchAddresses(addr, halfway+1, end, arr)
164 104
165 105
166 def FindFunctions(addr, unique_addrs, address_map): 106 class WarningCollector(object):
167 """Find function symbol names at address addr.""" 107 """Collect warnings, but limit the number printed to a set value."""
168 return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1, 108 def __init__(self, max_warnings):
169 unique_addrs)] 109 self._warnings = 0
110 self._max_warnings = max_warnings
111
112 def Write(self, message):
113 if self._warnings < self._max_warnings:
114 logging.warning(message)
115 self._warnings += 1
116
117 def WriteEnd(self, message):
118 if self._warnings > self._max_warnings:
119 logging.warning('%d more warnings for %s' % (
120 self._warnings - self._max_warnings, message))
170 121
171 122
172 def AddrToLine(addr, lib_file): 123 def _GetObjectFileNames(obj_dir):
173 """Use addr2line to determine line info of a particular address.""" 124 """Return the list of object files in a directory."""
174 cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)]
175 output = GetStdOutputLines(cmd)
176 assert(len(output) == 2)
177 return ':'.join(output)
178
179 def GetObjectFileNames(obj_dir):
180 """ Gets the list of object files in the output directory. """
181 obj_files = [] 125 obj_files = []
182 for (dirpath, _, filenames) in os.walk(obj_dir): 126 for (dirpath, _, filenames) in os.walk(obj_dir):
183 for file_name in filenames: 127 for file_name in filenames:
184 if file_name.endswith('.o'): 128 if file_name.endswith('.o'):
185 obj_files.append(os.path.join(dirpath, file_name)) 129 obj_files.append(os.path.join(dirpath, file_name))
186 return obj_files 130 return obj_files
187 131
188 class WarningCollector(object):
189 def __init__(self, max_warnings):
190 self._warnings = 0
191 self._max_warnings = max_warnings
192 132
193 def Write(self, message): 133 def _AllSymbolInfos(object_filenames):
194 if self._warnings < self._max_warnings: 134 """Return a list of SymbolInfo from an iterable of filenames."""
195 sys.stderr.write(message + '\n') 135 pool = multiprocessing.Pool()
196 self._warnings += 1 136 symbol_infos_nested = pool.map(
137 symbol_extractor.SymbolInfosFromBinary, object_filenames)
138 result = []
139 for symbol_infos in symbol_infos_nested:
140 result += symbol_infos
141 return result
197 142
198 def WriteEnd(self, message):
199 if self._warnings > self._max_warnings:
200 sys.stderr.write(str(self._warnings - self._max_warnings) +
201 ' more warnings for: ' + message + '\n')
202 143
203 def SymbolToSection(obj_dir): 144 def _GetSymbolToSectionMapFromObjectFiles(obj_dir):
204 """ Gets a mapping from symbol to linker section name by scanning all 145 """ Create a mapping from symbol to linker section name by scanning all
205 of the object files. """ 146 the object files.
206 object_files = GetObjectFileNames(obj_dir) 147 """
148 object_files = _GetObjectFileNames(obj_dir)
207 symbol_to_section_map = {} 149 symbol_to_section_map = {}
208 symbol_warnings = WarningCollector(300) 150 symbol_warnings = WarningCollector(300)
209 for obj_file in object_files: 151 symbol_infos = _AllSymbolInfos(object_files)
210 cmd = ['objdump', '-w', '-t', obj_file] 152 symbol_infos = [s for s in symbol_infos if not s.name.startswith('.LTHUNK')]
211 symbol_lines = GetStdOutputLines(cmd) 153 for symbol_info in symbol_infos:
212 for symbol_line in symbol_lines: 154 symbol = symbol_info.name
213 items = symbol_line.split() 155 section = symbol_info.section
214 # All of the symbol lines we care about are in the form 156 if ((symbol in symbol_to_section_map) and
215 # 0000000000 g F .text.foo 000000000 [.hidden] foo 157 (symbol_to_section_map[symbol] != symbol_info.section)):
216 # where g (global) might also be l (local) or w (weak). 158 symbol_warnings.Write('Symbol ' + symbol +
217 if len(items) > 4 and items[2] == 'F': 159 ' in conflicting sections ' + section +
218 # This symbol is a function 160 ' and ' + symbol_to_section_map[symbol])
219 symbol = items[len(items) - 1] 161 elif not section.startswith('.text'):
220 if symbol.startswith('.LTHUNK'): 162 symbol_warnings.Write('Symbol ' + symbol +
221 continue 163 ' in incorrect section ' + section)
222 section = items[3] 164 else:
223 if ((symbol in symbol_to_section_map) and 165 symbol_to_section_map[symbol] = section
224 (symbol_to_section_map[symbol] != section)):
225 symbol_warnings.Write('WARNING: Symbol ' + symbol +
226 ' in conflicting sections ' + section +
227 ' and ' + symbol_to_section_map[symbol])
228 elif not section.startswith('.text.'):
229 symbol_warnings.Write('WARNING: Symbol ' + symbol +
230 ' in incorrect section ' + section)
231 else:
232 symbol_to_section_map[symbol] = section
233 symbol_warnings.WriteEnd('bad sections') 166 symbol_warnings.WriteEnd('bad sections')
234 return symbol_to_section_map 167 return symbol_to_section_map
235 168
169
170 def _WarnAboutDuplicates(offsets):
171 """Warn about duplicate offsets.
172
173 Args:
174 offsets: list of offsets to check for duplicates
175
176 Returns:
177 True if there are no duplicates, False otherwise.
178 """
179 seen_offsets = set()
180 ok = True
181 for offset in offsets:
182 if offset not in seen_offsets:
183 seen_offsets.add(offset)
184 else:
185 ok = False
186 logging.warning('Duplicate offset: ' + hex(offset))
187 return ok
188
189
190 def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,
191 output_file):
192 """Output the order file to output_file.
193
194 Args:
195 offsets: Iterable of offsets to match to section names
196 offset_to_symbol_infos: {offset: [SymbolInfo]}
197 symbol_to_section_map: {name: section}
198 output_file: file-like object to write the results to
199 """
200 unknown_symbol_warnings = WarningCollector(300)
201 symbol_not_found_warnings = WarningCollector(300)
202 for offset in offsets:
203 try:
204 symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset)
205 for symbol_info in symbol_infos:
206 if symbol_info.name in symbol_to_section_map:
207 output_file.write(symbol_to_section_map[symbol_info.name] + '\n')
208 else:
209 unknown_symbol_warnings.Write(
210 'No known section for symbol ' + symbol_info.name)
211 except SymbolNotFoundException:
212 symbol_not_found_warnings.Write(
213 'Did not find function in binary. offset: ' + hex(offset))
214 unknown_symbol_warnings.WriteEnd('no known section for symbol')
215 symbol_not_found_warnings.WriteEnd('did not find function')
216
217
236 def main(): 218 def main():
237 """Write output for profiled run to standard out.
238
239 The format of the output depends on the output type specified as the third
240 command line argument. The default output type is to symbolize the addresses
241 of the functions called.
242 """
243 parser = optparse.OptionParser('usage: %prog [options] log_file lib_file') 219 parser = optparse.OptionParser('usage: %prog [options] log_file lib_file')
244 parser.add_option('-t', '--outputType', dest='output_type', 220 # TODO(lizeb): Remove this unused flag once the client script is updated.
245 default='symbolize', type='string', 221 parser.add_option('-t', dest='output_type', choices=['orderfile'],
246 help='lineize or symbolize or orderfile') 222 help='lineize or symbolize or orderfile')
247 223
248 # Option for output type. The log file and lib file arguments are required 224 # Option for output type. The log file and lib file arguments are required
249 # by the script and therefore are not options. 225 # by the script and therefore are not options.
250 (options, args) = parser.parse_args() 226 (_, args) = parser.parse_args()
251 if len(args) != 2: 227 if len(args) != 2:
252 parser.error('expected 2 args: log_file lib_file') 228 parser.error('expected 2 args: log_file lib_file')
253
254 (log_file, lib_file) = args 229 (log_file, lib_file) = args
255 output_type = options.output_type
256 230
257 obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj')) 231 obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj'))
258 232
259 log_file_lines = map(string.rstrip, open(log_file).readlines()) 233 log_file_lines = map(string.rstrip, open(log_file).readlines())
260 call_info = ParseLogLines(log_file_lines) 234 call_info = _ParseLogLines(log_file_lines)
261 (unique_addrs, address_map) = ParseLibSymbols(lib_file) 235 offsets = [call[3] for call in call_info]
236 offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_file)
262 237
263 # Check for duplicate addresses in the log file, and print a warning if 238 _WarnAboutDuplicates(offsets)
pasko 2015/01/30 13:34:48 how many warnings are produced in practice?
Benoit L 2015/01/30 16:22:38 0 +/- 0 (95% CL)
pasko 2015/01/30 19:31:32 CI ?
264 # duplicates are found. The instrumentation that produces the log file
265 # should only print the first time a function is entered.
266 addr_list = []
267 for call in call_info:
268 addr = call[3]
269 if addr not in addr_list:
270 addr_list.append(addr)
271 else:
272 print('WARNING: Address ' + hex(addr) + ' (line= ' +
273 AddrToLine(addr, lib_file) + ') already profiled.')
274 239
275 symbol_to_section_map = SymbolToSection(obj_dir) 240 symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir)
241 _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map,
242 sys.stdout)
276 243
277 unknown_symbol_warnings = WarningCollector(300)
278 symbol_not_found_warnings = WarningCollector(300)
279 for call in call_info:
280 addr = call[3]
281 if output_type == 'lineize':
282 symbol = AddrToLine(addr, lib_file)
283 print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
284 + symbol)
285 elif output_type == 'orderfile':
286 try:
287 symbols = FindFunctions(addr, unique_addrs, address_map)
288 for symbol in symbols:
289 if symbol in symbol_to_section_map:
290 print symbol_to_section_map[symbol]
291 else:
292 unknown_symbol_warnings.Write(
293 'WARNING: No known section for symbol ' + symbol)
294 print ''
295 except SymbolNotFoundException:
296 symbol_not_found_warnings.Write(
297 'WARNING: Did not find function in binary. addr: '
298 + hex(addr))
299 else:
300 try:
301 symbols = FindFunctions(addr, unique_addrs, address_map)
302 print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t'
303 + symbols[0])
304 first_symbol = True
305 for symbol in symbols:
306 if not first_symbol:
307 print '\t\t\t\t\t' + symbol
308 else:
309 first_symbol = False
310 except SymbolNotFoundException:
311 symbol_not_found_warnings.Write(
312 'WARNING: Did not find function in binary. addr: '
313 + hex(addr))
314 unknown_symbol_warnings.WriteEnd('no known section for symbol')
315 symbol_not_found_warnings.WriteEnd('did not find function')
316 244
317 if __name__ == '__main__': 245 if __name__ == '__main__':
318 main() 246 main()
pasko 2015/01/30 13:34:48 shouldn't we also set the logging level above this
Benoit L 2015/01/30 16:22:38 Done.
OLDNEW
« no previous file with comments | « no previous file | tools/cygprofile/symbolize_unittest.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698