OLD | NEW |
---|---|
1 #!/usr/bin/python | 1 #!/usr/bin/python |
2 # Copyright 2013 The Chromium Authors. All rights reserved. | 2 # Copyright 2013 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 """Symbolize log file produced by cypgofile instrumentation. | 6 """Symbolize log file produced by cypgofile instrumentation. |
pasko
2015/01/30 13:34:48
nit: cypgofile instrumentation (mergetraces.py)
Benoit L
2015/01/30 16:22:38
Done.
| |
7 | 7 |
8 Given a log file and the binary being profiled (e.g. executable, shared | 8 Given a log file and the binary being profiled, create an order file. |
pasko
2015/01/30 13:34:48
it's a misleading name for the file, since symboli
Benoit L
2015/01/30 16:22:38
Done.
| |
9 library), the script can produce three different outputs: 1) symbols for the | |
10 addresses, 2) function and line numbers for the addresses, or 3) an order file. | |
11 """ | 9 """ |
12 | 10 |
11 import logging | |
12 import multiprocessing | |
13 import optparse | 13 import optparse |
14 import os | 14 import os |
15 import string | 15 import string |
16 import subprocess | |
17 import sys | 16 import sys |
18 | 17 |
18 import symbol_extractor | |
19 | 19 |
20 def ParseLogLines(log_file_lines): | 20 |
21 """Parse a log file produced by the profiled run of clank. | 21 def _ParseLogLines(log_file_lines): |
22 """Parse a log file produced by the profiled run of chrome. | |
pasko
2015/01/30 13:34:48
nit: maybe enhance the comment with:
(i.e. "merge
Benoit L
2015/01/30 16:22:38
Done.
| |
22 | 23 |
23 Args: | 24 Args: |
24 log_file_lines: array of lines in log file produced by profiled run | 25 log_file_lines: array of lines in log file produced by profiled run |
25 lib_name: library or executable containing symbols | 26 lib_name: library or executable containing symbols |
26 | 27 |
27 Below is an example of a small log file: | 28 Below is an example of a small log file: |
28 5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so | 29 5086e000-52e92000 r-xp 00000000 b3:02 51276 libchromeview.so |
29 secs usecs pid:threadid func | 30 secs usecs pid:threadid func |
30 START | 31 START |
31 1314897086 795828 3587:1074648168 0x509e105c | 32 1314897086 795828 3587:1074648168 0x509e105c |
32 1314897086 795874 3587:1074648168 0x509e0eb4 | 33 1314897086 795874 3587:1074648168 0x509e0eb4 |
33 1314897086 796326 3587:1074648168 0x509e0e3c | 34 1314897086 796326 3587:1074648168 0x509e0e3c |
34 1314897086 796552 3587:1074648168 0x509e07bc | 35 1314897086 796552 3587:1074648168 0x509e07bc |
35 END | 36 END |
36 | 37 |
37 Returns: | 38 Returns: |
38 call_info list with list of tuples of the format (sec, usec, call id, | 39 call_info list with list of tuples of the format (sec, usec, call id, |
pasko
2015/01/30 13:34:48
s/call_info list with//
Benoit L
2015/01/30 16:22:38
Done.
| |
39 function address called) | 40 function address called) |
40 """ | 41 """ |
41 call_lines = [] | 42 call_lines = [] |
42 vm_start = 0 | 43 vm_start = 0 |
43 line = log_file_lines[0] | 44 line = log_file_lines[0] |
44 assert("r-xp" in line) | 45 assert "r-xp" in line |
pasko
2015/01/30 13:34:48
s/"/'/
Benoit L
2015/01/30 16:22:38
Done.
| |
45 end_index = line.find('-') | 46 end_index = line.find('-') |
46 vm_start = int(line[:end_index], 16) | 47 vm_start = int(line[:end_index], 16) |
47 for line in log_file_lines[2:]: | 48 for line in log_file_lines[2:]: |
48 # print hex(vm_start) | |
49 fields = line.split() | 49 fields = line.split() |
50 if len(fields) == 4: | 50 if len(fields) == 4: |
51 call_lines.append(fields) | 51 call_lines.append(fields) |
52 | 52 |
53 # Convert strings to int in fields. | 53 # Convert strings to int in fields. |
54 call_info = [] | 54 call_info = [] |
55 for call_line in call_lines: | 55 for call_line in call_lines: |
56 (sec_timestamp, usec_timestamp) = map(int, call_line[0:2]) | 56 (sec_timestamp, usec_timestamp) = map(int, call_line[0:2]) |
57 callee_id = call_line[2] | 57 callee_id = call_line[2] |
58 addr = int(call_line[3], 16) | 58 addr = int(call_line[3], 16) |
59 if vm_start < addr: | 59 if vm_start < addr: |
60 addr -= vm_start | 60 addr -= vm_start |
61 call_info.append((sec_timestamp, usec_timestamp, callee_id, addr)) | 61 call_info.append((sec_timestamp, usec_timestamp, callee_id, addr)) |
62 | 62 |
63 return call_info | 63 return call_info |
64 | 64 |
65 def GetStdOutputLines(cmd): | |
66 p = subprocess.Popen(cmd, stdout=subprocess.PIPE) | |
67 output = p.communicate()[0] | |
68 return output.split('\n') | |
69 | 65 |
70 def ParseLibSymbols(lib_file): | 66 def _GroupLibrarySymbolInfosByOffset(lib_filename): |
71 """Get output from running nm and greping for text symbols. | 67 """Return a dict {offset: [SymbolInfo]} from a library.""" |
68 symbol_infos = symbol_extractor.SymbolInfosFromBinary(lib_filename) | |
69 return symbol_extractor.GroupSymbolInfosByOffset(symbol_infos) | |
72 | 70 |
73 Args: | |
74 lib_file: the library or executable that contains the profiled code | |
75 | |
76 Returns: | |
77 list of sorted unique addresses and corresponding size of function symbols | |
78 in lib_file and map of addresses to all symbols at a particular address | |
79 """ | |
80 cmd = ['nm', '-S', '-n', lib_file] | |
81 nm_lines = GetStdOutputLines(cmd) | |
82 | |
83 nm_symbols = [] | |
84 for nm_line in nm_lines: | |
85 if any(str in nm_line for str in (' t ', ' W ', ' T ')): | |
86 nm_symbols.append(nm_line) | |
87 | |
88 nm_index = 0 | |
89 unique_addrs = [] | |
90 address_map = {} | |
91 while nm_index < len(nm_symbols): | |
92 | |
93 # If the length of the split line is not 4, then it does not contain all the | |
94 # information needed to symbolize (i.e. address, size and symbol name). | |
95 if len(nm_symbols[nm_index].split()) == 4: | |
96 (addr, size) = [int(x, 16) for x in nm_symbols[nm_index].split()[0:2]] | |
97 | |
98 # Multiple symbols may be at the same address. This is do to aliasing | |
99 # done by the compiler. Since there is no way to be sure which one was | |
100 # called in profiled run, we will symbolize to include all symbol names at | |
101 # a particular address. | |
102 fnames = [] | |
103 while (nm_index < len(nm_symbols) and | |
104 addr == int(nm_symbols[nm_index].split()[0], 16)): | |
105 if len(nm_symbols[nm_index].split()) == 4: | |
106 fnames.append(nm_symbols[nm_index].split()[3]) | |
107 nm_index += 1 | |
108 address_map[addr] = fnames | |
109 unique_addrs.append((addr, size)) | |
110 else: | |
111 nm_index += 1 | |
112 | |
113 return (unique_addrs, address_map) | |
114 | 71 |
115 class SymbolNotFoundException(Exception): | 72 class SymbolNotFoundException(Exception): |
116 def __init__(self,value): | 73 def __init__(self, value): |
117 super(SymbolNotFoundException,self).__init__(value) | 74 super(SymbolNotFoundException, self).__init__(value) |
118 self.value = value | 75 self.value = value |
76 | |
119 def __str__(self): | 77 def __str__(self): |
120 return repr(self.value) | 78 return repr(self.value) |
121 | 79 |
122 def BinarySearchAddresses(addr, start, end, arr): | |
123 """Find starting address of a symbol at a particular address. | |
124 | 80 |
125 The reason we can not directly use the address provided by the log file is | 81 def _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset): |
126 that the log file may give an address after the start of the symbol. The | 82 """Find all SymbolInfo at a given offset. |
127 logged address is often one byte after the start. By using this search | |
128 function rather than just subtracting one from the logged address allows | |
129 the logging instrumentation to log any address in a function. | |
130 | 83 |
131 Args: | 84 Args: |
132 addr: the address being searched for | 85 offset_to_symbol_infos: {offset: [SymbolInfo]} |
133 start: the starting index for the binary search | 86 offset: offset to look the symbols at |
134 end: the ending index for the binary search | |
135 arr: the list being searched containing tuple of address and size | |
136 | 87 |
137 Returns: | 88 Returns: |
138 the starting address of the symbol at address addr | 89 The list of SymbolInfo at the given offset |
139 | 90 |
140 Raises: | 91 Raises: |
141 Exception: if address not found. Functions expects all logged addresses | 92 SymbolNotFoundException if the offset doesn't match any symbol. |
142 to be found | |
143 """ | 93 """ |
144 # print "addr: " + str(addr) + " start: " + str(start) + " end: " + str(end) | 94 if offset in offset_to_symbol_infos: |
145 if start >= end or start == end - 1: | 95 return offset_to_symbol_infos[offset] |
146 # arr[i] is a tuple of address and size. Check if addr inside range | 96 elif offset % 2 and (offset - 1) in offset_to_symbol_infos: |
147 if addr >= arr[start][0] and addr < arr[start][0] + arr[start][1]: | 97 # On ARM, odd addresses are used to signal thumb instruction. They are |
148 return arr[start][0] | 98 # generated by setting the LSB to 1 (see |
149 elif addr >= arr[end][0] and addr < arr[end][0] + arr[end][1]: | 99 # http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dui0471e/Babfj hia.html). |
150 return arr[end][0] | 100 # TODO(lizeb): Make sure this hack doesn't propagate to other archs. |
151 else: | 101 return offset_to_symbol_infos[offset - 1] |
152 raise SymbolNotFoundException(addr) | |
153 else: | 102 else: |
154 halfway = (start + end) / 2 | 103 raise SymbolNotFoundException(offset) |
pasko
2015/01/30 13:34:48
If we raise an exception like this, it will be ign
Benoit L
2015/01/30 16:22:38
Done.
| |
155 (nm_addr, size) = arr[halfway] | |
156 # print "nm_addr: " + str(nm_addr) + " halfway: " + str(halfway) | |
157 if addr >= nm_addr and addr < nm_addr + size: | |
158 return nm_addr | |
159 elif addr < nm_addr: | |
160 return BinarySearchAddresses(addr, start, halfway-1, arr) | |
161 else: | |
162 # Condition (addr >= nm_addr + size) must be true. | |
163 return BinarySearchAddresses(addr, halfway+1, end, arr) | |
164 | 104 |
165 | 105 |
166 def FindFunctions(addr, unique_addrs, address_map): | 106 class WarningCollector(object): |
167 """Find function symbol names at address addr.""" | 107 """Collect warnings, but limit the number printed to a set value.""" |
168 return address_map[BinarySearchAddresses(addr, 0, len(unique_addrs) - 1, | 108 def __init__(self, max_warnings): |
169 unique_addrs)] | 109 self._warnings = 0 |
110 self._max_warnings = max_warnings | |
111 | |
112 def Write(self, message): | |
113 if self._warnings < self._max_warnings: | |
114 logging.warning(message) | |
115 self._warnings += 1 | |
116 | |
117 def WriteEnd(self, message): | |
118 if self._warnings > self._max_warnings: | |
119 logging.warning('%d more warnings for %s' % ( | |
120 self._warnings - self._max_warnings, message)) | |
170 | 121 |
171 | 122 |
172 def AddrToLine(addr, lib_file): | 123 def _GetObjectFileNames(obj_dir): |
173 """Use addr2line to determine line info of a particular address.""" | 124 """Return the list of object files in a directory.""" |
174 cmd = ['addr2line', '-f', '-e', lib_file, hex(addr)] | |
175 output = GetStdOutputLines(cmd) | |
176 assert(len(output) == 2) | |
177 return ':'.join(output) | |
178 | |
179 def GetObjectFileNames(obj_dir): | |
180 """ Gets the list of object files in the output directory. """ | |
181 obj_files = [] | 125 obj_files = [] |
182 for (dirpath, _, filenames) in os.walk(obj_dir): | 126 for (dirpath, _, filenames) in os.walk(obj_dir): |
183 for file_name in filenames: | 127 for file_name in filenames: |
184 if file_name.endswith('.o'): | 128 if file_name.endswith('.o'): |
185 obj_files.append(os.path.join(dirpath, file_name)) | 129 obj_files.append(os.path.join(dirpath, file_name)) |
186 return obj_files | 130 return obj_files |
187 | 131 |
188 class WarningCollector(object): | |
189 def __init__(self, max_warnings): | |
190 self._warnings = 0 | |
191 self._max_warnings = max_warnings | |
192 | 132 |
193 def Write(self, message): | 133 def _AllSymbolInfos(object_filenames): |
194 if self._warnings < self._max_warnings: | 134 """Return a list of SymbolInfo from an iterable of filenames.""" |
195 sys.stderr.write(message + '\n') | 135 pool = multiprocessing.Pool() |
196 self._warnings += 1 | 136 symbol_infos_nested = pool.map( |
137 symbol_extractor.SymbolInfosFromBinary, object_filenames) | |
138 result = [] | |
139 for symbol_infos in symbol_infos_nested: | |
140 result += symbol_infos | |
141 return result | |
197 | 142 |
198 def WriteEnd(self, message): | |
199 if self._warnings > self._max_warnings: | |
200 sys.stderr.write(str(self._warnings - self._max_warnings) + | |
201 ' more warnings for: ' + message + '\n') | |
202 | 143 |
203 def SymbolToSection(obj_dir): | 144 def _GetSymbolToSectionMapFromObjectFiles(obj_dir): |
204 """ Gets a mapping from symbol to linker section name by scanning all | 145 """ Create a mapping from symbol to linker section name by scanning all |
205 of the object files. """ | 146 the object files. |
206 object_files = GetObjectFileNames(obj_dir) | 147 """ |
148 object_files = _GetObjectFileNames(obj_dir) | |
207 symbol_to_section_map = {} | 149 symbol_to_section_map = {} |
208 symbol_warnings = WarningCollector(300) | 150 symbol_warnings = WarningCollector(300) |
209 for obj_file in object_files: | 151 symbol_infos = _AllSymbolInfos(object_files) |
210 cmd = ['objdump', '-w', '-t', obj_file] | 152 symbol_infos = [s for s in symbol_infos if not s.name.startswith('.LTHUNK')] |
211 symbol_lines = GetStdOutputLines(cmd) | 153 for symbol_info in symbol_infos: |
212 for symbol_line in symbol_lines: | 154 symbol = symbol_info.name |
213 items = symbol_line.split() | 155 section = symbol_info.section |
214 # All of the symbol lines we care about are in the form | 156 if ((symbol in symbol_to_section_map) and |
215 # 0000000000 g F .text.foo 000000000 [.hidden] foo | 157 (symbol_to_section_map[symbol] != symbol_info.section)): |
216 # where g (global) might also be l (local) or w (weak). | 158 symbol_warnings.Write('Symbol ' + symbol + |
217 if len(items) > 4 and items[2] == 'F': | 159 ' in conflicting sections ' + section + |
218 # This symbol is a function | 160 ' and ' + symbol_to_section_map[symbol]) |
219 symbol = items[len(items) - 1] | 161 elif not section.startswith('.text'): |
220 if symbol.startswith('.LTHUNK'): | 162 symbol_warnings.Write('Symbol ' + symbol + |
221 continue | 163 ' in incorrect section ' + section) |
222 section = items[3] | 164 else: |
223 if ((symbol in symbol_to_section_map) and | 165 symbol_to_section_map[symbol] = section |
224 (symbol_to_section_map[symbol] != section)): | |
225 symbol_warnings.Write('WARNING: Symbol ' + symbol + | |
226 ' in conflicting sections ' + section + | |
227 ' and ' + symbol_to_section_map[symbol]) | |
228 elif not section.startswith('.text.'): | |
229 symbol_warnings.Write('WARNING: Symbol ' + symbol + | |
230 ' in incorrect section ' + section) | |
231 else: | |
232 symbol_to_section_map[symbol] = section | |
233 symbol_warnings.WriteEnd('bad sections') | 166 symbol_warnings.WriteEnd('bad sections') |
234 return symbol_to_section_map | 167 return symbol_to_section_map |
235 | 168 |
169 | |
170 def _WarnAboutDuplicates(offsets): | |
171 """Warn about duplicate offsets. | |
172 | |
173 Args: | |
174 offsets: list of offsets to check for duplicates | |
175 | |
176 Returns: | |
177 True if there are no duplicates, False otherwise. | |
178 """ | |
179 seen_offsets = set() | |
180 ok = True | |
181 for offset in offsets: | |
182 if offset not in seen_offsets: | |
183 seen_offsets.add(offset) | |
184 else: | |
185 ok = False | |
186 logging.warning('Duplicate offset: ' + hex(offset)) | |
187 return ok | |
188 | |
189 | |
190 def _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map, | |
191 output_file): | |
192 """Output the order file to output_file. | |
193 | |
194 Args: | |
195 offsets: Iterable of offsets to match to section names | |
196 offset_to_symbol_infos: {offset: [SymbolInfo]} | |
197 symbol_to_section_map: {name: section} | |
198 output_file: file-like object to write the results to | |
199 """ | |
200 unknown_symbol_warnings = WarningCollector(300) | |
201 symbol_not_found_warnings = WarningCollector(300) | |
202 for offset in offsets: | |
203 try: | |
204 symbol_infos = _FindSymbolInfosAtOffset(offset_to_symbol_infos, offset) | |
205 for symbol_info in symbol_infos: | |
206 if symbol_info.name in symbol_to_section_map: | |
207 output_file.write(symbol_to_section_map[symbol_info.name] + '\n') | |
208 else: | |
209 unknown_symbol_warnings.Write( | |
210 'No known section for symbol ' + symbol_info.name) | |
211 except SymbolNotFoundException: | |
212 symbol_not_found_warnings.Write( | |
213 'Did not find function in binary. offset: ' + hex(offset)) | |
214 unknown_symbol_warnings.WriteEnd('no known section for symbol') | |
215 symbol_not_found_warnings.WriteEnd('did not find function') | |
216 | |
217 | |
236 def main(): | 218 def main(): |
237 """Write output for profiled run to standard out. | |
238 | |
239 The format of the output depends on the output type specified as the third | |
240 command line argument. The default output type is to symbolize the addresses | |
241 of the functions called. | |
242 """ | |
243 parser = optparse.OptionParser('usage: %prog [options] log_file lib_file') | 219 parser = optparse.OptionParser('usage: %prog [options] log_file lib_file') |
244 parser.add_option('-t', '--outputType', dest='output_type', | 220 # TODO(lizeb): Remove this unused flag once the client script is updated. |
245 default='symbolize', type='string', | 221 parser.add_option('-t', dest='output_type', choices=['orderfile'], |
246 help='lineize or symbolize or orderfile') | 222 help='lineize or symbolize or orderfile') |
247 | 223 |
248 # Option for output type. The log file and lib file arguments are required | 224 # Option for output type. The log file and lib file arguments are required |
249 # by the script and therefore are not options. | 225 # by the script and therefore are not options. |
250 (options, args) = parser.parse_args() | 226 (_, args) = parser.parse_args() |
251 if len(args) != 2: | 227 if len(args) != 2: |
252 parser.error('expected 2 args: log_file lib_file') | 228 parser.error('expected 2 args: log_file lib_file') |
253 | |
254 (log_file, lib_file) = args | 229 (log_file, lib_file) = args |
255 output_type = options.output_type | |
256 | 230 |
257 obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj')) | 231 obj_dir = os.path.abspath(os.path.join(os.path.dirname(lib_file), '../obj')) |
258 | 232 |
259 log_file_lines = map(string.rstrip, open(log_file).readlines()) | 233 log_file_lines = map(string.rstrip, open(log_file).readlines()) |
260 call_info = ParseLogLines(log_file_lines) | 234 call_info = _ParseLogLines(log_file_lines) |
261 (unique_addrs, address_map) = ParseLibSymbols(lib_file) | 235 offsets = [call[3] for call in call_info] |
236 offset_to_symbol_infos = _GroupLibrarySymbolInfosByOffset(lib_file) | |
262 | 237 |
263 # Check for duplicate addresses in the log file, and print a warning if | 238 _WarnAboutDuplicates(offsets) |
pasko
2015/01/30 13:34:48
how many warnings are produced in practice?
Benoit L
2015/01/30 16:22:38
0 +/- 0 (95% CL)
pasko
2015/01/30 19:31:32
CI ?
| |
264 # duplicates are found. The instrumentation that produces the log file | |
265 # should only print the first time a function is entered. | |
266 addr_list = [] | |
267 for call in call_info: | |
268 addr = call[3] | |
269 if addr not in addr_list: | |
270 addr_list.append(addr) | |
271 else: | |
272 print('WARNING: Address ' + hex(addr) + ' (line= ' + | |
273 AddrToLine(addr, lib_file) + ') already profiled.') | |
274 | 239 |
275 symbol_to_section_map = SymbolToSection(obj_dir) | 240 symbol_to_section_map = _GetSymbolToSectionMapFromObjectFiles(obj_dir) |
241 _OutputOrderfile(offsets, offset_to_symbol_infos, symbol_to_section_map, | |
242 sys.stdout) | |
276 | 243 |
277 unknown_symbol_warnings = WarningCollector(300) | |
278 symbol_not_found_warnings = WarningCollector(300) | |
279 for call in call_info: | |
280 addr = call[3] | |
281 if output_type == 'lineize': | |
282 symbol = AddrToLine(addr, lib_file) | |
283 print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t' | |
284 + symbol) | |
285 elif output_type == 'orderfile': | |
286 try: | |
287 symbols = FindFunctions(addr, unique_addrs, address_map) | |
288 for symbol in symbols: | |
289 if symbol in symbol_to_section_map: | |
290 print symbol_to_section_map[symbol] | |
291 else: | |
292 unknown_symbol_warnings.Write( | |
293 'WARNING: No known section for symbol ' + symbol) | |
294 print '' | |
295 except SymbolNotFoundException: | |
296 symbol_not_found_warnings.Write( | |
297 'WARNING: Did not find function in binary. addr: ' | |
298 + hex(addr)) | |
299 else: | |
300 try: | |
301 symbols = FindFunctions(addr, unique_addrs, address_map) | |
302 print(str(call[0]) + ' ' + str(call[1]) + '\t' + str(call[2]) + '\t' | |
303 + symbols[0]) | |
304 first_symbol = True | |
305 for symbol in symbols: | |
306 if not first_symbol: | |
307 print '\t\t\t\t\t' + symbol | |
308 else: | |
309 first_symbol = False | |
310 except SymbolNotFoundException: | |
311 symbol_not_found_warnings.Write( | |
312 'WARNING: Did not find function in binary. addr: ' | |
313 + hex(addr)) | |
314 unknown_symbol_warnings.WriteEnd('no known section for symbol') | |
315 symbol_not_found_warnings.WriteEnd('did not find function') | |
316 | 244 |
317 if __name__ == '__main__': | 245 if __name__ == '__main__': |
318 main() | 246 main() |
pasko
2015/01/30 13:34:48
shouldn't we also set the logging level above this
Benoit L
2015/01/30 16:22:38
Done.
| |
OLD | NEW |