OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # coding=utf-8 | |
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
4 # Use of this source code is governed by a BSD-style license that can be | |
5 # found in the LICENSE file. | |
6 | |
7 """Traces an executable and its child processes and extract the files accessed | |
8 by them. | |
9 | |
10 The implementation uses OS-specific API. The native Kernel logger and the ETL | |
11 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise. | |
12 The OS-specific implementation is hidden in an 'API' interface. | |
13 | |
14 The results are embedded in a Results instance. The tracing is done in two | |
15 phases, the first is to do the actual trace and generate an | |
16 implementation-specific log file. Then the log file is parsed to extract the | |
17 information, including the individual child processes and the files accessed | |
18 from the log. | |
19 """ | |
20 | |
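# A minimal usage sketch of the two-phase flow described above, assuming a | |
# strace-based run on Linux (the paths and the trace name are illustrative | |
# only): | |
# | |
#   api = Strace() | |
#   with api.get_tracer('/tmp/trace.json') as tracer: | |
#     tracer.trace(['/bin/ls', '/'], '/tmp', 'mytrace', True) | |
#   for item in Strace.parse_log('/tmp/trace.json', lambda _: False): | |
#     print item['results'].files | |
| |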
21 import codecs | |
22 import csv | |
23 import getpass | |
24 import glob | |
25 import json | |
26 import logging | |
27 import optparse | |
28 import os | |
29 import re | |
30 import subprocess | |
31 import sys | |
32 import tempfile | |
33 import threading | |
34 import time | |
35 import weakref | |
36 | |
37 ## OS-specific imports | |
38 | |
39 if sys.platform == 'win32': | |
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p | |
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611 | |
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611 | |
43 elif sys.platform == 'darwin': | |
44 import Carbon.File # pylint: disable=F0401 | |
45 import MacOS # pylint: disable=F0401 | |
46 | |
47 | |
48 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
49 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) | |
50 | |
51 | |
52 class TracingFailure(Exception): | |
53 """An exception occured during tracing.""" | |
54 def __init__(self, description, pid, line_number, line, *args): | |
55 super(TracingFailure, self).__init__( | |
56 description, pid, line_number, line, *args) | |
57 self.description = description | |
58 self.pid = pid | |
59 self.line_number = line_number | |
60 self.line = line | |
61 self.extra = args | |
62 | |
63 def __str__(self): | |
64 out = self.description | |
65 if self.pid: | |
66 out += '\npid: %d' % self.pid | |
67 if self.line_number: | |
68 out += '\nline: %d' % self.line_number | |
69 if self.line: | |
70 out += '\n%s' % self.line | |
71 if self.extra: | |
72 out += '\n' + ', '.join(map(str, filter(None, self.extra))) | |
73 return out | |
74 | |
75 | |
76 ## OS-specific functions | |
77 | |
78 if sys.platform == 'win32': | |
79 def QueryDosDevice(drive_letter): | |
80 """Returns the Windows 'native' path for a DOS drive letter.""" | |
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter | |
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes. | |
83 chars = 1024 | |
84 drive_letter = unicode(drive_letter) | |
85 p = create_unicode_buffer(chars) | |
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars): | |
87 err = GetLastError() | |
88 if err: | |
89 # pylint: disable=E0602 | |
90 raise WindowsError( | |
91 err, | |
92 'QueryDosDevice(%s): %s (%d)' % ( | |
93 str(drive_letter), FormatError(err), err)) | |
94 return p.value | |
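# For reference (illustrative; the volume number varies per machine), | |
# QueryDosDevice('C:') typically returns something like | |
# u'\Device\HarddiskVolume1'. | |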
95 | |
96 | |
97 def GetShortPathName(long_path): | |
98 """Returns the Windows short path equivalent for a 'long' path.""" | |
99 long_path = unicode(long_path) | |
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is | |
101 # not enforced. | |
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'): | |
103 long_path = '\\\\?\\' + long_path | |
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0) | |
105 if chars: | |
106 p = create_unicode_buffer(chars) | |
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars): | |
108 return p.value | |
109 | |
110 err = GetLastError() | |
111 if err: | |
112 # pylint: disable=E0602 | |
113 raise WindowsError( | |
114 err, | |
115 'GetShortPathName(%s): %s (%d)' % ( | |
116 str(long_path), FormatError(err), err)) | |
117 | |
118 | |
119 def GetLongPathName(short_path): | |
120 """Returns the Windows long path equivalent for a 'short' path.""" | |
121 short_path = unicode(short_path) | |
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is | |
123 # not enforced. | |
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'): | |
125 short_path = '\\\\?\\' + short_path | |
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0) | |
127 if chars: | |
128 p = create_unicode_buffer(chars) | |
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars): | |
130 return p.value | |
131 | |
132 err = GetLastError() | |
133 if err: | |
134 # pylint: disable=E0602 | |
135 raise WindowsError( | |
136 err, | |
137 'GetLongPathName(%s): %s (%d)' % ( | |
138 str(short_path), FormatError(err), err)) | |
139 | |
140 | |
141 def get_current_encoding(): | |
142 """Returns the 'ANSI' code page associated to the process.""" | |
143 return 'cp%d' % int(windll.kernel32.GetACP()) | |
144 | |
145 | |
146 class DosDriveMap(object): | |
147 """Maps \Device\HarddiskVolumeN to N: on Windows.""" | |
148 # Keep one global cache. | |
149 _MAPPING = {} | |
150 | |
151 def __init__(self): | |
152 """Lazy loads the cache.""" | |
153 if not self._MAPPING: | |
154 # This is related to UNC resolver on windows. Ignore that. | |
155 self._MAPPING['\\Device\\Mup'] = None | |
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot'] | |
157 | |
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)): | |
159 try: | |
160 letter = '%s:' % letter | |
161 mapped = QueryDosDevice(letter) | |
162 if mapped in self._MAPPING: | |
163 logging.warn( | |
164 ('Two drives, \'%s\' and \'%s\', are mapped to the same disk. ' | |
165 'Drive letters are a user-mode concept and the kernel ' | |
166 'traces only have NT paths, so all accesses will be ' | |
167 'associated with the first drive letter, independent of the ' | |
168 'actual letter used by the code') % ( | |
169 self._MAPPING[mapped], letter)) | |
170 else: | |
171 self._MAPPING[mapped] = letter | |
172 except WindowsError: # pylint: disable=E0602 | |
173 pass | |
174 | |
175 def to_win32(self, path): | |
176 """Converts a native NT path to Win32/DOS compatible path.""" | |
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path) | |
178 if not match: | |
179 raise ValueError( | |
180 'Can\'t convert %s into a Win32 compatible path' % path, | |
181 path) | |
182 if not match.group(1) in self._MAPPING: | |
183 # Unmapped partitions may be accessed by windows for the | |
184 # fun of it while the test is running. Discard these. | |
185 return None | |
186 drive = self._MAPPING[match.group(1)] | |
187 if not drive or not match.group(2): | |
188 return drive | |
189 return drive + match.group(2) | |
190 | |
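# Illustrative only: with a mapping {'\Device\HarddiskVolume1': 'C:'}, calling | |
# to_win32('\Device\HarddiskVolume1\Windows\win.ini') returns | |
# 'C:\Windows\win.ini'. | |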
191 | |
192 def isabs(path): | |
193 """Accepts X: as an absolute path, unlike python's os.path.isabs().""" | |
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':' | |
195 | |
196 | |
197 def get_native_path_case(p): | |
198 """Returns the native path case for an existing file. | |
199 | |
200 On Windows, removes any leading '\\?\'. | |
201 """ | |
202 if not isabs(p): | |
203 raise ValueError( | |
204 'Can\'t get native path case for a non-absolute path: %s' % p, | |
205 p) | |
206 # Windows used to have an option to turn on case sensitivity for the non-Win32 | |
207 # subsystem, but that's out of scope here and isn't supported anymore. | |
208 # Go figure why GetShortPathName() is needed. | |
209 try: | |
210 out = GetLongPathName(GetShortPathName(p)) | |
211 except OSError, e: | |
212 if e.args[0] in (2, 3, 5): | |
213 # The path does not exist. Try to recurse and reconstruct the path. | |
214 base = os.path.dirname(p) | |
215 rest = os.path.basename(p) | |
216 return os.path.join(get_native_path_case(base), rest) | |
217 raise | |
218 if out.startswith('\\\\?\\'): | |
219 out = out[4:] | |
220 # Always upper case the first letter since GetLongPathName() will return the | |
221 # drive letter in the case it was given. | |
222 return out[0].upper() + out[1:] | |
223 | |
224 | |
225 def CommandLineToArgvW(command_line): | |
226 """Splits a commandline into argv using CommandLineToArgvW().""" | |
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx | |
228 size = c_int() | |
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size)) | |
230 try: | |
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)] | |
232 finally: | |
233 windll.kernel32.LocalFree(ptr) | |
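# Illustrative only: CommandLineToArgvW(u'foo.exe "my file.txt" /b') is | |
# expected to yield [u'foo.exe', u'my file.txt', u'/b']. | |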
234 | |
235 | |
236 elif sys.platform == 'darwin': | |
237 | |
238 | |
239 # On non-windows, keep the stdlib behavior. | |
240 isabs = os.path.isabs | |
241 | |
242 | |
243 def _find_item_native_case(root_path, item): | |
244 """Gets the native path case of a single item based at root_path. | |
245 | |
246 There is no API to get the native path case of symlinks on OSX. So it | |
247 needs to be done the slow way. | |
248 """ | |
249 item = item.lower() | |
250 for element in os.listdir(root_path): | |
251 if element.lower() == item: | |
252 return element | |
253 | |
254 | |
255 def _native_case(p): | |
256 """Gets the native path case. Warning: this function resolves symlinks.""" | |
257 logging.debug('native_case(%s)' % p) | |
258 try: | |
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p) | |
260 out = rel_ref.FSRefMakePath() | |
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep): | |
262 return out + os.path.sep | |
263 return out | |
264 except MacOS.Error, e: | |
265 if e.args[0] in (-43, -120): | |
266 # The path does not exist. Try to recurse and reconstruct the path. | |
267 # -43 means file not found. | |
268 # -120 means directory not found. | |
269 base = os.path.dirname(p) | |
270 rest = os.path.basename(p) | |
271 return os.path.join(_native_case(base), rest) | |
272 raise OSError( | |
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1]) | |
274 | |
275 | |
276 def _split_at_symlink_native(base_path, rest): | |
277 """Returns the native path for a symlink.""" | |
278 base, symlink, rest = split_at_symlink(base_path, rest) | |
279 if symlink: | |
280 if not base_path: | |
281 base_path = base | |
282 else: | |
283 base_path = safe_join(base_path, base) | |
284 symlink = _find_item_native_case(base_path, symlink) | |
285 return base, symlink, rest | |
286 | |
287 | |
288 def get_native_path_case(path): | |
289 """Returns the native path case for an existing file. | |
290 | |
291 Technically, only HFS+ on OSX is case-preserving and case-insensitive. | |
292 This is the default setting on HFS+ but it can be changed. | |
293 """ | |
294 if not isabs(path): | |
295 raise ValueError( | |
296 'Can\'t get native path case for a non-absolute path: %s' % path, | |
297 path) | |
298 if path.startswith('/dev'): | |
299 # /dev is not visible from Carbon, causing an exception. | |
300 return path | |
301 | |
302 # Starts assuming there is no symlink along the path. | |
303 resolved = _native_case(path) | |
304 if resolved.lower() == path.lower(): | |
305 # This code path is considerably faster. | |
306 return resolved | |
307 | |
308 # There was a symlink, process it. | |
309 base, symlink, rest = _split_at_symlink_native(None, path) | |
310 assert symlink, (path, base, symlink, rest, resolved) | |
311 prev = base | |
312 base = safe_join(_native_case(base), symlink) | |
313 assert len(base) > len(prev) | |
314 while rest: | |
315 prev = base | |
316 relbase, symlink, rest = _split_at_symlink_native(base, rest) | |
317 base = safe_join(base, relbase) | |
318 assert len(base) > len(prev), (prev, base, symlink) | |
319 if symlink: | |
320 base = safe_join(base, symlink) | |
321 assert len(base) > len(prev), (prev, base, symlink) | |
322 # Make sure no symlink was resolved. | |
323 assert base.lower() == path.lower(), (base, path) | |
324 return base | |
325 | |
326 | |
327 else: # OSes other than Windows and OSX. | |
328 | |
329 | |
330 # On non-windows, keep the stdlib behavior. | |
331 isabs = os.path.isabs | |
332 | |
333 | |
334 def get_native_path_case(path): | |
335 """Returns the native path case for an existing file. | |
336 | |
337 On OSes other than OSX and Windows, assume the file system is | |
338 case-sensitive. | |
339 | |
340 TODO(maruel): This is not strictly true. Implement if necessary. | |
341 """ | |
342 if not isabs(path): | |
343 raise ValueError( | |
344 'Can\'t get native path case for a non-absolute path: %s' % path, | |
345 path) | |
346 # Give up on cygwin, as GetLongPathName() can't be called. | |
347 # Linux traces tend not to be normalized, so use this occasion to normalize | |
348 # the path. The implementations for the other OSes already normalize the | |
349 # path, so it needs to be done here as well to be consistent across OSes. | |
350 out = os.path.normpath(path) | |
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep): | |
352 return out + os.path.sep | |
353 return out | |
354 | |
355 | |
356 if sys.platform != 'win32': # All non-Windows OSes. | |
357 | |
358 | |
359 def safe_join(*args): | |
360 """Joins path elements like os.path.join() but doesn't abort on absolute | |
361 path. | |
362 | |
363 os.path.join('foo', '/bar') == '/bar' | |
364 but safe_join('foo', '/bar') == 'foo/bar'. | |
365 """ | |
366 out = '' | |
367 for element in args: | |
368 if element.startswith(os.path.sep): | |
369 if out.endswith(os.path.sep): | |
370 out += element[1:] | |
371 else: | |
372 out += element | |
373 else: | |
374 if out.endswith(os.path.sep): | |
375 out += element | |
376 else: | |
377 out += os.path.sep + element | |
378 return out | |
379 | |
380 | |
381 def split_at_symlink(base_dir, relfile): | |
382 """Scans each component of relfile and cut the string at the symlink if | |
383 there is any. | |
384 | |
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if | |
386 not symlink was found. | |
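| |
Example (illustrative): with base_dir='/usr' and relfile='lib/python2.7', | |
where '/usr/lib' is a symlink, this returns ('', 'lib', '/python2.7'). | |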
387 """ | |
388 if base_dir: | |
389 assert relfile | |
390 assert os.path.isabs(base_dir) | |
391 index = 0 | |
392 else: | |
393 assert os.path.isabs(relfile) | |
394 index = 1 | |
395 | |
396 def at_root(rest): | |
397 if base_dir: | |
398 return safe_join(base_dir, rest) | |
399 return rest | |
400 | |
401 while True: | |
402 try: | |
403 index = relfile.index(os.path.sep, index) | |
404 except ValueError: | |
405 index = len(relfile) | |
406 full = at_root(relfile[:index]) | |
407 if os.path.islink(full): | |
408 # A symlink! | |
409 base = os.path.dirname(relfile[:index]) | |
410 symlink = os.path.basename(relfile[:index]) | |
411 rest = relfile[index:] | |
412 logging.debug( | |
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' % | |
414 (base_dir, relfile, base, symlink, rest)) | |
415 return base, symlink, rest | |
416 if index == len(relfile): | |
417 break | |
418 index += 1 | |
419 return relfile, None, None | |
420 | |
421 | |
422 def fix_python_path(cmd): | |
423 """Returns the fixed command line to call the right python executable.""" | |
424 out = cmd[:] | |
425 if out[0] == 'python': | |
426 out[0] = sys.executable | |
427 elif out[0].endswith('.py'): | |
428 out.insert(0, sys.executable) | |
429 return out | |
430 | |
431 | |
432 def create_thunk(): | |
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py') | |
434 os.write( | |
435 handle, | |
436 ( | |
437 'import subprocess\n' | |
438 'import sys\n' | |
439 'sys.exit(subprocess.call(sys.argv[2:]))\n' | |
440 )) | |
441 os.close(handle) | |
442 return name | |
443 | |
444 | |
445 def strace_process_quoted_arguments(text): | |
446 """Extracts quoted arguments on a string and return the arguments as a list. | |
447 | |
448 Implemented as an automaton. Supports incomplete strings in the form | |
449 '"foo"...'. | |
450 | |
451 Example: | |
452 With text = '"foo", "bar"', the function will return ['foo', 'bar'] | |
453 | |
454 TODO(maruel): Implement escaping. | |
455 """ | |
456 # All the possible states of the DFA. | |
457 ( NEED_QUOTE, # Beginning of a new argument. | |
458 INSIDE_STRING, # Inside an argument. | |
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is. | |
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be | |
461 # a series of 3 dots or a comma. | |
462 NEED_SPACE, # Right after a comma | |
463 NEED_DOT_2, # Found a dot, need a second one. | |
464 NEED_DOT_3, # Found second dot, need a third one. | |
465 NEED_COMMA, # Found third dot, need a comma. | |
466 ) = range(8) | |
467 | |
468 state = NEED_QUOTE | |
469 out = [] | |
470 for index, char in enumerate(text): | |
471 if char == '"': | |
472 if state == NEED_QUOTE: | |
473 state = INSIDE_STRING | |
474 # A new argument was found. | |
475 out.append('') | |
476 elif state == INSIDE_STRING: | |
477 # The argument is now closed. | |
478 state = NEED_COMMA_OR_DOT | |
479 elif state == ESCAPED: | |
480 out[-1] += char | |
481 state = INSIDE_STRING | |
482 else: | |
483 raise ValueError( | |
484 'Can\'t process char at column %d for: %r' % (index, text), | |
485 index, | |
486 text) | |
487 elif char == ',': | |
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA): | |
489 state = NEED_SPACE | |
490 elif state == INSIDE_STRING: | |
491 out[-1] += char | |
492 elif state == ESCAPED: | |
493 out[-1] += char | |
494 state = INSIDE_STRING | |
495 else: | |
496 raise ValueError( | |
497 'Can\'t process char at column %d for: %r' % (index, text), | |
498 index, | |
499 text) | |
500 elif char == ' ': | |
501 if state == NEED_SPACE: | |
502 state = NEED_QUOTE | |
503 elif state == INSIDE_STRING: | |
504 out[-1] += char | |
505 elif state == ESCAPED: | |
506 out[-1] += char | |
507 state = INSIDE_STRING | |
508 else: | |
509 raise ValueError( | |
510 'Can\'t process char at column %d for: %r' % (index, text), | |
511 index, | |
512 text) | |
513 elif char == '.': | |
514 if state == NEED_COMMA_OR_DOT: | |
515 # The string is incomplete; this means the strace -s flag should be | |
516 # increased. | |
517 state = NEED_DOT_2 | |
518 elif state == NEED_DOT_2: | |
519 state = NEED_DOT_3 | |
520 elif state == NEED_DOT_3: | |
521 state = NEED_COMMA | |
522 elif state == INSIDE_STRING: | |
523 out[-1] += char | |
524 elif state == ESCAPED: | |
525 out[-1] += char | |
526 state = INSIDE_STRING | |
527 else: | |
528 raise ValueError( | |
529 'Can\'t process char at column %d for: %r' % (index, text), | |
530 index, | |
531 text) | |
532 elif char == '\\': | |
533 if state == ESCAPED: | |
534 out[-1] += char | |
535 state = INSIDE_STRING | |
536 elif state == INSIDE_STRING: | |
537 state = ESCAPED | |
538 else: | |
539 raise ValueError( | |
540 'Can\'t process char at column %d for: %r' % (index, text), | |
541 index, | |
542 text) | |
543 else: | |
544 if state == INSIDE_STRING: | |
545 out[-1] += char | |
546 else: | |
547 raise ValueError( | |
548 'Can\'t process char at column %d for: %r' % (index, text), | |
549 index, | |
550 text) | |
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT): | |
552 raise ValueError( | |
553 'String is incorrectly terminated: %r' % text, | |
554 text) | |
555 return out | |
556 | |
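# Illustrative only: strace_process_quoted_arguments('"/bin/ls", "-l", "/tm"...') | |
# returns ['/bin/ls', '-l', '/tm']; the trailing '...' marks a string that was | |
# truncated because of the strace -s limit. | |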
557 | |
558 def read_json(filepath): | |
559 with open(filepath, 'r') as f: | |
560 return json.load(f) | |
561 | |
562 | |
563 def write_json(filepath_or_handle, data, dense): | |
564 """Writes data into filepath or file handle encoded as json. | |
565 | |
566 If dense is True, the json is packed. Otherwise, it is human readable. | |
567 """ | |
568 if hasattr(filepath_or_handle, 'write'): | |
569 if dense: | |
570 filepath_or_handle.write( | |
571 json.dumps(data, sort_keys=True, separators=(',',':'))) | |
572 else: | |
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2)) | |
574 else: | |
575 with open(filepath_or_handle, 'wb') as f: | |
576 if dense: | |
577 json.dump(data, f, sort_keys=True, separators=(',',':')) | |
578 else: | |
579 json.dump(data, f, sort_keys=True, indent=2) | |
580 | |
581 | |
582 class Results(object): | |
583 """Results of a trace session.""" | |
584 | |
585 class _TouchedObject(object): | |
586 """Something, a file or a directory, that was accessed.""" | |
587 def __init__(self, root, path, tainted, size, nb_files): | |
588 logging.debug( | |
589 '%s(%s, %s, %s, %s, %s)' % | |
590 (self.__class__.__name__, root, path, tainted, size, nb_files)) | |
591 self.root = root | |
592 self.path = path | |
593 self.tainted = tainted | |
594 self.nb_files = nb_files | |
595 # Can be used as a cache or a default value, depending on context. | |
596 self._size = size | |
597 # These are cache only. | |
598 self._real_path = None | |
599 | |
600 # Check internal consistency. | |
601 assert path, path | |
602 assert tainted or bool(root) != bool(isabs(path)), (root, path) | |
603 assert tainted or ( | |
604 not os.path.exists(self.full_path) or | |
605 (self.full_path == get_native_path_case(self.full_path))), ( | |
606 tainted, self.full_path, get_native_path_case(self.full_path)) | |
607 | |
608 @property | |
609 def existent(self): | |
610 return self.size != -1 | |
611 | |
612 @property | |
613 def full_path(self): | |
614 if self.root: | |
615 return os.path.join(self.root, self.path) | |
616 return self.path | |
617 | |
618 @property | |
619 def real_path(self): | |
620 """Returns the path with symlinks resolved.""" | |
621 if not self._real_path: | |
622 self._real_path = os.path.realpath(self.full_path) | |
623 return self._real_path | |
624 | |
625 @property | |
626 def size(self): | |
627 """File's size. -1 is not existent.""" | |
628 if self._size is None and not self.tainted: | |
629 try: | |
630 self._size = os.stat(self.full_path).st_size | |
631 except OSError: | |
632 self._size = -1 | |
633 return self._size | |
634 | |
635 def flatten(self): | |
636 """Returns a dict representing this object. | |
637 | |
638 A 'size' of 0 means the file was only touched and not read. | |
639 """ | |
640 return { | |
641 'path': self.path, | |
642 'size': self.size, | |
643 } | |
644 | |
645 def replace_variables(self, variables): | |
646 """Replaces the root of this File with one of the variables if it matches. | |
647 | |
648 If a variable replacement occurs, the cloned object becomes tainted. | |
649 """ | |
650 for variable, root_path in variables.iteritems(): | |
651 if self.path.startswith(root_path): | |
652 return self._clone( | |
653 self.root, variable + self.path[len(root_path):], True) | |
654 # No need to clone, returns ourself. | |
655 return self | |
656 | |
657 def strip_root(self, root): | |
658 """Returns a clone of itself with 'root' stripped off.""" | |
659 # Check internal consistency. | |
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root | |
661 if not self.full_path.startswith(root): | |
662 # Now try to resolve the symlinks to see if it can be reached this way. | |
663 # Only try *after* trying without resolving symlink. | |
664 if not self.real_path.startswith(root): | |
665 return None | |
666 path = self.real_path | |
667 else: | |
668 path = self.full_path | |
669 return self._clone(root, path[len(root):], self.tainted) | |
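# Illustrative only: a File whose full_path is '/src/out/foo.txt', after | |
# strip_root('/src/'), becomes a clone with root '/src/' and path 'out/foo.txt'. | |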
670 | |
671 def _clone(self, new_root, new_path, tainted): | |
672 raise NotImplementedError(self.__class__.__name__) | |
673 | |
674 class File(_TouchedObject): | |
675 """A file that was accessed. May not be present anymore. | |
676 | |
677 If tainted is true, it means it is not a real path anymore because a variable | |
678 replacement occurred. | |
679 | |
680 If touched_only is True, the file was only probed for existence and found to | |
681 exist, but it was never _opened_. In other words, the file must have | |
682 existed. | |
683 """ | |
684 def __init__(self, root, path, tainted, size): | |
685 super(Results.File, self).__init__(root, path, tainted, size, 1) | |
686 | |
687 def _clone(self, new_root, new_path, tainted): | |
688 """Clones itself keeping meta-data.""" | |
689 # Keep the self.size and self._real_path caches for performance reasons. It | |
690 # is also important when the file becomes tainted (with a variable instead | |
691 # of the real path) since self.path is not an on-disk path anymore so | |
692 # out._size cannot be updated. | |
693 out = self.__class__(new_root, new_path, tainted, self.size) | |
694 out._real_path = self._real_path | |
695 return out | |
696 | |
697 class Directory(_TouchedObject): | |
698 """A directory of files. Must exist.""" | |
699 def __init__(self, root, path, tainted, size, nb_files): | |
700 """path='.' is a valid value and must be handled appropriately.""" | |
701 assert not path.endswith(os.path.sep), path | |
702 super(Results.Directory, self).__init__( | |
703 root, path + os.path.sep, tainted, size, nb_files) | |
704 # For a Directory instance, self.size is not a cache, it's an actual value | |
705 # that is never modified and represents the total size of the files | |
706 # contained in this directory. It is possible that the directory is empty | |
707 # so that size == 0; this happens if there's only an invalid symlink in | |
708 # it. | |
709 | |
710 def flatten(self): | |
711 out = super(Results.Directory, self).flatten() | |
712 out['nb_files'] = self.nb_files | |
713 return out | |
714 | |
715 def _clone(self, new_root, new_path, tainted): | |
716 """Clones itself keeping meta-data.""" | |
717 out = self.__class__( | |
718 new_root, | |
719 new_path.rstrip(os.path.sep), | |
720 tainted, | |
721 self.size, | |
722 self.nb_files) | |
723 out._real_path = self._real_path | |
724 return out | |
725 | |
726 class Process(object): | |
727 """A process that was traced. | |
728 | |
729 Contains references to the files accessed by this process and its children. | |
730 """ | |
731 def __init__(self, pid, files, executable, command, initial_cwd, children): | |
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files))) | |
733 self.pid = pid | |
734 self.files = sorted(files, key=lambda x: x.path) | |
735 self.children = children | |
736 self.executable = executable | |
737 self.command = command | |
738 self.initial_cwd = initial_cwd | |
739 | |
740 # Check internal consistency. | |
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted( | |
742 f.path for f in self.files) | |
743 assert isinstance(self.children, list) | |
744 assert isinstance(self.files, list) | |
745 | |
746 @property | |
747 def all(self): | |
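# Depth-first, post-order traversal: children are yielded before self. | |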
748 for child in self.children: | |
749 for i in child.all: | |
750 yield i | |
751 yield self | |
752 | |
753 def flatten(self): | |
754 return { | |
755 'children': [c.flatten() for c in self.children], | |
756 'command': self.command, | |
757 'executable': self.executable, | |
758 'files': [f.flatten() for f in self.files], | |
759 'initial_cwd': self.initial_cwd, | |
760 'pid': self.pid, | |
761 } | |
762 | |
763 def strip_root(self, root): | |
764 assert isabs(root) and root.endswith(os.path.sep), root | |
765 # Loads the files after since they are constructed as objects. | |
766 out = self.__class__( | |
767 self.pid, | |
768 filter(None, (f.strip_root(root) for f in self.files)), | |
769 self.executable, | |
770 self.command, | |
771 self.initial_cwd, | |
772 [c.strip_root(root) for c in self.children]) | |
773 logging.debug( | |
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files))) | |
775 return out | |
776 | |
777 def __init__(self, process): | |
778 self.process = process | |
779 # Cache. | |
780 self._files = None | |
781 | |
782 def flatten(self): | |
783 return { | |
784 'root': self.process.flatten(), | |
785 } | |
786 | |
787 @property | |
788 def files(self): | |
789 if self._files is None: | |
790 self._files = sorted( | |
791 sum((p.files for p in self.process.all), []), | |
792 key=lambda x: x.path) | |
793 return self._files | |
794 | |
795 @property | |
796 def existent(self): | |
797 return [f for f in self.files if f.existent] | |
798 | |
799 @property | |
800 def non_existent(self): | |
801 return [f for f in self.files if not f.existent] | |
802 | |
803 def strip_root(self, root): | |
804 """Returns a clone with all the files outside the directory |root| removed | |
805 and converts all the path to be relative paths. | |
806 """ | |
807 # Resolve any symlink | |
808 root = os.path.realpath(root) | |
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep | |
810 logging.debug('strip_root(%s)' % root) | |
811 return Results(self.process.strip_root(root)) | |
812 | |
813 | |
814 class ApiBase(object): | |
815 """OS-agnostic API to trace a process and its children.""" | |
816 class Context(object): | |
817 """Processes one log line at a time and keeps the list of traced processes. | |
818 | |
819 The parsing is complicated by the fact that logs are traced out of order for | |
820 strace but in order for dtrace and logman. In addition, on Windows it is | |
821 very frequent that process IDs are reused, so a flat list cannot be used. At | |
822 the same time, it is impossible to faithfully construct a graph when the | |
823 logs are processed out of order. So both a tree and a flat mapping are used: | |
824 the tree is the real process tree, while the flat mapping stores the last | |
825 valid process for the corresponding process ID. For the strace case, the | |
826 tree's head is guessed at the last moment. | |
827 """ | |
828 class Process(object): | |
829 """Keeps context for one traced child process. | |
830 | |
831 Logs all the files this process touched. Ignores directories. | |
832 """ | |
833 def __init__(self, blacklist, pid, initial_cwd): | |
834 # Check internal consistency. | |
835 assert isinstance(pid, int), repr(pid) | |
836 self.pid = pid | |
837 # children are Process instances. | |
838 self.children = [] | |
839 self.initial_cwd = initial_cwd | |
840 self.cwd = None | |
841 self.files = set() | |
842 self.only_touched = set() | |
843 self.executable = None | |
844 self.command = None | |
845 self._blacklist = blacklist | |
846 | |
847 def to_results_process(self): | |
848 """Resolves file case sensitivity and or late-bound strings.""" | |
849 # When resolving files, it's normal to get dupe because a file could be | |
850 # opened multiple times with different case. Resolve the deduplication | |
851 # here. | |
852 def fix_path(x): | |
853 """Returns the native file path case. | |
854 | |
855 Converts late-bound strings. | |
856 """ | |
857 if not x: | |
858 # Do not convert None instance to 'None'. | |
859 return x | |
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't | |
861 # know the file path encoding so they must be treated as bytes. | |
862 x = unicode(x) | |
863 if os.path.isabs(x): | |
864 # If the path is not absolute, which tends to happen occasionally on | |
865 # Windows, it is not possible to get the native path case so ignore | |
866 # that trace. It mostly happens for 'executable' value. | |
867 x = get_native_path_case(x) | |
868 return x | |
869 | |
870 def fix_and_blacklist_path(x): | |
871 x = fix_path(x) | |
872 if not x: | |
873 return | |
874 # The blacklist needs to be reapplied, since path casing could | |
875 # influence blacklisting. | |
876 if self._blacklist(x): | |
877 return | |
878 return x | |
879 | |
880 # Filters out directories. Some may have passed through. | |
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f) | |
882 only_touched = set( | |
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f) | |
884 only_touched -= files | |
885 | |
886 files = [ | |
887 Results.File(None, f, False, None) for f in files | |
888 if not os.path.isdir(f) | |
889 ] | |
890 # Using 0 as size means the file's content is ignored since the file was | |
891 # never opened for I/O. | |
892 files.extend( | |
893 Results.File(None, f, False, 0) for f in only_touched | |
894 if not os.path.isdir(f) | |
895 ) | |
896 return Results.Process( | |
897 self.pid, | |
898 files, | |
899 fix_path(self.executable), | |
900 self.command, | |
901 fix_path(self.initial_cwd), | |
902 [c.to_results_process() for c in self.children]) | |
903 | |
904 def add_file(self, filepath, touch_only): | |
905 """Adds a file if it passes the blacklist.""" | |
906 if self._blacklist(unicode(filepath)): | |
907 return | |
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only)) | |
909 # Note that filepath and not unicode(filepath) is added. It is because | |
910 # filepath could be something other than a string, like a RelativePath | |
911 # instance for dtrace logs. | |
912 if touch_only: | |
913 self.only_touched.add(filepath) | |
914 else: | |
915 self.files.add(filepath) | |
916 | |
917 def __init__(self, blacklist): | |
918 self.blacklist = blacklist | |
919 # Initial process. | |
920 self.root_process = None | |
921 # dict to accelerate process lookup, to avoid having to look up the whole | |
922 # graph each time. | |
923 self._process_lookup = {} | |
924 | |
925 class Tracer(object): | |
926 """During it's lifetime, the tracing subsystem is enabled.""" | |
927 def __init__(self, logname): | |
928 self._logname = logname | |
929 self._lock = threading.Lock() | |
930 self._traces = [] | |
931 self._initialized = True | |
932 | |
933 def trace(self, cmd, cwd, tracename, output): | |
934 """Runs the OS-specific trace program on an executable. | |
935 | |
936 Arguments: | |
937 - cmd: The command (a list) to run. | |
938 - cwd: Current directory to start the child process in. | |
939 - tracename: Name of the trace in the logname file. | |
940 - output: If False, redirects output to PIPEs. | |
941 | |
942 Returns a tuple (resultcode, output) and updates the internal trace | |
943 entries. | |
944 """ | |
945 # The implementation adds an item to self._traces. | |
946 raise NotImplementedError(self.__class__.__name__) | |
947 | |
948 def close(self, _timeout=None): | |
949 """Saves the meta-data in the logname file. | |
950 | |
951 For kernel-based tracing, stops the tracing subsystem. | |
952 | |
953 Must not be used manually when using 'with' construct. | |
954 """ | |
955 with self._lock: | |
956 assert self._initialized | |
957 try: | |
958 data = { | |
959 'traces': self._traces, | |
960 } | |
961 write_json(self._logname, data, False) | |
962 finally: | |
963 self._initialized = False | |
964 | |
965 def post_process_log(self): | |
966 """Post-processes the log so it becomes faster to load afterward. | |
967 | |
968 Must not be used manually when using 'with' construct. | |
969 """ | |
970 assert not self._initialized, 'Must stop tracing first.' | |
971 | |
972 def __enter__(self): | |
973 """Enables 'with' statement.""" | |
974 return self | |
975 | |
976 def __exit__(self, exc_type, exc_value, traceback): | |
977 """Enables 'with' statement.""" | |
978 self.close() | |
979 # If an exception was thrown, do not process logs. | |
980 if not exc_type: | |
981 self.post_process_log() | |
982 | |
983 def get_tracer(self, logname): | |
984 """Returns an ApiBase.Tracer instance. | |
985 | |
986 Initializes the tracing subsystem, which is a requirement for kernel-based | |
987 tracers. Only one tracer instance should be live at a time! | |
988 | |
989 logname is the filepath to the json file that will contain the meta-data | |
990 about the logs. | |
991 """ | |
992 return self.Tracer(logname) | |
993 | |
994 @staticmethod | |
995 def clean_trace(logname): | |
996 """Deletes an old log.""" | |
997 raise NotImplementedError() | |
998 | |
999 @classmethod | |
1000 def parse_log(cls, logname, blacklist): | |
1001 """Processes trace logs and returns the files opened and the files that do | |
1002 not exist. | |
1003 | |
1004 It does not track directories. | |
1005 | |
1006 Most of the time, files that do not exist are temporary test files that | |
1007 should be put in /tmp instead. See http://crbug.com/116251. | |
1008 | |
1009 Returns a list of dict with keys: | |
1010 - results: A Results instance. | |
1011 - trace: The corresponding tracename parameter provided to | |
1012 get_tracer().trace(). | |
1013 - output: Output gathered during execution, if get_tracer().trace(..., | |
1014 output=False) was used. | |
1015 """ | |
1016 raise NotImplementedError(cls.__name__) | |
1017 | |
1018 | |
1019 class Strace(ApiBase): | |
1020 """strace implies linux.""" | |
1021 class Context(ApiBase.Context): | |
1022 """Processes a strace log line and keeps the list of existent and non | |
1023 existent files accessed. | |
1024 | |
1025 Ignores directories. | |
1026 | |
1027 Uses late binding to process the cwd of each process. The problem is that | |
1028 strace generates one log file per process it traced but doesn't give any | |
1029 information about which process was started when and by whom, so we don't | |
1030 even know which process is the initial one. The logs are therefore processed | |
1031 out of order and late binding with RelativePath is used to deduce the | |
1032 initial directory of each process once all the logs are parsed. | |
1033 """ | |
1034 class Process(ApiBase.Context.Process): | |
1035 """Represents the state of a process. | |
1036 | |
1037 Contains all the information retrieved from the pid-specific log. | |
1038 """ | |
1039 # Function names are using ([a-z_0-9]+) | |
1040 # This is the most common format. function(args) = result | |
1041 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$') | |
1042 # An interrupted function call, only grab the minimal header. | |
1043 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$') | |
1044 # A resumed function call. | |
1045 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$') | |
1046 # A process received a signal. | |
1047 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---') | |
1048 # A process didn't handle a signal. Ignore any junk appearing before, | |
1049 # because the process was forcibly killed so it won't open any new file. | |
1050 RE_KILLED = re.compile( | |
1051 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$') | |
1052 # The process has exited. | |
1053 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+') | |
1054 # A call was canceled. Ignore any prefix. | |
1055 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$') | |
1056 # Happens when strace fails to even get the function name. | |
1057 UNNAMED_FUNCTION = '????' | |
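# Typical strace lines the above patterns are meant to match (illustrative): | |
#   open("/etc/ld.so.cache", O_RDONLY) = 3 | |
#   <... open resumed> )               = 3 | |
#   +++ exited with 0 +++ | |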
1058 | |
1059 # Corner-case in Python: a decorator applied to class member functions must | |
1060 # not itself be a @staticmethod. | |
1061 def parse_args(regexp, expect_zero): # pylint: disable=E0213 | |
1062 """Automatically convert the str 'args' into a list of processed | |
1063 arguments. | |
1064 | |
1065 Arguments: | |
1066 - regexp is used to parse args. | |
1067 - expect_zero: one of True, False or None. | |
1068 - True: will check for result.startswith('0') first and will ignore | |
1069 the trace line completely otherwise. This is important because for | |
1070 many functions, the regexp will not match if the call failed. | |
1071 - False: will check for not result.startswith(('?', '-1')) for the | |
1072 same reason as with True. | |
1073 - None: ignore result. | |
1074 """ | |
1075 def meta_hook(function): | |
1076 assert function.__name__.startswith('handle_') | |
1077 def hook(self, args, result): | |
1078 if expect_zero is True and not result.startswith('0'): | |
1079 return | |
1080 if expect_zero is False and result.startswith(('?', '-1')): | |
1081 return | |
1082 match = re.match(regexp, args) | |
1083 if not match: | |
1084 raise TracingFailure( | |
1085 'Failed to parse %s(%s) = %s' % | |
1086 (function.__name__[len('handle_'):], args, result), | |
1087 None, None, None) | |
1088 return function(self, match.groups(), result) | |
1089 return hook | |
1090 return meta_hook | |
1091 | |
1092 class RelativePath(object): | |
1093 """A late-bound relative path.""" | |
1094 def __init__(self, parent, value): | |
1095 self.parent = parent | |
1096 self.value = value | |
1097 | |
1098 def render(self): | |
1099 """Returns the current directory this instance is representing. | |
1100 | |
1101 This function is used to return the late-bound value. | |
1102 """ | |
1103 if self.value and self.value.startswith(u'/'): | |
1104 # An absolute path. | |
1105 return self.value | |
1106 parent = self.parent.render() if self.parent else u'<None>' | |
1107 if self.value: | |
1108 return os.path.normpath(os.path.join(parent, self.value)) | |
1109 return parent | |
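# Illustrative only: RelativePath(RelativePath(None, u'/home/user'), u'../tmp') | |
# renders as u'/home/tmp', while RelativePath(None, u'/abs/path') renders | |
# unchanged. | |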
1110 | |
1111 def __unicode__(self): | |
1112 """Acts as a string whenever needed.""" | |
1113 return unicode(self.render()) | |
1114 | |
1115 def __str__(self): | |
1116 """Acts as a string whenever needed.""" | |
1117 return str(self.render()) | |
1118 | |
1119 def __init__(self, root, pid): | |
1120 """Keeps enough information to be able to guess the original process | |
1121 root. | |
1122 | |
1123 strace doesn't store which process was the initial process. So more | |
1124 information needs to be kept so the graph can be reconstructed from the | |
1125 flat map. | |
1126 """ | |
1127 logging.info('%s(%d)' % (self.__class__.__name__, pid)) | |
1128 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None) | |
1129 assert isinstance(root, ApiBase.Context) | |
1130 self._root = weakref.ref(root) | |
1131 # The dict key is the function name of the pending call, like 'open' | |
1132 # or 'execve'. | |
1133 self._pending_calls = {} | |
1134 self._line_number = 0 | |
1135 # Current directory when the process started. | |
1136 self.initial_cwd = self.RelativePath(self._root(), None) | |
1137 self.parentid = None | |
1138 | |
1139 def get_cwd(self): | |
1140 """Returns the best known value of cwd.""" | |
1141 return self.cwd or self.initial_cwd | |
1142 | |
1143 def render(self): | |
1144 """Returns the string value of the RelativePath() object. | |
1145 | |
1146 Used by RelativePath. Returns the initial directory and not the current | |
1147 one, since the validity of the current directory 'cwd' is time-limited. | |
1148 | |
1149 The validity is only guaranteed once all the logs are processed. | |
1150 """ | |
1151 return self.initial_cwd.render() | |
1152 | |
1153 def on_line(self, line): | |
1154 self._line_number += 1 | |
1155 if self.RE_SIGNAL.match(line): | |
1156 # Ignore signals. | |
1157 return | |
1158 | |
1159 try: | |
1160 match = self.RE_KILLED.match(line) | |
1161 if match: | |
1162 # Converts a '+++ killed by Foo +++' trace into an exit_group(). | |
1163 self.handle_exit_group(match.group(1), None) | |
1164 return | |
1165 | |
1166 match = self.RE_PROCESS_EXITED.match(line) | |
1167 if match: | |
1168 # Converts a '+++ exited with 1 +++' trace into an exit_group() | |
1169 self.handle_exit_group(match.group(1), None) | |
1170 return | |
1171 | |
1172 match = self.RE_UNFINISHED.match(line) | |
1173 if match: | |
1174 if match.group(1) in self._pending_calls: | |
1175 raise TracingFailure( | |
1176 'Found two unfinished calls for the same function', | |
1177 None, None, None, | |
1178 self._pending_calls) | |
1179 self._pending_calls[match.group(1)] = ( | |
1180 match.group(1) + match.group(2)) | |
1181 return | |
1182 | |
1183 match = self.RE_UNAVAILABLE.match(line) | |
1184 if match: | |
1185 # This usually means a process was killed and a pending call was | |
1186 # canceled. | |
1187 # TODO(maruel): Look up the last exit_group() trace just above and | |
1188 # make sure any self._pending_calls[anything] is properly flushed. | |
1189 return | |
1190 | |
1191 match = self.RE_RESUMED.match(line) | |
1192 if match: | |
1193 if match.group(1) not in self._pending_calls: | |
1194 raise TracingFailure( | |
1195 'Found a resumed call that was not logged as unfinished', | |
1196 None, None, None, | |
1197 self._pending_calls) | |
1198 pending = self._pending_calls.pop(match.group(1)) | |
1199 # Reconstruct the line. | |
1200 line = pending + match.group(2) | |
1201 | |
1202 match = self.RE_HEADER.match(line) | |
1203 if not match: | |
1204 raise TracingFailure( | |
1205 'Found an invalid line: %s' % line, | |
1206 None, None, None) | |
1207 if match.group(1) == self.UNNAMED_FUNCTION: | |
1208 return | |
1209 | |
1210 # It's a valid line, handle it. | |
1211 handler = getattr(self, 'handle_%s' % match.group(1), None) | |
1212 if not handler: | |
1213 self._handle_unknown(match.group(1), match.group(2), match.group(3)) | |
1214 return handler(match.group(2), match.group(3)) | |
1215 except TracingFailure, e: | |
1216 # Hack in the values since the handler could be a static function. | |
1217 e.pid = self.pid | |
1218 e.line = line | |
1219 e.line_number = self._line_number | |
1220 # Re-raise the modified exception. | |
1221 raise | |
1222 except (KeyError, NotImplementedError, ValueError), e: | |
1223 raise TracingFailure( | |
1224 'Trace generated a %s exception: %s' % ( | |
1225 e.__class__.__name__, str(e)), | |
1226 self.pid, | |
1227 self._line_number, | |
1228 line, | |
1229 e) | |
1230 | |
1231 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True) | |
1232 def handle_access(self, args, _result): | |
1233 self._handle_file(args[0], True) | |
1234 | |
1235 @parse_args(r'^\"(.+?)\"$', True) | |
1236 def handle_chdir(self, args, _result): | |
1237 """Updates cwd.""" | |
1238 self.cwd = self.RelativePath(self, args[0]) | |
1239 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd)) | |
1240 | |
1241 def handle_clone(self, _args, result): | |
1242 """Transfers cwd.""" | |
1243 if result.startswith(('?', '-1')): | |
1244 # The call failed. | |
1245 return | |
1246 # Update the other process right away. | |
1247 childpid = int(result) | |
1248 child = self._root().get_or_set_proc(childpid) | |
1249 if child.parentid is not None or childpid in self.children: | |
1250 raise TracingFailure( | |
1251 'Found internal inconsistency in process lifetime detection ' | |
1252 'during a clone() call', | |
1253 None, None, None) | |
1254 | |
1255 # Copy the cwd object. | |
1256 child.initial_cwd = self.get_cwd() | |
1257 child.parentid = self.pid | |
1258 # It is necessary because the logs are processed out of order. | |
1259 self.children.append(child) | |
1260 | |
1261 def handle_close(self, _args, _result): | |
1262 pass | |
1263 | |
1264 def handle_chmod(self, _args, _result): | |
1265 pass | |
1266 | |
1267 def handle_creat(self, _args, _result): | |
1268 # Ignore files created, since they didn't need to exist. | |
1269 pass | |
1270 | |
1271 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True) | |
1272 def handle_execve(self, args, _result): | |
1273 # Even if in practice execve() doesn't return when it succeeds, strace | |
1274 # still prints '0' as the result. | |
1275 filepath = args[0] | |
1276 self._handle_file(filepath, False) | |
1277 self.executable = self.RelativePath(self.get_cwd(), filepath) | |
1278 self.command = strace_process_quoted_arguments(args[1]) | |
1279 | |
1280 def handle_exit_group(self, _args, _result): | |
1281 """Removes cwd.""" | |
1282 self.cwd = None | |
1283 | |
1284 def handle_fork(self, args, result): | |
1285 self._handle_unknown('fork', args, result) | |
1286 | |
1287 def handle_getcwd(self, _args, _result): | |
1288 pass | |
1289 | |
1290 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
1291 def handle_link(self, args, _result): | |
1292 self._handle_file(args[0], False) | |
1293 self._handle_file(args[1], False) | |
1294 | |
1295 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
1296 def handle_lstat(self, args, _result): | |
1297 self._handle_file(args[0], True) | |
1298 | |
1299 def handle_mkdir(self, _args, _result): | |
1300 pass | |
1301 | |
1302 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
1303 def handle_open(self, args, _result): | |
1304 if 'O_DIRECTORY' in args[1]: | |
1305 return | |
1306 self._handle_file(args[0], False) | |
1307 | |
1308 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
1309 def handle_openat(self, args, _result): | |
1310 if 'O_DIRECTORY' in args[2]: | |
1311 return | |
1312 if args[0] == 'AT_FDCWD': | |
1313 self._handle_file(args[1], False) | |
1314 else: | |
1315 # TODO(maruel): Implement relative open if necessary instead of the | |
1316 # AT_FDCWD flag, let's hope not since this means tracking all active | |
1317 # directory handles. | |
1318 raise Exception('Relative open via openat not implemented.') | |
1319 | |
1320 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False) | |
1321 def handle_readlink(self, args, _result): | |
1322 self._handle_file(args[0], False) | |
1323 | |
1324 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
1325 def handle_rename(self, args, _result): | |
1326 self._handle_file(args[0], False) | |
1327 self._handle_file(args[1], False) | |
1328 | |
1329 def handle_rmdir(self, _args, _result): | |
1330 pass | |
1331 | |
1332 def handle_setxattr(self, _args, _result): | |
1333 pass | |
1334 | |
1335 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
1336 def handle_stat(self, args, _result): | |
1337 self._handle_file(args[0], True) | |
1338 | |
1339 def handle_symlink(self, _args, _result): | |
1340 pass | |
1341 | |
1342 def handle_unlink(self, _args, _result): | |
1343 # In theory, the file had to be created anyway. | |
1344 pass | |
1345 | |
1346 def handle_statfs(self, _args, _result): | |
1347 pass | |
1348 | |
1349 def handle_vfork(self, args, result): | |
1350 self._handle_unknown('vfork', args, result) | |
1351 | |
1352 @staticmethod | |
1353 def _handle_unknown(function, args, result): | |
1354 raise TracingFailure( | |
1355 'Unexpected/unimplemented trace %s(%s)= %s' % | |
1356 (function, args, result), | |
1357 None, None, None) | |
1358 | |
1359 def _handle_file(self, filepath, touch_only): | |
1360 filepath = self.RelativePath(self.get_cwd(), filepath) | |
1361 #assert not touch_only, unicode(filepath) | |
1362 self.add_file(filepath, touch_only) | |
1363 | |
1364 def __init__(self, blacklist, initial_cwd): | |
1365 super(Strace.Context, self).__init__(blacklist) | |
1366 self.initial_cwd = initial_cwd | |
1367 | |
1368 def render(self): | |
1369 """Returns the string value of the initial cwd of the root process. | |
1370 | |
1371 Used by RelativePath. | |
1372 """ | |
1373 return self.initial_cwd | |
1374 | |
1375 def on_line(self, pid, line): | |
1376 """Transfers control into the Process.on_line() function.""" | |
1377 self.get_or_set_proc(pid).on_line(line.strip()) | |
1378 | |
1379 def to_results(self): | |
1380 """Finds back the root process and verify consistency.""" | |
1381 # TODO(maruel): Absolutely unecessary, fix me. | |
1382 root = [p for p in self._process_lookup.itervalues() if not p.parentid] | |
1383 if len(root) != 1: | |
1384 raise TracingFailure( | |
1385 'Found internal inconsistency in process lifetime detection ' | |
1386 'while finding the root process', | |
1387 None, | |
1388 None, | |
1389 None, | |
1390 sorted(p.pid for p in root)) | |
1391 self.root_process = root[0] | |
1392 process = self.root_process.to_results_process() | |
1393 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
1394 raise TracingFailure( | |
1395 'Found internal inconsistency in process lifetime detection ' | |
1396 'while looking for len(tree) == len(list)', | |
1397 None, | |
1398 None, | |
1399 None, | |
1400 sorted(self._process_lookup), | |
1401 sorted(p.pid for p in process.all)) | |
1402 return Results(process) | |
1403 | |
1404 def get_or_set_proc(self, pid): | |
1405 """Returns the Context.Process instance for this pid or creates a new one. | |
1406 """ | |
1407 if not pid or not isinstance(pid, int): | |
1408 raise TracingFailure( | |
1409 'Unexpected value for pid: %r' % pid, | |
1410 pid, | |
1411 None, | |
1412 None, | |
1413 pid) | |
1414 if pid not in self._process_lookup: | |
1415 self._process_lookup[pid] = self.Process(self, pid) | |
1416 return self._process_lookup[pid] | |
1417 | |
1418 @classmethod | |
1419 def traces(cls): | |
1420 """Returns the list of all handled traces to pass this as an argument to | |
1421 strace. | |
1422 """ | |
1423 prefix = 'handle_' | |
1424 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)] | |
1425 | |
1426 class Tracer(ApiBase.Tracer): | |
1427 MAX_LEN = 256 | |
1428 | |
1429 def trace(self, cmd, cwd, tracename, output): | |
1430 """Runs strace on an executable.""" | |
1431 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
1432 assert os.path.isabs(cmd[0]), cmd[0] | |
1433 assert os.path.isabs(cwd), cwd | |
1434 assert os.path.normpath(cwd) == cwd, cwd | |
1435 with self._lock: | |
1436 if not self._initialized: | |
1437 raise TracingFailure( | |
1438 'Called Tracer.trace() on an uninitialized object', | |
1439 None, None, None, tracename) | |
1440 assert tracename not in (i['trace'] for i in self._traces) | |
1441 stdout = stderr = None | |
1442 if output: | |
1443 stdout = subprocess.PIPE | |
1444 stderr = subprocess.STDOUT | |
1445 # Ensure all file related APIs are hooked. | |
1446 traces = ','.join(Strace.Context.traces() + ['file']) | |
1447 trace_cmd = [ | |
1448 'strace', | |
1449 '-ff', | |
1450 '-s', '%d' % self.MAX_LEN, | |
1451 '-e', 'trace=%s' % traces, | |
1452 '-o', self._logname + '.' + tracename, | |
1453 ] | |
1454 child = subprocess.Popen( | |
1455 trace_cmd + cmd, | |
1456 cwd=cwd, | |
1457 stdin=subprocess.PIPE, | |
1458 stdout=stdout, | |
1459 stderr=stderr) | |
1460 out = child.communicate()[0] | |
1461 # TODO(maruel): Walking the logs to figure out the root process would | |
1462 # simplify parsing the logs a *lot*. | |
1463 with self._lock: | |
1464 assert tracename not in (i['trace'] for i in self._traces) | |
1465 self._traces.append( | |
1466 { | |
1467 'cmd': cmd, | |
1468 'cwd': cwd, | |
1469 # The pid of strace process, not very useful. | |
1470 'pid': child.pid, | |
1471 'trace': tracename, | |
1472 'output': out, | |
1473 }) | |
1474 return child.returncode, out | |
1475 | |
1476 @staticmethod | |
1477 def clean_trace(logname): | |
1478 if os.path.isfile(logname): | |
1479 os.remove(logname) | |
1480 # Also delete any pid specific file from previous traces. | |
1481 for i in glob.iglob(logname + '.*'): | |
1482 if i.rsplit('.', 1)[1].isdigit(): | |
1483 os.remove(i) | |
1484 | |
1485 @classmethod | |
1486 def parse_log(cls, logname, blacklist): | |
1487 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | |
1488 data = read_json(logname) | |
1489 out = [] | |
1490 for item in data['traces']: | |
1491 result = { | |
1492 'trace': item['trace'], | |
1493 'output': item['output'], | |
1494 } | |
1495 try: | |
1496 context = cls.Context(blacklist, item['cwd']) | |
1497 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])): | |
1498 pid = pidfile.rsplit('.', 1)[1] | |
1499 if pid.isdigit(): | |
1500 pid = int(pid) | |
1501 # TODO(maruel): Load as utf-8 | |
1502 for line in open(pidfile, 'rb'): | |
1503 context.on_line(pid, line) | |
1504 result['results'] = context.to_results() | |
1505 except TracingFailure, e: | |
1506 result['exception'] = e | |
1507 out.append(result) | |
1508 return out | |
1509 | |
1510 | |
1511 class Dtrace(ApiBase): | |
1512 """Uses DTrace framework through dtrace. Requires root access. | |
1513 | |
1514 Implies Mac OSX. | |
1515 | |
1516 dtruss can't be used because it has compatibility issues with python. | |
1517 | |
1518 Also, the pid->cwd handling needs to be done manually since OSX has no way to | |
1519 get the absolute path of the 'cwd' dtrace variable from the probe. | |
1520 | |
1521 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see | |
1522 https://discussions.apple.com/thread/1980539. So resort to handling execve() | |
1523 manually. | |
1524 | |
1525 errno is not printed in the log since this implementation currently only cares | |
1526 about files that were successfully opened. | |
1527 """ | |
1528 class Context(ApiBase.Context): | |
1529 # Format: index pid function(args) | |
1530 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$') | |
1531 | |
1532 # Arguments parsing. | |
1533 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$') | |
1534 RE_CHDIR = re.compile(r'^\"(.+?)\"$') | |
1535 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$') | |
1536 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$') | |
1537 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$') | |
1538 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$') | |
1539 | |
1540 O_DIRECTORY = 0x100000 | |
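# Illustrative sample of a dtrace log line parsed by RE_HEADER (and then | |
# RE_OPEN for the arguments): | |
#   0 1234 open("/etc/hosts", 0x0, 0x1b6) | |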
1541 | |
1542 class Process(ApiBase.Context.Process): | |
1543 def __init__(self, *args): | |
1544 super(Dtrace.Context.Process, self).__init__(*args) | |
1545 self.cwd = self.initial_cwd | |
1546 | |
1547 def __init__(self, blacklist, tracer_pid, initial_cwd): | |
1548 logging.info( | |
1549 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd)) | |
1550 super(Dtrace.Context, self).__init__(blacklist) | |
1551 # Process ID of the temporary script created by create_thunk(). | |
1552 self._tracer_pid = tracer_pid | |
1553 self._initial_cwd = initial_cwd | |
1554 self._line_number = 0 | |
1555 | |
1556 def on_line(self, line): | |
1557 self._line_number += 1 | |
1558 match = self.RE_HEADER.match(line) | |
1559 if not match: | |
1560 raise TracingFailure( | |
1561 'Found malformed line: %s' % line, | |
1562 None, | |
1563 self._line_number, | |
1564 line) | |
1565 fn = getattr( | |
1566 self, | |
1567 'handle_%s' % match.group(2).replace('-', '_'), | |
1568 self._handle_ignored) | |
1569 # It is guaranteed to succeed because of the regexp. Or at least I thought | |
1570 # it would. | |
1571 pid = int(match.group(1)) | |
1572 try: | |
1573 return fn(pid, match.group(3)) | |
1574 except TracingFailure, e: | |
1575 # Hack in the values since the handler could be a static function. | |
1576 e.pid = pid | |
1577 e.line = line | |
1578 e.line_number = self._line_number | |
1579 # Re-raise the modified exception. | |
1580 raise | |
1581 except (KeyError, NotImplementedError, ValueError), e: | |
1582 raise TracingFailure( | |
1583 'Trace generated a %s exception: %s' % ( | |
1584 e.__class__.__name__, str(e)), | |
1585 pid, | |
1586 self._line_number, | |
1587 line, | |
1588 e) | |
1589 | |
1590 def to_results(self): | |
1591 process = self.root_process.to_results_process() | |
1592 # Internal consistency check. | |
1593 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
1594 raise TracingFailure( | |
1595 'Found internal inconsistency in process lifetime detection ' | |
1596 'while looking for len(tree) == len(list)', | |
1597 None, | |
1598 None, | |
1599 None, | |
1600 sorted(self._process_lookup), | |
1601 sorted(p.pid for p in process.all)) | |
1602 return Results(process) | |
1603 | |
1604 def handle_dtrace_BEGIN(self, _pid, args): | |
1605 if not self.RE_DTRACE_BEGIN.match(args): | |
1606 raise TracingFailure( | |
1607 'Found internal inconsistency in dtrace_BEGIN log line', | |
1608 None, None, None) | |
1609 | |
1610 def handle_proc_start(self, pid, args): | |
1611 """Transfers cwd. | |
1612 | |
1613 The dtrace script already takes care of only tracing the processes that | |
1614 are children of the traced processes so there is no need to verify the | |
1615 process hierarchy. | |
1616 """ | |
1617 if pid in self._process_lookup: | |
1618 raise TracingFailure( | |
1619 'Found internal inconsistency in proc_start: %d started twice' % | |
1620 pid, | |
1621 None, None, None) | |
1622 match = self.RE_PROC_START.match(args) | |
1623 if not match: | |
1624 raise TracingFailure( | |
1625 'Failed to parse arguments: %s' % args, | |
1626 None, None, None) | |
1627 ppid = int(match.group(1)) | |
1628 if ppid == self._tracer_pid and not self.root_process: | |
1629 proc = self.root_process = self.Process( | |
1630 self.blacklist, pid, self._initial_cwd) | |
1631 elif ppid in self._process_lookup: | |
1632 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd) | |
1633 self._process_lookup[ppid].children.append(proc) | |
1634 else: | |
1635 # Another process tree, ignore. | |
1636 return | |
1637 self._process_lookup[pid] = proc | |
1638 logging.debug( | |
1639 'New child: %s -> %d cwd:%s' % | |
1640 (ppid, pid, unicode(proc.initial_cwd))) | |
1641 | |
1642 def handle_proc_exit(self, pid, _args): | |
1643 """Removes cwd.""" | |
1644 if pid in self._process_lookup: | |
1645 # self._tracer_pid is not traced itself and neither are other trace runs. | |
1646 self._process_lookup[pid].cwd = None | |
1647 | |
1648 def handle_execve(self, pid, args): | |
1649 """Sets the process' executable. | |
1650 | |
1651 TODO(maruel): Read command line arguments. See | |
1652 https://discussions.apple.com/thread/1980539 for an example. | |
1653 https://gist.github.com/1242279 | |
1654 | |
1655 Will have to put the answer at http://stackoverflow.com/questions/7556249. | |
1656 :) | |
1657 """ | |
1658 if pid not in self._process_lookup: | |
1659 # Another process tree, ignore. | |
1660 return | |
1661 match = self.RE_EXECVE.match(args) | |
1662 if not match: | |
1663 raise TracingFailure( | |
1664 'Failed to parse arguments: %r' % args, | |
1665 None, None, None) | |
1666 proc = self._process_lookup[pid] | |
1667 proc.executable = match.group(1) | |
1668 proc.command = self.process_escaped_arguments(match.group(3)) | |
1669 if int(match.group(2)) != len(proc.command): | |
1670 raise TracingFailure( | |
1671 'Failed to parse execve() arguments: %s' % args, | |
1672 None, None, None) | |
1673 | |
1674 def handle_chdir(self, pid, args): | |
1675 """Updates cwd.""" | |
1676 if pid not in self._process_lookup: | |
1677 # Another process tree, ignore. | |
1678 return | |
1679 cwd = self.RE_CHDIR.match(args).group(1) | |
1680 if not cwd.startswith('/'): | |
1681 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd) | |
1682 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2)) | |
1683 else: | |
1684 logging.debug('handle_chdir(%d, %s)' % (pid, cwd)) | |
1685 cwd2 = cwd | |
1686 self._process_lookup[pid].cwd = cwd2 | |
1687 | |
1688 def handle_open_nocancel(self, pid, args): | |
1689 """Redirects to handle_open().""" | |
1690 return self.handle_open(pid, args) | |
1691 | |
1692 def handle_open(self, pid, args): | |
1693 if pid not in self._process_lookup: | |
1694 # Another process tree, ignore. | |
1695 return | |
1696 match = self.RE_OPEN.match(args) | |
1697 if not match: | |
1698 raise TracingFailure( | |
1699 'Failed to parse arguments: %s' % args, | |
1700 None, None, None) | |
1701 flag = int(match.group(2), 16) | |
1702 if self.O_DIRECTORY & flag == self.O_DIRECTORY: | |
1703 # Ignore directories. | |
1704 return | |
1705 self._handle_file(pid, match.group(1)) | |
1706 | |
1707 def handle_rename(self, pid, args): | |
1708 if pid not in self._process_lookup: | |
1709 # Another process tree, ignore. | |
1710 return | |
1711 match = self.RE_RENAME.match(args) | |
1712 if not match: | |
1713 raise TracingFailure( | |
1714 'Failed to parse arguments: %s' % args, | |
1715 None, None, None) | |
1716 self._handle_file(pid, match.group(1)) | |
1717 self._handle_file(pid, match.group(2)) | |
1718 | |
1719 def _handle_file(self, pid, filepath): | |
1720 if not filepath.startswith('/'): | |
1721 filepath = os.path.join(self._process_lookup[pid].cwd, filepath) | |
1722 # We can get '..' in the path. | |
1723 filepath = os.path.normpath(filepath) | |
1724 # Sadly, still need to filter out directories here; | |
1725 # saw open_nocancel(".", 0, 0) = 0 lines. | |
1726 if os.path.isdir(filepath): | |
1727 return | |
1728 self._process_lookup[pid].add_file(filepath, False) | |
1729 | |
1730 def handle_ftruncate(self, pid, args): | |
1731 """Just used as a signal to kill dtrace, ignoring.""" | |
1732 pass | |
1733 | |
1734 @staticmethod | |
1735 def _handle_ignored(pid, args): | |
1736 """Is called for all the event traces that are not handled.""" | |
1737 raise NotImplementedError('Please implement me') | |
1738 | |
1739 @staticmethod | |
1740 def process_escaped_arguments(text): | |
1741 """Extracts escaped arguments on a string and return the arguments as a | |
1742 list. | |
1743 | |
1744 Implemented as an automaton. | |
1745 | |
1746 Example: | |
1747 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the | |
1748 function will return ['python2.7', '-c', 'print("hi")']. | |
1749 """ | |
1750 if not text.endswith('\\0'): | |
1751 raise ValueError('String is not null terminated: %r' % text, text) | |
1752 text = text[:-2] | |
1753 | |
1754 def unescape(x): | |
1755 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?.""" | |
1756 out = [] | |
1757 escaped = False | |
1758 for i in x: | |
1759 if i == '\\' and not escaped: | |
1760 escaped = True | |
1761 continue | |
1762 escaped = False | |
1763 out.append(i) | |
1764 return ''.join(out) | |
1765 | |
1766 return [unescape(i) for i in text.split('\\001')] | |
1767 | |
1768 class Tracer(ApiBase.Tracer): | |
1769 # pylint: disable=C0301 | |
1770 # | |
1771 # To understand the following code, you'll want to take a look at: | |
1772 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html | |
1773 # https://wikis.oracle.com/display/DTrace/Variables | |
1774 # http://docs.oracle.com/cd/E19205-01/820-4221/ | |
1775 # | |
1776 # 0. Dump all the valid probes into a text file. It is important to | |
1777 # redirect into a file so you don't have to constantly 'sudo'. | |
1778 # $ sudo dtrace -l > probes.txt | |
1779 # | |
1780 # 1. Count the number of probes: | |
1781 # $ wc -l probes.txt | |
1782 # 81823 # On OSX 10.7, including 1 header line. | |
1783 # | |
1784 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the | |
1785 # like, and skipping the header with NR>1: | |
1786 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]' | |
1787 # dtrace | |
1788 # fbt | |
1789 # io | |
1790 # ip | |
1791 # lockstat | |
1792 # mach_trap | |
1793 # proc | |
1794 # profile | |
1795 # sched | |
1796 # syscall | |
1797 # tcp | |
1798 # vminfo | |
1799 # | |
1800 # 3. List of valid probes: | |
1801 # $ grep syscall probes.txt | less | |
1802 # or use dtrace directly: | |
1803 # $ sudo dtrace -l -P syscall | less | |
1804 # | |
1805 # trackedpid is an associative array where its value can be 0, 1 or 2. | |
1806 # 0 is for untracked processes and is the default value for items not | |
1807 # in the associative array. | |
1808 # 1 is for tracked processes. | |
1809 # 2 is for the script created by create_thunk() only. It is not tracked | |
1810 # itself but all its descendants are. | |
1811 # | |
1812 # The script will kill itself only once waiting_to_die == 1 and | |
1813 # current_processes == 0, i.e. once getlogin() has been called and | |
1814 # all traced processes have exited. | |
1815 # | |
1816 # TODO(maruel): Use cacheable predicates. See | |
1817 # https://wikis.oracle.com/display/DTrace/Performance+Considerations | |
1818 D_CODE = """ | |
1819 dtrace:::BEGIN { | |
1820 waiting_to_die = 0; | |
1821 current_processes = 0; | |
1822 logindex = 0; | |
1823 printf("%d %d %s_%s(\\"%s\\")\\n", | |
1824 logindex, PID, probeprov, probename, SCRIPT); | |
1825 logindex++; | |
1826 } | |
1827 | |
1828 proc:::start /trackedpid[ppid]/ { | |
1829 trackedpid[pid] = 1; | |
1830 current_processes += 1; | |
1831 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
1832 logindex, pid, probeprov, probename, | |
1833 ppid, | |
1834 execname, | |
1835 current_processes); | |
1836 logindex++; | |
1837 } | |
1838 /* Should use SCRIPT but there is no access to this variable at that | |
1839 * point. */ | |
1840 proc:::start /ppid == PID && execname == "Python"/ { | |
1841 trackedpid[pid] = 2; | |
1842 current_processes += 1; | |
1843 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
1844 logindex, pid, probeprov, probename, | |
1845 ppid, | |
1846 execname, | |
1847 current_processes); | |
1848 logindex++; | |
1849 } | |
1850 proc:::exit /trackedpid[pid] && | |
1851 current_processes == 1 && | |
1852 waiting_to_die == 1/ { | |
1853 trackedpid[pid] = 0; | |
1854 current_processes -= 1; | |
1855 printf("%d %d %s_%s(%d)\\n", | |
1856 logindex, pid, probeprov, probename, | |
1857 current_processes); | |
1858 logindex++; | |
1859 exit(0); | |
1860 } | |
1861 proc:::exit /trackedpid[pid]/ { | |
1862 trackedpid[pid] = 0; | |
1863 current_processes -= 1; | |
1864 printf("%d %d %s_%s(%d)\\n", | |
1865 logindex, pid, probeprov, probename, | |
1866 current_processes); | |
1867 logindex++; | |
1868 } | |
1869 | |
1870 /* Use an arcane function to detect when we need to die */ | |
1871 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ { | |
1872 waiting_to_die = 1; | |
1873 printf("%d %d %s()\\n", logindex, pid, probefunc); | |
1874 logindex++; | |
1875 } | |
1876 syscall::ftruncate:entry / | |
1877 pid == PID && arg0 == FILE_ID && current_processes == 0/ { | |
1878 exit(0); | |
1879 } | |
1880 | |
1881 syscall::open*:entry /trackedpid[pid] == 1/ { | |
1882 self->open_arg0 = arg0; | |
1883 self->open_arg1 = arg1; | |
1884 self->open_arg2 = arg2; | |
1885 } | |
1886 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ { | |
1887 this->open_arg0 = copyinstr(self->open_arg0); | |
1888 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n", | |
1889 logindex, pid, probefunc, | |
1890 this->open_arg0, | |
1891 self->open_arg1, | |
1892 self->open_arg2); | |
1893 logindex++; | |
1894 this->open_arg0 = 0; | |
1895 } | |
1896 syscall::open*:return /trackedpid[pid] == 1/ { | |
1897 self->open_arg0 = 0; | |
1898 self->open_arg1 = 0; | |
1899 self->open_arg2 = 0; | |
1900 } | |
1901 | |
1902 syscall::rename:entry /trackedpid[pid] == 1/ { | |
1903 self->rename_arg0 = arg0; | |
1904 self->rename_arg1 = arg1; | |
1905 } | |
1906 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ { | |
1907 this->rename_arg0 = copyinstr(self->rename_arg0); | |
1908 this->rename_arg1 = copyinstr(self->rename_arg1); | |
1909 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n", | |
1910 logindex, pid, probefunc, | |
1911 this->rename_arg0, | |
1912 this->rename_arg1); | |
1913 logindex++; | |
1914 this->rename_arg0 = 0; | |
1915 this->rename_arg1 = 0; | |
1916 } | |
1917 syscall::rename:return /trackedpid[pid] == 1/ { | |
1918 self->rename_arg0 = 0; | |
1919 self->rename_arg1 = 0; | |
1920 } | |
1921 | |
1922 /* Track chdir, it's painful because it is only receiving relative path. | |
1923 */ | |
1924 syscall::chdir:entry /trackedpid[pid] == 1/ { | |
1925 self->chdir_arg0 = arg0; | |
1926 } | |
1927 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ { | |
1928 this->chdir_arg0 = copyinstr(self->chdir_arg0); | |
1929 printf("%d %d %s(\\"%s\\")\\n", | |
1930 logindex, pid, probefunc, | |
1931 this->chdir_arg0); | |
1932 logindex++; | |
1933 this->chdir_arg0 = 0; | |
1934 } | |
1935 syscall::chdir:return /trackedpid[pid] == 1/ { | |
1936 self->chdir_arg0 = 0; | |
1937 } | |
1938 """ | |
1939 | |
1940 # execve-specific code, tends to throw a lot of exceptions. | |
1941 D_CODE_EXECVE = """ | |
1942 /* Finally what we care about! */ | |
1943 syscall::exec*:entry /trackedpid[pid]/ { | |
1944 self->exec_arg0 = copyinstr(arg0); | |
1945 /* Incrementally probe for a NULL in the argv parameter of execve() to | |
1946 * figure out argc. */ | |
1947 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was | |
1948 * found. */ | |
1949 self->exec_argc = 0; | |
1950 /* Probe for argc==1 */ | |
1951 this->exec_argv = (user_addr_t*)copyin( | |
1952 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1953 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1954 (self->exec_argc + 1) : self->exec_argc; | |
1955 | |
1956 /* Probe for argc==2 */ | |
1957 this->exec_argv = (user_addr_t*)copyin( | |
1958 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1959 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1960 (self->exec_argc + 1) : self->exec_argc; | |
1961 | |
1962 /* Probe for argc==3 */ | |
1963 this->exec_argv = (user_addr_t*)copyin( | |
1964 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1965 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1966 (self->exec_argc + 1) : self->exec_argc; | |
1967 | |
1968 /* Probe for argc==4 */ | |
1969 this->exec_argv = (user_addr_t*)copyin( | |
1970 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1971 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1972 (self->exec_argc + 1) : self->exec_argc; | |
1973 | |
1974 /* Copy the inputs strings since there is no guarantee they'll be | |
1975 * present after the call completed. */ | |
1976 self->exec_argv0 = (self->exec_argc > 0) ? | |
1977 copyinstr(this->exec_argv[0]) : ""; | |
1978 self->exec_argv1 = (self->exec_argc > 1) ? | |
1979 copyinstr(this->exec_argv[1]) : ""; | |
1980 self->exec_argv2 = (self->exec_argc > 2) ? | |
1981 copyinstr(this->exec_argv[2]) : ""; | |
1982 self->exec_argv3 = (self->exec_argc > 3) ? | |
1983 copyinstr(this->exec_argv[3]) : ""; | |
1984 this->exec_argv = 0; | |
1985 } | |
1986 syscall::exec*:return /trackedpid[pid] && errno == 0/ { | |
1987 /* We need to join strings here, as using multiple printf() would | |
1988 * cause tearing when multiple threads/processes are traced. | |
1989 * Since it is impossible to escape a string and join it to another one, | |
1990 * like sprintf("%s%S", previous, more), use hackery. | |
1991 * The elements are separated with a \\1. \\0 cannot be used because | |
1992 * it is simply ignored. This will conflict with any program putting a | |
1993 * \\1 in their execve() string but this should be "rare enough" */ | |
1994 this->args = ""; | |
1995 /* Process exec_argv[0] */ | |
1996 this->args = strjoin( | |
1997 this->args, (self->exec_argc > 0) ? self->exec_argv0 : ""); | |
1998 | |
1999 /* Process exec_argv[1] */ | |
2000 this->args = strjoin( | |
2001 this->args, (self->exec_argc > 1) ? "\\1" : ""); | |
2002 this->args = strjoin( | |
2003 this->args, (self->exec_argc > 1) ? self->exec_argv1 : ""); | |
2004 | |
2005 /* Process exec_argv[2] */ | |
2006 this->args = strjoin( | |
2007 this->args, (self->exec_argc > 2) ? "\\1" : ""); | |
2008 this->args = strjoin( | |
2009 this->args, (self->exec_argc > 2) ? self->exec_argv2 : ""); | |
2010 | |
2011 /* Process exec_argv[3] */ | |
2012 this->args = strjoin( | |
2013 this->args, (self->exec_argc > 3) ? "\\1" : ""); | |
2014 this->args = strjoin( | |
2015 this->args, (self->exec_argc > 3) ? self->exec_argv3 : ""); | |
2016 | |
2017 /* Prints self->exec_argc to permit verifying the internal | |
2018 * consistency since this code is quite fishy. */ | |
2019 printf("%d %d %s(\\"%s\\", [%d, %S])\\n", | |
2020 logindex, pid, probefunc, | |
2021 self->exec_arg0, | |
2022 self->exec_argc, | |
2023 this->args); | |
2024 logindex++; | |
2025 this->args = 0; | |
2026 } | |
2027 syscall::exec*:return /trackedpid[pid]/ { | |
2028 self->exec_arg0 = 0; | |
2029 self->exec_argc = 0; | |
2030 self->exec_argv0 = 0; | |
2031 self->exec_argv1 = 0; | |
2032 self->exec_argv2 = 0; | |
2033 self->exec_argv3 = 0; | |
2034 } | |
2035 """ | |
2036 | |
2037 # Code currently not used. | |
2038 D_EXTRANEOUS = """ | |
2039 /* This is a good learning experience, since it traces a lot of things | |
2040 * related to the process and child processes. | |
2041 * Warning: it generates a gigantic log. For example, tracing | |
2042 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes | |
2043 * several minutes to execute. | |
2044 */ | |
2045 /* | |
2046 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2047 printf("%d %d %s_%s() = %d\\n", | |
2048 logindex, pid, probeprov, probefunc, errno); | |
2049 logindex++; | |
2050 } | |
2051 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2052 printf("%d %d %s_%s() = %d\\n", | |
2053 logindex, pid, probeprov, probefunc, errno); | |
2054 logindex++; | |
2055 } | |
2056 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2057 printf("%d %d %s_%s() = %d\\n", | |
2058 logindex, pid, probeprov, probefunc, errno); | |
2059 logindex++; | |
2060 } | |
2061 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2062 printf("%d %d %s_%s() = %d\\n", | |
2063 logindex, pid, probeprov, probefunc, errno); | |
2064 logindex++; | |
2065 } | |
2066 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2067 printf("%d %d %s_%s() = %d\\n", | |
2068 logindex, pid, probeprov, probefunc, errno); | |
2069 logindex++; | |
2070 } | |
2071 */ | |
2072 /* TODO(maruel): *stat* functions and friends | |
2073 syscall::access:return, | |
2074 syscall::chdir:return, | |
2075 syscall::chflags:return, | |
2076 syscall::chown:return, | |
2077 syscall::chroot:return, | |
2078 syscall::getattrlist:return, | |
2079 syscall::getxattr:return, | |
2080 syscall::lchown:return, | |
2081 syscall::lstat64:return, | |
2082 syscall::lstat:return, | |
2083 syscall::mkdir:return, | |
2084 syscall::pathconf:return, | |
2085 syscall::readlink:return, | |
2086 syscall::removexattr:return, | |
2087 syscall::setxattr:return, | |
2088 syscall::stat64:return, | |
2089 syscall::stat:return, | |
2090 syscall::truncate:return, | |
2091 syscall::unlink:return, | |
2092 syscall::utimes:return, | |
2093 */ | |
2094 """ | |
2095 | |
2096 def __init__(self, logname): | |
2097 """Starts the log collection with dtrace. | |
2098 | |
2099 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so | |
2100 this needs to wait for dtrace to be "warmed up". | |
2101 """ | |
2102 super(Dtrace.Tracer, self).__init__(logname) | |
2103 self._script = create_thunk() | |
2104 # This unique dummy temp file is used to signal the dtrace script that it | |
2105 # should stop as soon as all the child processes are done. A bit hackish | |
2106 # but it works well enough. | |
2107 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp( | |
2108 prefix='trace_signal_file') | |
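# The D code watches for ftruncate(FILE_ID) issued by this process (see the
# syscall::ftruncate probes in D_CODE); close() performs that ftruncate() to
# tell dtrace it can exit once all traced processes are done.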
2109 | |
2110 # Note: do not use the -p flag. It's useless if the initial process quits | |
2111 # too fast, resulting in missing traces from the grand-children. The D | |
2112 # code manages the dtrace lifetime itself. | |
2113 trace_cmd = [ | |
2114 'sudo', | |
2115 'dtrace', | |
2116 # Use a larger buffer if getting 'out of scratch space' errors. | |
2117 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables | |
2118 '-b', '10m', | |
2119 '-x', 'dynvarsize=10m', | |
2120 #'-x', 'dtrace_global_maxsize=1m', | |
2121 '-x', 'evaltime=exec', | |
2122 '-o', '/dev/stderr', | |
2123 '-q', | |
2124 '-n', self._get_dtrace_code(), | |
2125 ] | |
2126 with open(self._logname + '.log', 'wb') as logfile: | |
2127 self._dtrace = subprocess.Popen( | |
2128 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) | |
2129 logging.debug('Started dtrace pid: %d' % self._dtrace.pid) | |
2130 | |
2131 # Reads until one line is printed, which signifies dtrace is up and ready. | |
2132 with open(self._logname + '.log', 'rb') as logfile: | |
2133 while 'dtrace_BEGIN' not in logfile.readline(): | |
2134 if self._dtrace.poll() is not None: | |
2135 # Do a busy wait. :/ | |
2136 break | |
2137 logging.debug('dtrace started') | |
2138 | |
2139 def _get_dtrace_code(self): | |
2140 """Setups the D code to implement child process tracking. | |
2141 | |
2142 Injects the cookie in the script so it knows when to stop. | |
2143 | |
2144 The script will detect any instance of the script created with | |
2145 create_thunk() and will start tracing it. | |
2146 """ | |
2147 return ( | |
2148 'inline int PID = %d;\n' | |
2149 'inline string SCRIPT = "%s";\n' | |
2150 'inline int FILE_ID = %d;\n' | |
2151 '\n' | |
2152 '%s') % ( | |
2153 os.getpid(), | |
2154 self._script, | |
2155 self._dummy_file_id, | |
2156 self.D_CODE) + self.D_CODE_EXECVE | |
2157 | |
2158 def trace(self, cmd, cwd, tracename, output): | |
2159 """Runs dtrace on an executable. | |
2160 | |
2161 Since dtruss is broken when it starts the process itself or when tracing | |
2162 child processes, this code starts a wrapper process | |
2163 generated with create_thunk(), which starts the executable to trace. | |
2164 """ | |
2165 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
2166 assert os.path.isabs(cmd[0]), cmd[0] | |
2167 assert os.path.isabs(cwd), cwd | |
2168 assert os.path.normpath(cwd) == cwd, cwd | |
2169 with self._lock: | |
2170 if not self._initialized: | |
2171 raise TracingFailure( | |
2172 'Called Tracer.trace() on an uninitialized object', | |
2173 None, None, None, tracename) | |
2174 assert tracename not in (i['trace'] for i in self._traces) | |
2175 | |
2176 # Starts the script wrapper to start the child process. This signals the | |
2177 # dtrace script that this process is to be traced. | |
2178 stdout = stderr = None | |
2179 if output: | |
2180 stdout = subprocess.PIPE | |
2181 stderr = subprocess.STDOUT | |
2182 child_cmd = [ | |
2183 sys.executable, | |
2184 self._script, | |
2185 tracename, | |
2186 ] | |
2187 # Call a dummy function so that dtrace knows I'm about to launch a process | |
2188 # that needs to be traced. | |
2189 # Yummy. | |
2190 child = subprocess.Popen( | |
2191 child_cmd + fix_python_path(cmd), | |
2192 stdin=subprocess.PIPE, | |
2193 stdout=stdout, | |
2194 stderr=stderr, | |
2195 cwd=cwd) | |
2196 logging.debug('Started child pid: %d' % child.pid) | |
2197 | |
2198 out = child.communicate()[0] | |
2199 # This doesn't mean tracing is done; one of the grand-child processes may | |
2200 # still be alive. It will be tracked with the dtrace script. | |
2201 | |
2202 with self._lock: | |
2203 assert tracename not in (i['trace'] for i in self._traces) | |
2204 self._traces.append( | |
2205 { | |
2206 'cmd': cmd, | |
2207 'cwd': cwd, | |
2208 # The pid of the wrapper script process, not very useful. | |
2209 'pid': child.pid, | |
2210 'trace': tracename, | |
2211 'output': out, | |
2212 }) | |
2213 return child.returncode, out | |
2214 | |
2215 def close(self, timeout=None): | |
2216 """Terminates dtrace.""" | |
2217 logging.debug('close(%s)' % timeout) | |
2218 try: | |
2219 try: | |
2220 super(Dtrace.Tracer, self).close(timeout) | |
2221 # Signal dtrace that it should stop now. | |
2222 os.ftruncate(self._dummy_file_id, 0) | |
2223 if timeout: | |
2224 start = time.time() | |
2225 # Use polling. :/ | |
2226 while (self._dtrace.poll() is None and | |
2227 (time.time() - start) < timeout): | |
2228 time.sleep(0.1) | |
2229 self._dtrace.kill() | |
2230 self._dtrace.wait() | |
2231 finally: | |
2232 # Make sure to kill it in any case. | |
2233 if self._dtrace.poll() is None: | |
2234 try: | |
2235 self._dtrace.kill() | |
2236 self._dtrace.wait() | |
2237 except OSError: | |
2238 pass | |
2239 | |
2240 if self._dtrace.returncode != 0: | |
2241 # Warn about any dtrace failure but basically ignore it. | |
2242 print 'dtrace failure: %s' % self._dtrace.returncode | |
2243 finally: | |
2244 os.close(self._dummy_file_id) | |
2245 os.remove(self._dummy_file_name) | |
2246 os.remove(self._script) | |
2247 | |
2248 def post_process_log(self): | |
2249 """Sorts the log back in order when each call occured. | |
2250 | |
2251 dtrace doesn't save the buffer in strict order since it keeps one buffer | |
2252 per CPU. | |
2253 """ | |
2254 super(Dtrace.Tracer, self).post_process_log() | |
2255 logname = self._logname + '.log' | |
2256 with open(logname, 'rb') as logfile: | |
2257 lines = [l for l in logfile if l.strip()] | |
2258 errors = [l for l in lines if l.startswith('dtrace:')] | |
2259 if errors: | |
2260 raise TracingFailure( | |
2261 'Found errors in the trace: %s' % '\n'.join(errors), | |
2262 None, None, None, logname) | |
2263 try: | |
2264 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) | |
2265 except ValueError: | |
2266 raise TracingFailure( | |
2267 'Found errors in the trace: %s' % '\n'.join( | |
2268 l for l in lines if l.split(' ', 1)[0].isdigit()), | |
2269 None, None, None, logname) | |
2270 with open(logname, 'wb') as logfile: | |
2271 logfile.write(''.join(lines)) | |
2272 | |
2273 @staticmethod | |
2274 def clean_trace(logname): | |
2275 for ext in ('', '.log'): | |
2276 if os.path.isfile(logname + ext): | |
2277 os.remove(logname + ext) | |
2278 | |
2279 @classmethod | |
2280 def parse_log(cls, logname, blacklist): | |
2281 logging.info('parse_log(%s, ...)' % logname) | |
2282 | |
2283 def blacklist_more(filepath): | |
2284 # All the HFS metadata is in the form /.vol/... | |
2285 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath) | |
2286 | |
2287 data = read_json(logname) | |
2288 out = [] | |
2289 for item in data['traces']: | |
2290 context = cls.Context(blacklist_more, item['pid'], item['cwd']) | |
2291 for line in open(logname + '.log', 'rb'): | |
2292 context.on_line(line) | |
2293 out.append( | |
2294 { | |
2295 'results': context.to_results(), | |
2296 'trace': item['trace'], | |
2297 'output': item['output'], | |
2298 }) | |
2299 return out | |
2300 | |
2301 | |
2302 class LogmanTrace(ApiBase): | |
2303 """Uses the native Windows ETW based tracing functionality to trace a child | |
2304 process. | |
2305 | |
2306 Caveat: this implementation doesn't track cwd or initial_cwd. This is because | |
2307 the Windows Kernel doesn't have a concept of 'current working directory' at | |
2308 all. A Win32 process has a map of current directories, one per drive letter | |
2309 and it is managed by the user mode kernel32.dll. In kernel, a file is always | |
2310 opened relative to another file_object or as an absolute path. All the current | |
2311 working directory logic is done in user mode. | |
2312 """ | |
2313 class Context(ApiBase.Context): | |
2314 """Processes a ETW log line and keeps the list of existent and non | |
2315 existent files accessed. | |
2316 | |
2317 Ignores directories. | |
2318 """ | |
2319 # These indexes are for the stripped version in json. | |
2320 EVENT_NAME = 0 | |
2321 TYPE = 1 | |
2322 PID = 2 | |
2323 TID = 3 | |
2324 PROCESSOR_ID = 4 | |
2325 TIMESTAMP = 5 | |
2326 USER_DATA = 6 | |
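# After Tracer.post_process_log() trims the CSV, each json line is
# [event name, type, pid (as int), tid, processor id, timestamp] followed by
# the event-specific 'User Data' columns, hence the indexes above.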
2327 | |
2328 class Process(ApiBase.Context.Process): | |
2329 def __init__(self, *args): | |
2330 super(LogmanTrace.Context.Process, self).__init__(*args) | |
2331 # Handle file objects that succeeded. | |
2332 self.file_objects = {} | |
2333 | |
2334 def __init__(self, blacklist, tracer_pid): | |
2335 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid)) | |
2336 super(LogmanTrace.Context, self).__init__(blacklist) | |
2337 self._drive_map = DosDriveMap() | |
2338 # Mapping of thread ids to their owning process id. | |
2339 self._threads_active = {} | |
2340 # Process ID of the tracer, i.e. the wrapper script created by create_thunk() | |
2341 self._tracer_pid = tracer_pid | |
2342 self._line_number = 0 | |
2343 | |
2344 def on_line(self, line): | |
2345 """Processes a json Event line.""" | |
2346 self._line_number += 1 | |
2347 try: | |
2348 # By Opcode | |
2349 handler = getattr( | |
2350 self, | |
2351 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]), | |
2352 None) | |
2353 if not handler: | |
2354 raise TracingFailure( | |
2355 'Unexpected event %s_%s' % ( | |
2356 line[self.EVENT_NAME], line[self.TYPE]), | |
2357 None, None, None) | |
2358 handler(line) | |
2359 except TracingFailure, e: | |
2360 # Hack in the values since the handler could be a static function. | |
2361 e.pid = line[self.PID] | |
2362 e.line = line | |
2363 e.line_number = self._line_number | |
2364 # Re-raise the modified exception. | |
2365 raise | |
2366 except (KeyError, NotImplementedError, ValueError), e: | |
2367 raise TracingFailure( | |
2368 'Trace generated a %s exception: %s' % ( | |
2369 e.__class__.__name__, str(e)), | |
2370 line[self.PID], | |
2371 self._line_number, | |
2372 line, | |
2373 e) | |
2374 | |
2375 def to_results(self): | |
2376 if not self.root_process: | |
2377 raise TracingFailure( | |
2378 'Failed to detect the initial process', | |
2379 None, None, None) | |
2380 process = self.root_process.to_results_process() | |
2381 return Results(process) | |
2382 | |
2383 def _thread_to_process(self, tid): | |
2384 """Finds the process from the thread id.""" | |
2385 tid = int(tid, 16) | |
2386 pid = self._threads_active.get(tid) | |
2387 if not pid or not self._process_lookup.get(pid): | |
2388 return | |
2389 return self._process_lookup[pid] | |
2390 | |
2391 @classmethod | |
2392 def handle_EventTrace_Header(cls, line): | |
2393 """Verifies no event was dropped, e.g. no buffer overrun occured.""" | |
2394 BUFFER_SIZE = cls.USER_DATA | |
2395 #VERSION = cls.USER_DATA + 1 | |
2396 #PROVIDER_VERSION = cls.USER_DATA + 2 | |
2397 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3 | |
2398 #END_TIME = cls.USER_DATA + 4 | |
2399 #TIMER_RESOLUTION = cls.USER_DATA + 5 | |
2400 #MAX_FILE_SIZE = cls.USER_DATA + 6 | |
2401 #LOG_FILE_MODE = cls.USER_DATA + 7 | |
2402 #BUFFERS_WRITTEN = cls.USER_DATA + 8 | |
2403 #START_BUFFERS = cls.USER_DATA + 9 | |
2404 #POINTER_SIZE = cls.USER_DATA + 10 | |
2405 EVENTS_LOST = cls.USER_DATA + 11 | |
2406 #CPU_SPEED = cls.USER_DATA + 12 | |
2407 #LOGGER_NAME = cls.USER_DATA + 13 | |
2408 #LOG_FILE_NAME = cls.USER_DATA + 14 | |
2409 #BOOT_TIME = cls.USER_DATA + 15 | |
2410 #PERF_FREQ = cls.USER_DATA + 16 | |
2411 #START_TIME = cls.USER_DATA + 17 | |
2412 #RESERVED_FLAGS = cls.USER_DATA + 18 | |
2413 #BUFFERS_LOST = cls.USER_DATA + 19 | |
2414 #SESSION_NAME_STRING = cls.USER_DATA + 20 | |
2415 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21 | |
2416 if line[EVENTS_LOST] != '0': | |
2417 raise TracingFailure( | |
2418 ( '%s events were lost during trace, please increase the buffer ' | |
2419 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]), | |
2420 None, None, None) | |
2421 | |
2422 def handle_FileIo_Cleanup(self, line): | |
2423 """General wisdom: if a file is closed, it's because it was opened. | |
2424 | |
2425 Note that FileIo_Close is not used since if a file was opened properly but | |
2426 not closed before the process exits, only Cleanup will be logged. | |
2427 """ | |
2428 #IRP = self.USER_DATA | |
2429 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
2430 FILE_OBJECT = self.USER_DATA + 2 | |
2431 #FILE_KEY = self.USER_DATA + 3 | |
2432 proc = self._thread_to_process(line[TTID]) | |
2433 if not proc: | |
2434 # Not a process we care about. | |
2435 return | |
2436 file_object = line[FILE_OBJECT] | |
2437 if file_object in proc.file_objects: | |
2438 proc.add_file(proc.file_objects.pop(file_object), False) | |
2439 | |
2440 def handle_FileIo_Create(self, line): | |
2441 """Handles a file open. | |
2442 | |
2443 All FileIo events are described at | |
2444 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx | |
2445 for some value of 'description'. | |
2446 | |
2447 " (..) process and thread id values of the IO events (..) are not valid " | |
2448 http://msdn.microsoft.com/magazine/ee358703.aspx | |
2449 | |
2450 The FileIo.Create event doesn't indicate whether the CreateFile() call | |
2451 succeeded, so keep track of the file_object and check that it is | |
2452 eventually closed with FileIo_Cleanup. | |
2453 """ | |
2454 #IRP = self.USER_DATA | |
2455 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
2456 FILE_OBJECT = self.USER_DATA + 2 | |
2457 #CREATE_OPTIONS = self.USER_DATA + 3 | |
2458 #FILE_ATTRIBUTES = self.USER_DATA + 4 | |
2459 #SHARE_ACCESS = self.USER_DATA + 5 | |
2460 OPEN_PATH = self.USER_DATA + 6 | |
2461 | |
2462 proc = self._thread_to_process(line[TTID]) | |
2463 if not proc: | |
2464 # Not a process we care about. | |
2465 return | |
2466 | |
2467 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH]) | |
2468 raw_path = match.group(1) | |
2469 # Ignore directories and bare drive right away. | |
2470 if raw_path.endswith(os.path.sep): | |
2471 return | |
2472 filepath = self._drive_map.to_win32(raw_path) | |
2473 # Ignore bare drive right away. Some may still fall through with format | |
2474 # like '\\?\X:' | |
2475 if len(filepath) == 2: | |
2476 return | |
2477 file_object = line[FILE_OBJECT] | |
2478 if os.path.isdir(filepath): | |
2479 # There is no O_DIRECTORY equivalent on Windows. The closest is | |
2480 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So | |
2481 # simply discard directories as they are found. | |
2482 return | |
2483 # Override any stale file object | |
2484 proc.file_objects[file_object] = filepath | |
2485 | |
2486 def handle_FileIo_Rename(self, line): | |
2487 # TODO(maruel): Handle? | |
2488 pass | |
2489 | |
2490 def handle_Process_End(self, line): | |
2491 pid = line[self.PID] | |
2492 if self._process_lookup.get(pid): | |
2493 logging.info('Terminated: %d' % pid) | |
2494 self._process_lookup[pid] = None | |
2495 else: | |
2496 logging.debug('Terminated: %d' % pid) | |
2497 | |
2498 def handle_Process_Start(self, line): | |
2499 """Handles a new child process started by PID.""" | |
2500 #UNIQUE_PROCESS_KEY = self.USER_DATA | |
2501 PROCESS_ID = self.USER_DATA + 1 | |
2502 #PARENT_PID = self.USER_DATA + 2 | |
2503 #SESSION_ID = self.USER_DATA + 3 | |
2504 #EXIT_STATUS = self.USER_DATA + 4 | |
2505 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5 | |
2506 #USER_SID = self.USER_DATA + 6 | |
2507 IMAGE_FILE_NAME = self.USER_DATA + 7 | |
2508 COMMAND_LINE = self.USER_DATA + 8 | |
2509 | |
2510 ppid = line[self.PID] | |
2511 pid = int(line[PROCESS_ID], 16) | |
2512 logging.debug( | |
2513 'New process %d->%d (%s) %s' % | |
2514 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE])) | |
2515 | |
2516 if ppid == self._tracer_pid: | |
2517 # Need to ignore processes we don't know about because the log is | |
2518 # system-wide. self._tracer_pid shall start only one process. | |
2519 if self.root_process: | |
2520 raise TracingFailure( | |
2521 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is ' | |
2522 'already set') % (self._tracer_pid, self.root_process.pid), | |
2523 None, None, None) | |
2524 proc = self.Process(self.blacklist, pid, None) | |
2525 self.root_process = proc | |
2526 ppid = None | |
2527 elif self._process_lookup.get(ppid): | |
2528 proc = self.Process(self.blacklist, pid, None) | |
2529 self._process_lookup[ppid].children.append(proc) | |
2530 else: | |
2531 # Ignore | |
2532 return | |
2533 self._process_lookup[pid] = proc | |
2534 | |
2535 if (not line[IMAGE_FILE_NAME].startswith('"') or | |
2536 not line[IMAGE_FILE_NAME].endswith('"')): | |
2537 raise TracingFailure( | |
2538 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME], | |
2539 None, None, None) | |
2540 | |
2541 # TODO(maruel): Process escapes. | |
2542 if (not line[COMMAND_LINE].startswith('"') or | |
2543 not line[COMMAND_LINE].endswith('"')): | |
2544 raise TracingFailure( | |
2545 'Command line is not properly quoted: %s' % line[COMMAND_LINE], | |
2546 None, None, None) | |
2547 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1]) | |
2548 proc.executable = line[IMAGE_FILE_NAME][1:-1] | |
2549 # proc.command[0] may be the absolute path of 'executable' but it may be | |
2550 # anything else too. If it happens that command[0] ends with executable, | |
2551 # use it, otherwise default to the base name. | |
2552 cmd0 = proc.command[0].lower() | |
2553 if not cmd0.endswith('.exe'): | |
2554 # TODO(maruel): That's not strictly true either. | |
2555 cmd0 += '.exe' | |
2556 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0): | |
2557 # Fix the path. | |
2558 cmd0 = cmd0.replace('/', os.path.sep) | |
2559 cmd0 = os.path.normpath(cmd0) | |
2560 proc.executable = get_native_path_case(cmd0) | |
2561 logging.info( | |
2562 'New child: %s -> %d %s' % (ppid, pid, proc.executable)) | |
2563 | |
2564 def handle_Thread_End(self, line): | |
2565 """Has the same parameters as Thread_Start.""" | |
2566 tid = int(line[self.TID], 16) | |
2567 self._threads_active.pop(tid, None) | |
2568 | |
2569 def handle_Thread_Start(self, line): | |
2570 """Handles a new thread created. | |
2571 | |
2572 Do not use self.PID here since a process' initial thread is created by | |
2573 the parent process. | |
2574 """ | |
2575 PROCESS_ID = self.USER_DATA | |
2576 TTHREAD_ID = self.USER_DATA + 1 | |
2577 #STACK_BASE = self.USER_DATA + 2 | |
2578 #STACK_LIMIT = self.USER_DATA + 3 | |
2579 #USER_STACK_BASE = self.USER_DATA + 4 | |
2580 #USER_STACK_LIMIT = self.USER_DATA + 5 | |
2581 #AFFINITY = self.USER_DATA + 6 | |
2582 #WIN32_START_ADDR = self.USER_DATA + 7 | |
2583 #TEB_BASE = self.USER_DATA + 8 | |
2584 #SUB_PROCESS_TAG = self.USER_DATA + 9 | |
2585 #BASE_PRIORITY = self.USER_DATA + 10 | |
2586 #PAGE_PRIORITY = self.USER_DATA + 11 | |
2587 #IO_PRIORITY = self.USER_DATA + 12 | |
2588 #THREAD_FLAGS = self.USER_DATA + 13 | |
2589 # Do not use self.PID here since a process' initial thread is created by | |
2590 # the parent process. | |
2591 pid = int(line[PROCESS_ID], 16) | |
2592 tid = int(line[TTHREAD_ID], 16) | |
2593 logging.debug('New thread pid:%d, tid:%d' % (pid, tid)) | |
2594 self._threads_active[tid] = pid | |
2595 | |
2596 @classmethod | |
2597 def supported_events(cls): | |
2598 """Returns all the procesed events.""" | |
2599 out = [] | |
2600 for member in dir(cls): | |
2601 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) | |
2602 if match: | |
2603 out.append(match.groups()) | |
2604 return out | |
2605 | |
2606 class Tracer(ApiBase.Tracer): | |
2607 # The basic headers. | |
2608 EXPECTED_HEADER = [ | |
2609 u'Event Name', | |
2610 u'Type', | |
2611 u'Event ID', | |
2612 u'Version', | |
2613 u'Channel', | |
2614 u'Level', # 5 | |
2615 u'Opcode', | |
2616 u'Task', | |
2617 u'Keyword', | |
2618 u'PID', | |
2619 u'TID', # 10 | |
2620 u'Processor Number', | |
2621 u'Instance ID', | |
2622 u'Parent Instance ID', | |
2623 u'Activity ID', | |
2624 u'Related Activity ID', # 15 | |
2625 u'Clock-Time', | |
2626 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them | |
2627 u'User(ms)', # pretty much useless. | |
2628 u'User Data', # Extra arguments that are event-specific. | |
2629 ] | |
2630 # Only the useful headers common to all entries are listed there. Any column | |
2631 # at 19 or higher is dependent on the specific event. | |
2632 EVENT_NAME = 0 | |
2633 TYPE = 1 | |
2634 PID = 9 | |
2635 TID = 10 | |
2636 PROCESSOR_ID = 11 | |
2637 TIMESTAMP = 16 | |
2638 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' | |
2639 USER_DATA = 19 | |
2640 | |
2641 def __init__(self, logname): | |
2642 """Starts the log collection. | |
2643 | |
2644 Requires administrative access. logman.exe is synchronous so no need for a | |
2645 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID | |
2646 instead. The GUID constant name is SystemTraceControlGuid. Lovely. | |
2647 | |
2648 One can get the list of potentially interesting providers with: | |
2649 "logman query providers | findstr /i file" | |
2650 """ | |
2651 super(LogmanTrace.Tracer, self).__init__(logname) | |
2652 self._script = create_thunk() | |
2653 cmd_start = [ | |
2654 'logman.exe', | |
2655 'start', | |
2656 'NT Kernel Logger', | |
2657 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}', | |
2658 # splitio,fileiocompletion,syscall,file,cswitch,img | |
2659 '(process,fileio,thread)', | |
2660 '-o', self._logname + '.etl', | |
2661 '-ets', # Send directly to kernel | |
2662 # Values extracted out of thin air. | |
2663 # Event Trace Session buffer size in kb. | |
2664 '-bs', '10240', | |
2665 # Number of Event Trace Session buffers. | |
2666 '-nb', '16', '256', | |
2667 ] | |
2668 logging.debug('Running: %s' % cmd_start) | |
2669 try: | |
2670 subprocess.check_call( | |
2671 cmd_start, | |
2672 stdin=subprocess.PIPE, | |
2673 stdout=subprocess.PIPE, | |
2674 stderr=subprocess.STDOUT) | |
2675 except subprocess.CalledProcessError, e: | |
2676 if e.returncode == -2147024891: | |
2677 print >> sys.stderr, 'Please restart with an elevated admin prompt' | |
2678 elif e.returncode == -2144337737: | |
2679 print >> sys.stderr, ( | |
2680 'A kernel trace was already running, stop it and try again') | |
2681 raise | |
2682 | |
2683 def trace(self, cmd, cwd, tracename, output): | |
2684 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
2685 assert os.path.isabs(cmd[0]), cmd[0] | |
2686 assert os.path.isabs(cwd), cwd | |
2687 assert os.path.normpath(cwd) == cwd, cwd | |
2688 with self._lock: | |
2689 if not self._initialized: | |
2690 raise TracingFailure( | |
2691 'Called Tracer.trace() on an uninitialized object', | |
2692 None, None, None, tracename) | |
2693 assert tracename not in (i['trace'] for i in self._traces) | |
2694 | |
2695 # Use "logman -?" for help. | |
2696 | |
2697 stdout = stderr = None | |
2698 if output: | |
2699 stdout = subprocess.PIPE | |
2700 stderr = subprocess.STDOUT | |
2701 | |
2702 # Run the child process. | |
2703 logging.debug('Running: %s' % cmd) | |
2704 # Use the temporary script generated with create_thunk() so we have a | |
2705 # clear pid owner. Since trace_inputs.py can be used as a library and | |
2706 # could trace multiple processes simultaneously, it would be more complex | |
2707 # if the executable to be traced were executed directly here. It also solves | |
2708 # issues related to logman.exe, which needs to be executed to control the | |
2709 # kernel trace. | |
2710 child_cmd = [ | |
2711 sys.executable, | |
2712 self._script, | |
2713 tracename, | |
2714 ] | |
2715 child = subprocess.Popen( | |
2716 child_cmd + fix_python_path(cmd), | |
2717 cwd=cwd, | |
2718 stdin=subprocess.PIPE, | |
2719 stdout=stdout, | |
2720 stderr=stderr) | |
2721 logging.debug('Started child pid: %d' % child.pid) | |
2722 out = child.communicate()[0] | |
2723 # This doesn't mean all the grand-children are done. Sadly, we don't have | |
2724 # a good way to determine that. | |
2725 | |
2726 with self._lock: | |
2727 assert tracename not in (i['trace'] for i in self._traces) | |
2728 self._traces.append({ | |
2729 'command': cmd, | |
2730 'cwd': cwd, | |
2731 'pid': child.pid, | |
2732 'trace': tracename, | |
2733 'output': out, | |
2734 }) | |
2735 | |
2736 return child.returncode, out | |
2737 | |
2738 def close(self, _timeout=None): | |
2739 """Stops the kernel log collection and converts the traces to text | |
2740 representation. | |
2741 """ | |
2742 with self._lock: | |
2743 if not self._initialized: | |
2744 raise TracingFailure( | |
2745 'Called Tracer.close() on an uninitialized object', | |
2746 None, None, None) | |
2747 os.remove(self._script) | |
2748 # Save metadata, add 'format' key. | |
2749 data = { | |
2750 'format': 'csv', | |
2751 'traces': self._traces, | |
2752 } | |
2753 write_json(self._logname, data, False) | |
2754 | |
2755 cmd_stop = [ | |
2756 'logman.exe', | |
2757 'stop', | |
2758 'NT Kernel Logger', | |
2759 '-ets', # Sends the command directly to the kernel. | |
2760 ] | |
2761 logging.debug('Running: %s' % cmd_stop) | |
2762 subprocess.check_call( | |
2763 cmd_stop, | |
2764 stdin=subprocess.PIPE, | |
2765 stdout=subprocess.PIPE, | |
2766 stderr=subprocess.STDOUT) | |
2767 self._initialized = False | |
2768 | |
2769 def post_process_log(self): | |
2770 """Converts the .etl file into .csv then into .json.""" | |
2771 super(LogmanTrace.Tracer, self).post_process_log() | |
2772 logformat = 'csv' | |
2773 self._convert_log(logformat) | |
2774 | |
2775 if logformat == 'csv_utf16': | |
2776 def load_file(): | |
2777 def utf_8_encoder(unicode_csv_data): | |
2778 """Encodes the unicode object as utf-8 encoded str instance""" | |
2779 for line in unicode_csv_data: | |
2780 yield line.encode('utf-8') | |
2781 | |
2782 def unicode_csv_reader(unicode_csv_data, **kwargs): | |
2783 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. | |
2784 """ | |
2785 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) | |
2786 for row in csv_reader: | |
2787 # Decode str utf-8 instances back to unicode instances, cell by | |
2788 # cell: | |
2789 yield [cell.decode('utf-8') for cell in row] | |
2790 | |
2791 # The CSV file is UTF-16 so use codecs.open() to load the file into | |
2792 # python unicode instances. Then explicitly | |
2793 # re-encode them as utf-8 str instances so csv can parse them. Then | |
2794 # decode the utf-8 str back into python unicode instances. This | |
2795 # sounds about right. | |
2796 for line in unicode_csv_reader( | |
2797 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')): | |
2798 # line is a list of unicode objects | |
2799 # So much white space! | |
2800 yield [i.strip() for i in line] | |
2801 | |
2802 elif logformat == 'csv': | |
2803 def load_file(): | |
2804 def ansi_csv_reader(ansi_csv_data, **kwargs): | |
2805 """Loads an 'ANSI' code page and returns unicode() objects.""" | |
2806 assert sys.getfilesystemencoding() == 'mbcs' | |
2807 encoding = get_current_encoding() | |
2808 for row in csv.reader(ansi_csv_data, **kwargs): | |
2809 # Decode str 'ansi' instances to unicode instances, cell by cell: | |
2810 yield [cell.decode(encoding) for cell in row] | |
2811 | |
2812 # The fastest and smallest format but only supports 'ANSI' file paths. | |
2813 # I.e. the filenames are encoded in the 'current' encoding. | |
2814 for line in ansi_csv_reader(open(self._logname + '.' + logformat)): | |
2815 # line is a list of unicode objects. | |
2816 yield [i.strip() for i in line] | |
2817 | |
2818 supported_events = LogmanTrace.Context.supported_events() | |
2819 | |
2820 def trim(generator): | |
2821 for index, line in enumerate(generator): | |
2822 if not index: | |
2823 if line != self.EXPECTED_HEADER: | |
2824 raise TracingFailure( | |
2825 'Found malformed header: %s' % ' '.join(line), | |
2826 None, None, None) | |
2827 continue | |
2828 # As you can see, the CSV is full of useful non-redundant information: | |
2829 if (line[2] != '0' or # Event ID | |
2830 line[3] not in ('2', '3') or # Version | |
2831 line[4] != '0' or # Channel | |
2832 line[5] != '0' or # Level | |
2833 line[7] != '0' or # Task | |
2834 line[8] != '0x0000000000000000' or # Keyword | |
2835 line[12] != '' or # Instance ID | |
2836 line[13] != '' or # Parent Instance ID | |
2837 line[14] != self.NULL_GUID or # Activity ID | |
2838 line[15] != ''): # Related Activity ID | |
2839 raise TracingFailure( | |
2840 'Found unexpected values in line: %s' % ' '.join(line), | |
2841 None, None, None) | |
2842 | |
2843 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: | |
2844 continue | |
2845 | |
2846 # Convert the PID in-place from hex. | |
2847 line[self.PID] = int(line[self.PID], 16) | |
2848 | |
2849 yield [ | |
2850 line[self.EVENT_NAME], | |
2851 line[self.TYPE], | |
2852 line[self.PID], | |
2853 line[self.TID], | |
2854 line[self.PROCESSOR_ID], | |
2855 line[self.TIMESTAMP], | |
2856 ] + line[self.USER_DATA:] | |
2857 | |
2858 write_json('%s.json' % self._logname, list(trim(load_file())), True) | |
2859 | |
2860 def _convert_log(self, logformat): | |
2861 """Converts the ETL trace to text representation. | |
2862 | |
2863 Normally, 'csv' is sufficient. If complex scripts are used (like eastern | |
2864 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. | |
2865 | |
2866 Arguments: | |
2867 - logformat: Text format to be generated, csv, csv_utf16 or xml. | |
2868 | |
2869 Use "tracerpt -?" for help. | |
2870 """ | |
2871 LOCALE_INVARIANT = 0x7F | |
2872 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) | |
2873 cmd_convert = [ | |
2874 'tracerpt.exe', | |
2875 '-l', self._logname + '.etl', | |
2876 '-o', self._logname + '.' + logformat, | |
2877 '-gmt', # Use UTC | |
2878 '-y', # No prompt | |
2879 # Use -of XML to get the headers of each item after column 19, e.g. all | |
2880 # the actual headers of 'User Data'. | |
2881 ] | |
2882 | |
2883 if logformat == 'csv': | |
2884 # tracerpt localizes the 'Type' column, for major brainfuck | |
2885 # entertainment. I can't imagine any sane reason to do that. | |
2886 cmd_convert.extend(['-of', 'CSV']) | |
2887 elif logformat == 'csv_utf16': | |
2888 # This causes it to use UTF-16, which doubles the log size but ensures | |
2889 # the log is readable for non-ASCII characters. | |
2890 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) | |
2891 elif logformat == 'xml': | |
2892 cmd_convert.extend(['-of', 'XML']) | |
2893 else: | |
2894 raise ValueError('Unexpected log format \'%s\'' % logformat) | |
2895 logging.debug('Running: %s' % cmd_convert) | |
2896 # This can take tens of minutes for large logs. | |
2897 # Redirects all output to stderr. | |
2898 subprocess.check_call( | |
2899 cmd_convert, | |
2900 stdin=subprocess.PIPE, | |
2901 stdout=sys.stderr, | |
2902 stderr=sys.stderr) | |
2903 | |
2904 @staticmethod | |
2905 def clean_trace(logname): | |
2906 for ext in ('', '.csv', '.etl', '.json', '.xml'): | |
2907 if os.path.isfile(logname + ext): | |
2908 os.remove(logname + ext) | |
2909 | |
2910 @classmethod | |
2911 def parse_log(cls, logname, blacklist): | |
2912 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | |
2913 | |
2914 def blacklist_more(filepath): | |
2915 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that. | |
2916 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath) | |
2917 | |
2918 data = read_json(logname) | |
2919 lines = read_json(logname + '.json') | |
2920 out = [] | |
2921 for item in data['traces']: | |
2922 context = cls.Context(blacklist_more, item['pid']) | |
2923 for line in lines: | |
2924 context.on_line(line) | |
2925 out.append( | |
2926 { | |
2927 'results': context.to_results(), | |
2928 'trace': item['trace'], | |
2929 'output': item['output'], | |
2930 }) | |
2931 return out | |
2932 | |
2933 | |
2934 def get_api(): | |
2935 """Returns the correct implementation for the current OS.""" | |
2936 if sys.platform == 'cygwin': | |
2937 raise NotImplementedError( | |
2938 'Not implemented for cygwin, start the script from Win32 python') | |
2939 flavors = { | |
2940 'win32': LogmanTrace, | |
2941 'darwin': Dtrace, | |
2942 'sunos5': Dtrace, | |
2943 'freebsd7': Dtrace, | |
2944 'freebsd8': Dtrace, | |
2945 } | |
2946 # Defaults to strace. | |
2947 return flavors.get(sys.platform, Strace)() | |
2948 | |
2949 | |
2950 def extract_directories(root_dir, files, blacklist): | |
2951 """Detects if all the files in a directory are in |files| and if so, replace | |
2952 the individual files by a Results.Directory instance. | |
2953 | |
2954 Takes a list of Results.File instances and returns a shorter list of | |
2955 Results.File and Results.Directory instances. | |
2956 | |
2957 Arguments: | |
2958 - root_dir: Optional base directory that shouldn't be searched further. | |
2959 - files: list of Results.File instances. | |
2960 - blacklist: function returning True for files to ignore, e.g. built from r'.+\.pyc'. | |
2961 """ | |
2962 logging.info( | |
2963 'extract_directories(%s, %d files, ...)' % (root_dir, len(files))) | |
2964 assert not (root_dir or '').endswith(os.path.sep), root_dir | |
2965 assert not root_dir or (get_native_path_case(root_dir) == root_dir) | |
2966 assert not any(isinstance(f, Results.Directory) for f in files) | |
2967 # Remove non-existent files. | |
2968 files = [f for f in files if f.existent] | |
2969 if not files: | |
2970 return files | |
2971 # All files must share the same root, which can be None. | |
2972 assert len(set(f.root for f in files)) == 1, set(f.root for f in files) | |
2973 | |
2974 # Creates a {directory: {filename: File}} mapping, up to root. | |
2975 buckets = {} | |
2976 if root_dir: | |
2977 buckets[root_dir] = {} | |
2978 for fileobj in files: | |
2979 path = fileobj.full_path | |
2980 directory = os.path.dirname(path) | |
2981 assert directory | |
2982 # Do not use os.path.basename() so trailing os.path.sep is kept. | |
2983 basename = path[len(directory)+1:] | |
2984 files_in_directory = buckets.setdefault(directory, {}) | |
2985 files_in_directory[basename] = fileobj | |
2986 # Add all the directories recursively up to root. | |
2987 while True: | |
2988 old_d = directory | |
2989 directory = os.path.dirname(directory) | |
2990 if directory + os.path.sep == root_dir or directory == old_d: | |
2991 break | |
2992 buckets.setdefault(directory, {}) | |
2993 | |
2994 root_prefix = len(root_dir) + 1 if root_dir else 0 | |
2995 for directory in sorted(buckets, reverse=True): | |
2996 actual = set(f for f in os.listdir(directory) if not blacklist(f)) | |
2997 expected = set(buckets[directory]) | |
2998 if not (actual - expected): | |
2999 parent = os.path.dirname(directory) | |
3000 buckets[parent][os.path.basename(directory)] = Results.Directory( | |
3001 root_dir, | |
3002 directory[root_prefix:], | |
3003 False, | |
3004 sum(f.size for f in buckets[directory].itervalues()), | |
3005 sum(f.nb_files for f in buckets[directory].itervalues())) | |
3006 # Remove the whole bucket. | |
3007 del buckets[directory] | |
3008 | |
3009 # Reverse the mapping with what remains. The original instances are returned, | |
3010 # so the cached meta data is kept. | |
3011 files = sum((x.values() for x in buckets.itervalues()), []) | |
3012 return sorted(files, key=lambda x: x.path) | |
3013 | |
3014 | |
3015 def trace(logfile, cmd, cwd, api, output): | |
3016 """Traces an executable. Returns (returncode, output) from api. | |
3017 | |
3018 Arguments: | |
3019 - logfile: file to write to. | |
3020 - cmd: command to run. | |
3021 - cwd: current directory to start the process in. | |
3022 - api: a tracing api instance. | |
3023 - output: if True, returns the output, otherwise prints it to the console. | |
3024 """ | |
3025 cmd = fix_python_path(cmd) | |
3026 api.clean_trace(logfile) | |
3027 with api.get_tracer(logfile) as tracer: | |
3028 return tracer.trace(cmd, cwd, 'default', output) | |
3029 | |
3030 | |
3031 def load_trace(logfile, root_dir, api, blacklist): | |
3032 """Loads a trace file and returns the Results instance. | |
3033 | |
3034 Arguments: | |
3035 - logfile: File to load. | |
3036 - root_dir: Root directory to use to determine if a file is relevant to the | |
3037 trace or not. | |
3038 - api: A tracing api instance. | |
3039 - blacklist: Optional blacklist function to filter out unimportant files. | |
3040 """ | |
3041 data = api.parse_log(logfile, (blacklist or (lambda _: False))) | |
3042 assert len(data) == 1, 'More than one trace was detected!' | |
3043 if 'exception' in data[0]: | |
3044 # It got an exception, raise it. | |
3045 raise data[0]['exception'] | |
3046 results = data[0]['results'] | |
3047 if root_dir: | |
3048 results = results.strip_root(root_dir) | |
3049 return results | |
3050 | |
3051 | |
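# A minimal end-to-end sketch (never called anywhere) tying trace() and
# load_trace() together, roughly what CMDtrace and CMDread do below. The
# 'trace_example.log' file name and the 'hello_world.py' script are
# hypothetical placeholders used only for illustration.
def _example_trace_and_load(root_dir):
  api = get_api()
  logfile = os.path.join(root_dir, 'trace_example.log')
  cmd = [sys.executable, os.path.join(root_dir, 'hello_world.py')]
  returncode, _ = trace(logfile, cmd, root_dir, api, True)
  if returncode:
    return None
  # Parse the implementation-specific log into a Results instance, keeping
  # only the files under root_dir.
  return load_trace(logfile, root_dir, api, None)

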
def CMDclean(args):
  """Cleans up traces."""
  parser = OptionParserTraceInputs(command='clean')
  options, args = parser.parse_args(args)
  api = get_api()
  api.clean_trace(options.log)
  return 0


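# Illustrative usage of the 'clean' command above; the log file name is
# hypothetical and this assumes the script is saved as trace_inputs.py:
#   python trace_inputs.py clean -l out.log
# This removes the trace log given with -l so a new trace can start fresh.

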
def CMDtrace(args):
  """Traces an executable."""
  parser = OptionParserTraceInputs(command='trace')
  parser.allow_interspersed_args = False
  parser.add_option(
      '-q', '--quiet', action='store_true',
      help='Redirects traced executable output to /dev/null')
  options, args = parser.parse_args(args)

  if not args:
    parser.error('Please provide a command to run')

  if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
    args[0] = os.path.abspath(args[0])

  api = get_api()
  return trace(options.log, args, os.getcwd(), api, options.quiet)[0]


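# Illustrative usage of the 'trace' command above; the file names are
# hypothetical, the script is assumed to be saved as trace_inputs.py and this
# helper is never invoked. It is the programmatic equivalent of:
#   python trace_inputs.py trace -l out.log python hello_world.py
# Note that since allow_interspersed_args is False, options for this script
# must come before the traced command line.
def _example_cmdtrace():
  return CMDtrace(['-l', 'out.log', sys.executable, 'hello_world.py'])

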
def CMDread(args):
  """Reads the logs and prints the result."""
  parser = OptionParserTraceInputs(command='read')
  parser.add_option(
      '-V', '--variable',
      nargs=2,
      action='append',
      dest='variables',
      metavar='VAR_NAME directory',
      default=[],
      help=('Variables to replace relative directories against. Example: '
            '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of '
            'your home dir with $HOME') % getpass.getuser())
  parser.add_option(
      '--root-dir',
      help='Root directory to base everything off of. Anything outside of '
           'this directory will not be reported.')
  parser.add_option(
      '-j', '--json', action='store_true',
      help='Outputs raw result data as JSON')
  parser.add_option(
      '-b', '--blacklist', action='append', default=[],
      help='List of regexps to use as a blacklist filter')
  options, args = parser.parse_args(args)

  if options.root_dir:
    options.root_dir = os.path.abspath(options.root_dir)

  variables = dict(options.variables)
  api = get_api()
  def blacklist(f):
    return any(re.match(b, f) for b in options.blacklist)
  data = api.parse_log(options.log, blacklist)
  # Process each trace.
  output_as_json = []
  for item in data:
    if 'exception' in item:
      print >> sys.stderr, (
          'Trace %s: Got an exception: %s' % (item['trace'], item['exception']))
      continue
    results = item['results']
    if options.root_dir:
      results = results.strip_root(options.root_dir)

    if options.json:
      output_as_json.append(results.flatten())
    else:
      simplified = extract_directories(
          options.root_dir, results.files, blacklist)
      simplified = [f.replace_variables(variables) for f in simplified]
      if len(data) > 1:
        print('Trace: %s' % item['trace'])
      print('Total: %d' % len(results.files))
      print('Non-existent: %d' % len(results.non_existent))
      for f in results.non_existent:
        print('  %s' % f.path)
      print(
          'Interesting: %d reduced to %d' % (
              len(results.existent), len(simplified)))
      for f in simplified:
        print('  %s' % f.path)

  if options.json:
    write_json(sys.stdout, output_as_json, False)
  return 0


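# Illustrative usage of the 'read' command above; the log file name, root
# directory and home path are hypothetical and this helper is never invoked.
# It is the programmatic equivalent of:
#   python trace_inputs.py read -l out.log --root-dir /path/to/checkout \
#       -V '$HOME' '/home/user' -b '.+\.pyc'
def _example_cmdread():
  return CMDread([
      '-l', 'out.log',
      '--root-dir', '/path/to/checkout',
      '-V', '$HOME', '/home/user',
      '-b', r'.+\.pyc',
  ])

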
class OptionParserWithLogging(optparse.OptionParser):
  """Adds --verbose option."""
  def __init__(self, verbose=0, **kwargs):
    optparse.OptionParser.__init__(self, **kwargs)
    self.add_option(
        '-v', '--verbose',
        action='count',
        default=verbose,
        help='Use multiple times to increase verbosity')

  def parse_args(self, *args, **kwargs):
    options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
    levels = [logging.ERROR, logging.INFO, logging.DEBUG]
    logging.basicConfig(
        level=levels[min(len(levels)-1, options.verbose)],
        format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
    return options, args


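# A small sketch (never called) of the cumulative -v flag added above:
# parse_args() clamps the count into the 'levels' list, so no -v keeps
# logging.ERROR, one -v selects logging.INFO, and two or more select
# logging.DEBUG. The '%prog [options]' usage string is only illustrative.
def _example_verbosity():
  parser = OptionParserWithLogging(usage='%prog [options]')
  options, _ = parser.parse_args(['-v', '-v'])
  assert options.verbose == 2  # Mapped to logging.DEBUG by parse_args().

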
class OptionParserWithNiceDescription(OptionParserWithLogging):
  """Generates the description with the command's docstring."""
  def __init__(self, **kwargs):
    """Sets 'description' and 'usage' if not already specified."""
    command = kwargs.pop('command', 'help')
    kwargs.setdefault(
        'description',
        re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
    kwargs.setdefault('usage', '%%prog %s [options]' % command)
    OptionParserWithLogging.__init__(self, **kwargs)


class OptionParserTraceInputs(OptionParserWithNiceDescription):
  """Adds automatic --log handling."""
  def __init__(self, **kwargs):
    OptionParserWithNiceDescription.__init__(self, **kwargs)
    self.add_option(
        '-l', '--log', help='Log file to generate or read, required')

  def parse_args(self, *args, **kwargs):
    """Makes sure the paths make sense.

    On Windows, / and \\ are often mixed together in a path.
    """
    options, args = OptionParserWithNiceDescription.parse_args(
        self, *args, **kwargs)
    if not options.log:
      self.error('Must supply a log file with -l')
    options.log = os.path.abspath(options.log)
    return options, args


def extract_documentation():
  """Returns a dict {command: description} for each documented command."""
  commands = (
      fn[3:]
      for fn in dir(sys.modules['__main__'])
      if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
  return dict((fn, get_command_handler(fn).__doc__) for fn in commands)


def CMDhelp(args):
  """Prints list of commands or help for a specific command."""
  doc = extract_documentation()
  # Calculates the optimal offset.
  offset = max(len(cmd) for cmd in doc)
  format_str = '  %-' + str(offset + 2) + 's %s'
  # Generates one-line documentation for each command.
  commands_description = '\n'.join(
      format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))

  parser = OptionParserWithNiceDescription(
      usage='%prog <command> [options]',
      description='Commands are:\n%s\n' % commands_description)
  parser.format_description = lambda _: parser.description

  # Strip out any -h or --help argument.
  _, args = parser.parse_args([i for i in args if i not in ('-h', '--help')])
  if len(args) == 1:
    if not get_command_handler(args[0]):
      parser.error('Unknown command %s' % args[0])
    # The command was "%prog help command", replace ourselves with
    # "%prog command --help" so help is correctly printed out.
    return main(args + ['--help'])
  elif args:
    parser.error('Unknown argument "%s"' % ' '.join(args))
  parser.print_help()
  return 0


def get_command_handler(name):
  """Returns the command handler for |name|, or None if it doesn't exist."""
  return getattr(sys.modules['__main__'], 'CMD%s' % name, None)


def main_impl(argv):
  command = get_command_handler(argv[0] if argv else 'help')
  if not command:
    return CMDhelp(argv)
  return command(argv[1:])
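

# A small illustrative sketch (never executed) of how the dispatch above
# works, assuming this file is being run as the main module, just like the
# dispatch logic itself does: a known command name such as 'trace' resolves to
# the module-level CMDtrace function, while an unknown name returns None and
# main_impl() falls back to CMDhelp().
def _example_dispatch():
  assert get_command_handler('trace') is CMDtrace
  assert get_command_handler('no_such_command') is None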


def main(argv):
  try:
    return main_impl(argv)
  except TracingFailure, e:
    sys.stderr.write('\nError: ')
    sys.stderr.write(str(e))
    sys.stderr.write('\n')
    return 1


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))