Chromium Code Reviews

Side by Side Diff: tools/isolate/trace_inputs.py

Issue 11045023: Move src/tools/isolate to src/tools/swarm_client as a DEPS. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Use r159961 Created 8 years, 2 months ago
1 #!/usr/bin/env python
2 # coding=utf-8
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved.
4 # Use of this source code is governed by a BSD-style license that can be
5 # found in the LICENSE file.
6
7 """Traces an executable and its child processes and extract the files accessed
8 by them.
9
10 The implementation uses OS-specific APIs. The native kernel logger and the ETL
11 interface are used on Windows. Dtrace is used on OSX. Strace is used otherwise.
12 The OS-specific implementation is hidden behind an 'API' interface.
13
14 The results are embedded in a Results instance. The tracing is done in two
15 phases: the first does the actual trace and generates an
16 implementation-specific log file. The log file is then parsed to extract the
17 information, including the individual child processes and the files they
18 accessed.
19 """
20
21 import codecs
22 import csv
23 import getpass
24 import glob
25 import json
26 import logging
27 import optparse
28 import os
29 import re
30 import subprocess
31 import sys
32 import tempfile
33 import threading
34 import time
35 import weakref
36
37 ## OS-specific imports
38
39 if sys.platform == 'win32':
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611
43 elif sys.platform == 'darwin':
44 import Carbon.File # pylint: disable=F0401
45 import MacOS # pylint: disable=F0401
46
47
48 BASE_DIR = os.path.dirname(os.path.abspath(__file__))
49 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
50
51
52 class TracingFailure(Exception):
53 """An exception occured during tracing."""
54 def __init__(self, description, pid, line_number, line, *args):
55 super(TracingFailure, self).__init__(
56 description, pid, line_number, line, *args)
57 self.description = description
58 self.pid = pid
59 self.line_number = line_number
60 self.line = line
61 self.extra = args
62
63 def __str__(self):
64 out = self.description
65 if self.pid:
66 out += '\npid: %d' % self.pid
67 if self.line_number:
68 out += '\nline: %d' % self.line_number
69 if self.line:
70 out += '\n%s' % self.line
71 if self.extra:
72 out += '\n' + ', '.join(map(str, filter(None, self.extra)))
73 return out
74
75
76 ## OS-specific functions
77
78 if sys.platform == 'win32':
79 def QueryDosDevice(drive_letter):
80 """Returns the Windows 'native' path for a DOS drive letter."""
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes.
83 chars = 1024
84 drive_letter = unicode(drive_letter)
85 p = create_unicode_buffer(chars)
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
87 err = GetLastError()
88 if err:
89 # pylint: disable=E0602
90 raise WindowsError(
91 err,
92 'QueryDosDevice(%s): %s (%d)' % (
93 str(drive_letter), FormatError(err), err))
94 return p.value
95
96
97 def GetShortPathName(long_path):
98 """Returns the Windows short path equivalent for a 'long' path."""
99 long_path = unicode(long_path)
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
101 # not enforced.
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
103 long_path = '\\\\?\\' + long_path
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
105 if chars:
106 p = create_unicode_buffer(chars)
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars):
108 return p.value
109
110 err = GetLastError()
111 if err:
112 # pylint: disable=E0602
113 raise WindowsError(
114 err,
115 'GetShortPathName(%s): %s (%d)' % (
116 str(long_path), FormatError(err), err))
117
118
119 def GetLongPathName(short_path):
120 """Returns the Windows long path equivalent for a 'short' path."""
121 short_path = unicode(short_path)
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
123 # not enforced.
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
125 short_path = '\\\\?\\' + short_path
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
127 if chars:
128 p = create_unicode_buffer(chars)
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars):
130 return p.value
131
132 err = GetLastError()
133 if err:
134 # pylint: disable=E0602
135 raise WindowsError(
136 err,
137 'GetLongPathName(%s): %s (%d)' % (
138 str(short_path), FormatError(err), err))
139
140
141 def get_current_encoding():
142 """Returns the 'ANSI' code page associated to the process."""
143 return 'cp%d' % int(windll.kernel32.GetACP())
144
145
146 class DosDriveMap(object):
147 """Maps \Device\HarddiskVolumeN to N: on Windows."""
148 # Keep one global cache.
149 _MAPPING = {}
150
151 def __init__(self):
152 """Lazy loads the cache."""
153 if not self._MAPPING:
154 # This is related to the UNC resolver on Windows. Ignore it.
155 self._MAPPING['\\Device\\Mup'] = None
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']
157
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
159 try:
160 letter = '%s:' % letter
161 mapped = QueryDosDevice(letter)
162 if mapped in self._MAPPING:
163 logging.warn(
164 ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'
165 '. Drive letters are a user-mode concept and the kernel '
166 'traces only have NT path, so all accesses will be '
167 'associated with the first drive letter, independent of the '
168 'actual letter used by the code') % (
169 self._MAPPING[mapped], letter))
170 else:
171 self._MAPPING[mapped] = letter
172 except WindowsError: # pylint: disable=E0602
173 pass
174
175 def to_win32(self, path):
176 """Converts a native NT path to Win32/DOS compatible path."""
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
178 if not match:
179 raise ValueError(
180 'Can\'t convert %s into a Win32 compatible path' % path,
181 path)
182 if match.group(1) not in self._MAPPING:
183 # Unmapped partitions may be accessed by Windows for the
184 # fun of it while the test is running. Discard these.
185 return None
186 drive = self._MAPPING[match.group(1)]
187 if not drive or not match.group(2):
188 return drive
189 return drive + match.group(2)
190
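# Example, assuming a machine where \Device\HarddiskVolume1 is mapped to C::
#   DosDriveMap().to_win32('\\Device\\HarddiskVolume1\\Windows\\notepad.exe')
#   would return 'C:\\Windows\\notepad.exe'.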
191
192 def isabs(path):
193 """Accepts X: as an absolute path, unlike python's os.path.isabs()."""
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':'
195
196
197 def get_native_path_case(p):
198 """Returns the native path case for an existing file.
199
200 On Windows, removes any leading '\\?\'.
201 """
202 if not isabs(p):
203 raise ValueError(
204 'Can\'t get native path case for a non-absolute path: %s' % p,
205 p)
206 # Windows used to have an option to turn on case sensitivity on the non-Win32
207 # subsystems, but that's out of scope here and isn't supported anymore.
208 # Go figure why GetShortPathName() is needed.
209 try:
210 out = GetLongPathName(GetShortPathName(p))
211 except OSError, e:
212 if e.args[0] in (2, 3, 5):
213 # The path does not exist. Try to recurse and reconstruct the path.
214 base = os.path.dirname(p)
215 rest = os.path.basename(p)
216 return os.path.join(get_native_path_case(base), rest)
217 raise
218 if out.startswith('\\\\?\\'):
219 out = out[4:]
220 # Always upper case the first letter since GetLongPathName() will return the
221 # drive letter in the case it was given.
222 return out[0].upper() + out[1:]
223
224
225 def CommandLineToArgvW(command_line):
226 """Splits a commandline into argv using CommandLineToArgvW()."""
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
228 size = c_int()
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))
230 try:
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
232 finally:
233 windll.kernel32.LocalFree(ptr)
234
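# Example of the Win32 splitting rules this wraps:
#   CommandLineToArgvW(u'child.exe "C:\\Program Files\\foo" -v')
#   == [u'child.exe', u'C:\\Program Files\\foo', u'-v']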
235
236 elif sys.platform == 'darwin':
237
238
239 # On non-windows, keep the stdlib behavior.
240 isabs = os.path.isabs
241
242
243 def _find_item_native_case(root_path, item):
244 """Gets the native path case of a single item based at root_path.
245
246 There is no API to get the native path case of symlinks on OSX. So it
247 needs to be done the slow way.
248 """
249 item = item.lower()
250 for element in os.listdir(root_path):
251 if element.lower() == item:
252 return element
253
254
255 def _native_case(p):
256 """Gets the native path case. Warning: this function resolves symlinks."""
257 logging.debug('native_case(%s)' % p)
258 try:
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p)
260 out = rel_ref.FSRefMakePath()
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep):
262 return out + os.path.sep
263 return out
264 except MacOS.Error, e:
265 if e.args[0] in (-43, -120):
266 # The path does not exist. Try to recurse and reconstruct the path.
267 # -43 means file not found.
268 # -120 means directory not found.
269 base = os.path.dirname(p)
270 rest = os.path.basename(p)
271 return os.path.join(_native_case(base), rest)
272 raise OSError(
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])
274
275
276 def _split_at_symlink_native(base_path, rest):
277 """Returns the native path for a symlink."""
278 base, symlink, rest = split_at_symlink(base_path, rest)
279 if symlink:
280 if not base_path:
281 base_path = base
282 else:
283 base_path = safe_join(base_path, base)
284 symlink = _find_item_native_case(base_path, symlink)
285 return base, symlink, rest
286
287
288 def get_native_path_case(path):
289 """Returns the native path case for an existing file.
290
291 Technically, it's only HFS+ on OSX that is case preserving and
292 insensitive. It's the default setting on HFS+ but can be changed.
293 """
294 if not isabs(path):
295 raise ValueError(
296 'Can\'t get native path case for a non-absolute path: %s' % path,
297 path)
298 if path.startswith('/dev'):
299 # /dev is not visible from Carbon, causing an exception.
300 return path
301
302 # Starts assuming there is no symlink along the path.
303 resolved = _native_case(path)
304 if resolved.lower() == path.lower():
305 # This code path is considerably faster.
306 return resolved
307
308 # There was a symlink, process it.
309 base, symlink, rest = _split_at_symlink_native(None, path)
310 assert symlink, (path, base, symlink, rest, resolved)
311 prev = base
312 base = safe_join(_native_case(base), symlink)
313 assert len(base) > len(prev)
314 while rest:
315 prev = base
316 relbase, symlink, rest = _split_at_symlink_native(base, rest)
317 base = safe_join(base, relbase)
318 assert len(base) > len(prev), (prev, base, symlink)
319 if symlink:
320 base = safe_join(base, symlink)
321 assert len(base) > len(prev), (prev, base, symlink)
322 # Make sure no symlink was resolved.
323 assert base.lower() == path.lower(), (base, path)
324 return base
325
326
327 else: # OSes other than Windows and OSX.
328
329
330 # On non-windows, keep the stdlib behavior.
331 isabs = os.path.isabs
332
333
334 def get_native_path_case(path):
335 """Returns the native path case for an existing file.
336
337 On OSes other than OSX and Windows, assume the file system is
338 case-sensitive.
339
340 TODO(maruel): This is not strictly true. Implement if necessary.
341 """
342 if not isabs(path):
343 raise ValueError(
344 'Can\'t get native path case for a non-absolute path: %s' % path,
345 path)
346 # Give up on cygwin, as GetLongPathName() can't be called.
347 # Linux traces tend to not be normalized, so use this occasion to normalize
348 # them. This function already normalizes the path on the other
349 # OSes, so it needs to be done here too to be coherent between OSes.
350 out = os.path.normpath(path)
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep):
352 return out + os.path.sep
353 return out
354
355
356 if sys.platform != 'win32': # All non-Windows OSes.
357
358
359 def safe_join(*args):
360 """Joins path elements like os.path.join() but doesn't abort on absolute
361 path.
362
363 os.path.join('foo', '/bar') == '/bar'
364 but safe_join('foo', '/bar') == 'foo/bar'.
365 """
366 out = ''
367 for element in args:
368 if element.startswith(os.path.sep):
369 if out.endswith(os.path.sep):
370 out += element[1:]
371 else:
372 out += element
373 else:
374 if out.endswith(os.path.sep):
375 out += element
376 else:
377 out += os.path.sep + element
378 return out
379
380
381 def split_at_symlink(base_dir, relfile):
382 """Scans each component of relfile and cut the string at the symlink if
383 there is any.
384
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if
386 not symlink was found.
387 """
388 if base_dir:
389 assert relfile
390 assert os.path.isabs(base_dir)
391 index = 0
392 else:
393 assert os.path.isabs(relfile)
394 index = 1
395
396 def at_root(rest):
397 if base_dir:
398 return safe_join(base_dir, rest)
399 return rest
400
401 while True:
402 try:
403 index = relfile.index(os.path.sep, index)
404 except ValueError:
405 index = len(relfile)
406 full = at_root(relfile[:index])
407 if os.path.islink(full):
408 # A symlink!
409 base = os.path.dirname(relfile[:index])
410 symlink = os.path.basename(relfile[:index])
411 rest = relfile[index:]
412 logging.debug(
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %
414 (base_dir, relfile, base, symlink, rest))
415 return base, symlink, rest
416 if index == len(relfile):
417 break
418 index += 1
419 return relfile, None, None
420
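# Illustrative examples with hypothetical paths: if /base/a/b is a symlink,
#   split_at_symlink('/base', 'a/b/c/d') == ('a', 'b', '/c/d')
# and when no component of relfile is a symlink,
#   split_at_symlink('/base', 'a/b/c/d') == ('a/b/c/d', None, None)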
421
422 def fix_python_path(cmd):
423 """Returns the fixed command line to call the right python executable."""
424 out = cmd[:]
425 if out[0] == 'python':
426 out[0] = sys.executable
427 elif out[0].endswith('.py'):
428 out.insert(0, sys.executable)
429 return out
430
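# Examples:
#   fix_python_path(['python', 'foo.py']) == [sys.executable, 'foo.py']
#   fix_python_path(['foo.py', '--bar']) == [sys.executable, 'foo.py', '--bar']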
431
432 def create_thunk():
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
434 os.write(
435 handle,
436 (
437 'import subprocess\n'
438 'import sys\n'
439 'sys.exit(subprocess.call(sys.argv[2:]))\n'
440 ))
441 os.close(handle)
442 return name
443
444
445 def strace_process_quoted_arguments(text):
446 """Extracts quoted arguments on a string and return the arguments as a list.
447
448 Implemented as an automaton. Supports incomplete strings in the form
449 '"foo"...'.
450
451 Example:
452 With text = '"foo", "bar"', the function will return ['foo', 'bar']
453
454 TODO(maruel): Implement escaping.
455 """
456 # All the possible states of the DFA.
457 ( NEED_QUOTE, # Beginning of a new argument.
458 INSIDE_STRING, # Inside an argument.
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
461 # a series of 3 dots or a comma.
462 NEED_SPACE, # Right after a comma.
463 NEED_DOT_2, # Found a dot, need a second one.
464 NEED_DOT_3, # Found second dot, need a third one.
465 NEED_COMMA, # Found third dot, need a comma.
466 ) = range(8)
467
468 state = NEED_QUOTE
469 out = []
470 for index, char in enumerate(text):
471 if char == '"':
472 if state == NEED_QUOTE:
473 state = INSIDE_STRING
474 # A new argument was found.
475 out.append('')
476 elif state == INSIDE_STRING:
477 # The argument is now closed.
478 state = NEED_COMMA_OR_DOT
479 elif state == ESCAPED:
480 out[-1] += char
481 state = INSIDE_STRING
482 else:
483 raise ValueError(
484 'Can\'t process char at column %d for: %r' % (index, text),
485 index,
486 text)
487 elif char == ',':
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
489 state = NEED_SPACE
490 elif state == INSIDE_STRING:
491 out[-1] += char
492 elif state == ESCAPED:
493 out[-1] += char
494 state = INSIDE_STRING
495 else:
496 raise ValueError(
497 'Can\'t process char at column %d for: %r' % (index, text),
498 index,
499 text)
500 elif char == ' ':
501 if state == NEED_SPACE:
502 state = NEED_QUOTE
503 elif state == INSIDE_STRING:
504 out[-1] += char
505 elif state == ESCAPED:
506 out[-1] += char
507 state = INSIDE_STRING
508 else:
509 raise ValueError(
510 'Can\'t process char at column %d for: %r' % (index, text),
511 index,
512 text)
513 elif char == '.':
514 if state == NEED_COMMA_OR_DOT:
515 # The string is incomplete; this means the strace -s flag should be
516 # increased.
517 state = NEED_DOT_2
518 elif state == NEED_DOT_2:
519 state = NEED_DOT_3
520 elif state == NEED_DOT_3:
521 state = NEED_COMMA
522 elif state == INSIDE_STRING:
523 out[-1] += char
524 elif state == ESCAPED:
525 out[-1] += char
526 state = INSIDE_STRING
527 else:
528 raise ValueError(
529 'Can\'t process char at column %d for: %r' % (index, text),
530 index,
531 text)
532 elif char == '\\':
533 if state == ESCAPED:
534 out[-1] += char
535 state = INSIDE_STRING
536 elif state == INSIDE_STRING:
537 state = ESCAPED
538 else:
539 raise ValueError(
540 'Can\'t process char at column %d for: %r' % (index, text),
541 index,
542 text)
543 else:
544 if state == INSIDE_STRING:
545 out[-1] += char
546 else:
547 raise ValueError(
548 'Can\'t process char at column %d for: %r' % (index, text),
549 index,
550 text)
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
552 raise ValueError(
553 'String is incorrectly terminated: %r' % text,
554 text)
555 return out
556
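# Examples of what the automaton accepts; the second string was truncated by
# strace's -s limit:
#   strace_process_quoted_arguments('"/bin/sh", "-c", "echo hi"')
#   == ['/bin/sh', '-c', 'echo hi']
#   strace_process_quoted_arguments('"/bin/sh", "-c", "echo h"...')
#   == ['/bin/sh', '-c', 'echo h']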
557
558 def read_json(filepath):
559 with open(filepath, 'r') as f:
560 return json.load(f)
561
562
563 def write_json(filepath_or_handle, data, dense):
564 """Writes data into filepath or file handle encoded as json.
565
566 If dense is True, the json is packed. Otherwise, it is human readable.
567 """
568 if hasattr(filepath_or_handle, 'write'):
569 if dense:
570 filepath_or_handle.write(
571 json.dumps(data, sort_keys=True, separators=(',',':')))
572 else:
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
574 else:
575 with open(filepath_or_handle, 'wb') as f:
576 if dense:
577 json.dump(data, f, sort_keys=True, separators=(',',':'))
578 else:
579 json.dump(data, f, sort_keys=True, indent=2)
580
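# Example of the two encodings, assuming a scratch file path:
#   write_json('/tmp/out.json', {'a': [1, 2]}, True)
#   -> the file contains '{"a":[1,2]}'
#   write_json('/tmp/out.json', {'a': [1, 2]}, False)
#   -> the same data, indented over several lines.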
581
582 class Results(object):
583 """Results of a trace session."""
584
585 class _TouchedObject(object):
586 """Something, a file or a directory, that was accessed."""
587 def __init__(self, root, path, tainted, size, nb_files):
588 logging.debug(
589 '%s(%s, %s, %s, %s, %s)' %
590 (self.__class__.__name__, root, path, tainted, size, nb_files))
591 self.root = root
592 self.path = path
593 self.tainted = tainted
594 self.nb_files = nb_files
595 # Can be used as a cache or a default value, depending on context.
596 self._size = size
597 # These are cache only.
598 self._real_path = None
599
600 # Check internal consistency.
601 assert path, path
602 assert tainted or bool(root) != bool(isabs(path)), (root, path)
603 assert tainted or (
604 not os.path.exists(self.full_path) or
605 (self.full_path == get_native_path_case(self.full_path))), (
606 tainted, self.full_path, get_native_path_case(self.full_path))
607
608 @property
609 def existent(self):
610 return self.size != -1
611
612 @property
613 def full_path(self):
614 if self.root:
615 return os.path.join(self.root, self.path)
616 return self.path
617
618 @property
619 def real_path(self):
620 """Returns the path with symlinks resolved."""
621 if not self._real_path:
622 self._real_path = os.path.realpath(self.full_path)
623 return self._real_path
624
625 @property
626 def size(self):
627 """File's size. -1 is not existent."""
628 if self._size is None and not self.tainted:
629 try:
630 self._size = os.stat(self.full_path).st_size
631 except OSError:
632 self._size = -1
633 return self._size
634
635 def flatten(self):
636 """Returns a dict representing this object.
637
638 A 'size' of 0 means the file was only touched and not read.
639 """
640 return {
641 'path': self.path,
642 'size': self.size,
643 }
644
645 def replace_variables(self, variables):
646 """Replaces the root of this File with one of the variables if it matches.
647
648 If a variable replacement occurs, the cloned object becomes tainted.
649 """
650 for variable, root_path in variables.iteritems():
651 if self.path.startswith(root_path):
652 return self._clone(
653 self.root, variable + self.path[len(root_path):], True)
654 # No need to clone, returns ourself.
655 return self
656
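# Illustrative example, with a hypothetical variables mapping:
#   a File at '/src/out/Release/chrome' with
#   variables == {'<(PRODUCT_DIR)': '/src/out/Release'}
#   is cloned as a tainted File at '<(PRODUCT_DIR)/chrome'.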
657 def strip_root(self, root):
658 """Returns a clone of itself with 'root' stripped off."""
659 # Check internal consistency.
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root
661 if not self.full_path.startswith(root):
662 # Now try to resolve the symlinks to see if it can be reached this way.
663 # Only try *after* trying without resolving symlink.
664 if not self.real_path.startswith(root):
665 return None
666 path = self.real_path
667 else:
668 path = self.full_path
669 return self._clone(root, path[len(root):], self.tainted)
670
671 def _clone(self, new_root, new_path, tainted):
672 raise NotImplementedError(self.__class__.__name__)
673
674 class File(_TouchedObject):
675 """A file that was accessed. May not be present anymore.
676
677 If tainted is true, it means it is not a real path anymore as a variable
678 replacement occurred.
679
680 If touched_only is True, this means the file was probed for existence, and
681 it is existent, but was never _opened_. If touched_only is True, the file
682 must have existed.
683 """
684 def __init__(self, root, path, tainted, size):
685 super(Results.File, self).__init__(root, path, tainted, size, 1)
686
687 def _clone(self, new_root, new_path, tainted):
688 """Clones itself keeping meta-data."""
689 # Keep the self.size and self._real_path caches for performance reasons. It
690 # is also important when the file becomes tainted (with a variable instead
691 # of the real path) since self.path is not an on-disk path anymore so
692 # out._size cannot be updated.
693 out = self.__class__(new_root, new_path, tainted, self.size)
694 out._real_path = self._real_path
695 return out
696
697 class Directory(_TouchedObject):
698 """A directory of files. Must exist."""
699 def __init__(self, root, path, tainted, size, nb_files):
700 """path='.' is a valid value and must be handled appropriately."""
701 assert not path.endswith(os.path.sep), path
702 super(Results.Directory, self).__init__(
703 root, path + os.path.sep, tainted, size, nb_files)
704 # For a Directory instance, self.size is not a cache, it's an actual value
705 # that is never modified and represents the total size of the files
706 # contained in this directory. It is possible that the directory is empty
707 # so that size == 0; this happens if there's only an invalid symlink in
708 # it.
709
710 def flatten(self):
711 out = super(Results.Directory, self).flatten()
712 out['nb_files'] = self.nb_files
713 return out
714
715 def _clone(self, new_root, new_path, tainted):
716 """Clones itself keeping meta-data."""
717 out = self.__class__(
718 new_root,
719 new_path.rstrip(os.path.sep),
720 tainted,
721 self.size,
722 self.nb_files)
723 out._real_path = self._real_path
724 return out
725
726 class Process(object):
727 """A process that was traced.
728
729 Contains references to the files accessed by this process and its children.
730 """
731 def __init__(self, pid, files, executable, command, initial_cwd, children):
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
733 self.pid = pid
734 self.files = sorted(files, key=lambda x: x.path)
735 self.children = children
736 self.executable = executable
737 self.command = command
738 self.initial_cwd = initial_cwd
739
740 # Check internal consistency.
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted(
742 f.path for f in self.files)
743 assert isinstance(self.children, list)
744 assert isinstance(self.files, list)
745
746 @property
747 def all(self):
748 for child in self.children:
749 for i in child.all:
750 yield i
751 yield self
752
753 def flatten(self):
754 return {
755 'children': [c.flatten() for c in self.children],
756 'command': self.command,
757 'executable': self.executable,
758 'files': [f.flatten() for f in self.files],
759 'initial_cwd': self.initial_cwd,
760 'pid': self.pid,
761 }
762
763 def strip_root(self, root):
764 assert isabs(root) and root.endswith(os.path.sep), root
765 # Loads the files after since they are constructed as objects.
766 out = self.__class__(
767 self.pid,
768 filter(None, (f.strip_root(root) for f in self.files)),
769 self.executable,
770 self.command,
771 self.initial_cwd,
772 [c.strip_root(root) for c in self.children])
773 logging.debug(
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
775 return out
776
777 def __init__(self, process):
778 self.process = process
779 # Cache.
780 self._files = None
781
782 def flatten(self):
783 return {
784 'root': self.process.flatten(),
785 }
786
787 @property
788 def files(self):
789 if self._files is None:
790 self._files = sorted(
791 sum((p.files for p in self.process.all), []),
792 key=lambda x: x.path)
793 return self._files
794
795 @property
796 def existent(self):
797 return [f for f in self.files if f.existent]
798
799 @property
800 def non_existent(self):
801 return [f for f in self.files if not f.existent]
802
803 def strip_root(self, root):
804 """Returns a clone with all the files outside the directory |root| removed
805 and converts all the paths to relative paths.
806 """
807 # Resolve any symlink
808 root = os.path.realpath(root)
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep
810 logging.debug('strip_root(%s)' % root)
811 return Results(self.process.strip_root(root))
812
813
814 class ApiBase(object):
815 """OS-agnostic API to trace a process and its children."""
816 class Context(object):
817 """Processes one log line at a time and keeps the list of traced processes.
818
819 The parsing is complicated by the fact that logs are traced out of order for
820 strace but in-order for dtrace and logman. In addition, on Windows it is
821 very frequent that process ids are reused, so a flat list cannot be used. But
822 at the same time, it is impossible to faithfully construct a graph when the
823 logs are processed out of order. So both a tree and a flat mapping are used:
824 the tree is the real process tree, while the flat mapping stores the last
825 valid process for the corresponding process id. For the strace case, the
826 tree's head is guessed at the last moment.
827 """
828 class Process(object):
829 """Keeps context for one traced child process.
830
831 Logs all the files this process touched. Ignores directories.
832 """
833 def __init__(self, blacklist, pid, initial_cwd):
834 # Check internal consistency.
835 assert isinstance(pid, int), repr(pid)
836 self.pid = pid
837 # children are Process instances.
838 self.children = []
839 self.initial_cwd = initial_cwd
840 self.cwd = None
841 self.files = set()
842 self.only_touched = set()
843 self.executable = None
844 self.command = None
845 self._blacklist = blacklist
846
847 def to_results_process(self):
848 """Resolves file case sensitivity and or late-bound strings."""
849 # When resolving files, it's normal to get dupe because a file could be
850 # opened multiple times with different case. Resolve the deduplication
851 # here.
852 def fix_path(x):
853 """Returns the native file path case.
854
855 Converts late-bound strings.
856 """
857 if not x:
858 # Do not convert None instance to 'None'.
859 return x
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't
861 # know the file path encoding so they must be treated as bytes.
862 x = unicode(x)
863 if os.path.isabs(x):
864 # If the path is not absolute, which tends to happen occasionally on
865 # Windows, it is not possible to get the native path case so ignore
866 # that trace. It mostly happens for 'executable' value.
867 x = get_native_path_case(x)
868 return x
869
870 def fix_and_blacklist_path(x):
871 x = fix_path(x)
872 if not x:
873 return
874 # The blacklist needs to be reapplied, since path casing could
875 # influence blacklisting.
876 if self._blacklist(x):
877 return
878 return x
879
880 # Filters out directories. Some may have passed through.
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f)
882 only_touched = set(
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f)
884 only_touched -= files
885
886 files = [
887 Results.File(None, f, False, None) for f in files
888 if not os.path.isdir(f)
889 ]
890 # Using 0 as size means the file's content is ignored since the file was
891 # never opened for I/O.
892 files.extend(
893 Results.File(None, f, False, 0) for f in only_touched
894 if not os.path.isdir(f)
895 )
896 return Results.Process(
897 self.pid,
898 files,
899 fix_path(self.executable),
900 self.command,
901 fix_path(self.initial_cwd),
902 [c.to_results_process() for c in self.children])
903
904 def add_file(self, filepath, touch_only):
905 """Adds a file if it passes the blacklist."""
906 if self._blacklist(unicode(filepath)):
907 return
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))
909 # Note that filepath and not unicode(filepath) is added. It is because
910 # filepath could be something other than a string, like a RelativePath
911 # instance for dtrace logs.
912 if touch_only:
913 self.only_touched.add(filepath)
914 else:
915 self.files.add(filepath)
916
917 def __init__(self, blacklist):
918 self.blacklist = blacklist
919 # Initial process.
920 self.root_process = None
921 # dict to accelerate process lookup, to not have to search the whole graph
922 # each time.
923 self._process_lookup = {}
924
925 class Tracer(object):
926 """During it's lifetime, the tracing subsystem is enabled."""
927 def __init__(self, logname):
928 self._logname = logname
929 self._lock = threading.Lock()
930 self._traces = []
931 self._initialized = True
932
933 def trace(self, cmd, cwd, tracename, output):
934 """Runs the OS-specific trace program on an executable.
935
936 Arguments:
937 - cmd: The command (a list) to run.
938 - cwd: Current directory to start the child process in.
939 - tracename: Name of the trace in the logname file.
940 - output: If False, redirects output to PIPEs.
941
942 Returns a tuple (resultcode, output) and updates the internal trace
943 entries.
944 """
945 # The implementation adds an item to self._traces.
946 raise NotImplementedError(self.__class__.__name__)
947
948 def close(self, _timeout=None):
949 """Saves the meta-data in the logname file.
950
951 For kernel-based tracing, stops the tracing subsystem.
952
953 Must not be used manually when using 'with' construct.
954 """
955 with self._lock:
956 assert self._initialized
957 try:
958 data = {
959 'traces': self._traces,
960 }
961 write_json(self._logname, data, False)
962 finally:
963 self._initialized = False
964
965 def post_process_log(self):
966 """Post-processes the log so it becomes faster to load afterward.
967
968 Must not be used manually when using 'with' construct.
969 """
970 assert not self._initialized, 'Must stop tracing first.'
971
972 def __enter__(self):
973 """Enables 'with' statement."""
974 return self
975
976 def __exit__(self, exc_type, exc_value, traceback):
977 """Enables 'with' statement."""
978 self.close()
979 # If an exception was thrown, do not process logs.
980 if not exc_type:
981 self.post_process_log()
982
983 def get_tracer(self, logname):
984 """Returns an ApiBase.Tracer instance.
985
986 Initializes the tracing subsystem, which is a requirement for kernel-based
987 tracers. Only one tracer instance should be live at a time!
988
989 logname is the filepath to the json file that will contain the meta-data
990 about the logs.
991 """
992 return self.Tracer(logname)
993
994 @staticmethod
995 def clean_trace(logname):
996 """Deletes an old log."""
997 raise NotImplementedError()
998
999 @classmethod
1000 def parse_log(cls, logname, blacklist):
1001 """Processes trace logs and returns the files opened and the files that do
1002 not exist.
1003
1004 It does not track directories.
1005
1006 Most of the time, files that do not exist are temporary test files that
1007 should be put in /tmp instead. See http://crbug.com/116251.
1008
1009 Returns a list of dict with keys:
1010 - results: A Results instance.
1011 - trace: The corresponding tracename parameter provided to
1012 get_tracer().trace().
1013 - output: Output gathered during execution, if get_tracer().trace(...,
1014 output=False) was used.
1015 """
1016 raise NotImplementedError(cls.__name__)
1017
1018
1019 class Strace(ApiBase):
1020 """strace implies linux."""
1021 class Context(ApiBase.Context):
1022 """Processes a strace log line and keeps the list of existent and non
1023 existent files accessed.
1024
1025 Ignores directories.
1026
1027 Uses late-binding to processes the cwd of each process. The problem is that
1028 strace generates one log file per process it traced but doesn't give any
1029 information about which process was started when and by who. So we don't
1030 even know which process is the initial one. So process the logs out of
1031 order and use late binding with RelativePath to be able to deduce the
1032 initial directory of each process once all the logs are parsed.
1033 """
1034 class Process(ApiBase.Context.Process):
1035 """Represents the state of a process.
1036
1037 Contains all the information retrieved from the pid-specific log.
1038 """
1039 # Function names match ([a-z_0-9]+).
1040 # This is the most common format. function(args) = result
1041 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$')
1042 # An interrupted function call, only grab the minimal header.
1043 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
1044 # A resumed function call.
1045 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
1046 # A process received a signal.
1047 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
1048 # A process didn't handle a signal. Ignore any junk appearing before it,
1049 # because the process was forcibly killed so it won't open any new files.
1050 RE_KILLED = re.compile(
1051 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
1052 # The process has exited.
1053 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
1054 # A call was canceled. Ignore any prefix.
1055 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
1056 # Happens when strace fails to even get the function name.
1057 UNNAMED_FUNCTION = '????'
1058
1059 # Corner case in Python: a class member function decorator must not be
1060 # @staticmethod.
1061 def parse_args(regexp, expect_zero): # pylint: disable=E0213
1062 """Automatically convert the str 'args' into a list of processed
1063 arguments.
1064
1065 Arguments:
1066 - regexp is used to parse args.
1067 - expect_zero: one of True, False or None.
1068 - True: will check for result.startswith('0') first and will ignore
1069 the trace line completely otherwise. This is important because for
1070 many functions, the regexp will not process if the call failed.
1071 - False: will check for not result.startswith(('?', '-1')) for the
1072 same reason as with True.
1073 - None: ignore result.
1074 """
1075 def meta_hook(function):
1076 assert function.__name__.startswith('handle_')
1077 def hook(self, args, result):
1078 if expect_zero is True and not result.startswith('0'):
1079 return
1080 if expect_zero is False and result.startswith(('?', '-1')):
1081 return
1082 match = re.match(regexp, args)
1083 if not match:
1084 raise TracingFailure(
1085 'Failed to parse %s(%s) = %s' %
1086 (function.__name__[len('handle_'):], args, result),
1087 None, None, None)
1088 return function(self, match.groups(), result)
1089 return hook
1090 return meta_hook
1091
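# Sketch of how this decorator is used by the handlers below, e.g.
# handle_open(): for a strace line like
#   open("/etc/passwd", O_RDONLY) = 3
# RE_HEADER splits it into ('open', '"/etc/passwd", O_RDONLY', '3'), then the
# decorated handler re-parses the args with its own regexp and receives
# ('/etc/passwd', 'O_RDONLY', '') as match groups. With expect_zero=False, a
# failed call such as '= -1 ENOENT (...)' is dropped before the handler runs.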
1092 class RelativePath(object):
1093 """A late-bound relative path."""
1094 def __init__(self, parent, value):
1095 self.parent = parent
1096 self.value = value
1097
1098 def render(self):
1099 """Returns the current directory this instance is representing.
1100
1101 This function is used to return the late-bound value.
1102 """
1103 if self.value and self.value.startswith(u'/'):
1104 # An absolute path.
1105 return self.value
1106 parent = self.parent.render() if self.parent else u'<None>'
1107 if self.value:
1108 return os.path.normpath(os.path.join(parent, self.value))
1109 return parent
1110
1111 def __unicode__(self):
1112 """Acts as a string whenever needed."""
1113 return unicode(self.render())
1114
1115 def __str__(self):
1116 """Acts as a string whenever needed."""
1117 return str(self.render())
1118
1119 def __init__(self, root, pid):
1120 """Keeps enough information to be able to guess the original process
1121 root.
1122
1123 strace doesn't store which process was the initial process. So more
1124 information needs to be kept so the graph can be reconstructed from the
1125 flat map.
1126 """
1127 logging.info('%s(%d)' % (self.__class__.__name__, pid))
1128 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
1129 assert isinstance(root, ApiBase.Context)
1130 self._root = weakref.ref(root)
1131 # The dict key is the function name of the pending call, like 'open'
1132 # or 'execve'.
1133 self._pending_calls = {}
1134 self._line_number = 0
1135 # Current directory when the process started.
1136 self.initial_cwd = self.RelativePath(self._root(), None)
1137 self.parentid = None
1138
1139 def get_cwd(self):
1140 """Returns the best known value of cwd."""
1141 return self.cwd or self.initial_cwd
1142
1143 def render(self):
1144 """Returns the string value of the RelativePath() object.
1145
1146 Used by RelativePath. Returns the initial directory and not the
1147 current one since the current directory 'cwd' validity is time-limited.
1148
1149 The validity is only guaranteed once all the logs are processed.
1150 """
1151 return self.initial_cwd.render()
1152
1153 def on_line(self, line):
1154 self._line_number += 1
1155 if self.RE_SIGNAL.match(line):
1156 # Ignore signals.
1157 return
1158
1159 try:
1160 match = self.RE_KILLED.match(line)
1161 if match:
1162 # Converts a '+++ killed by Foo +++' trace into an exit_group().
1163 self.handle_exit_group(match.group(1), None)
1164 return
1165
1166 match = self.RE_PROCESS_EXITED.match(line)
1167 if match:
1168 # Converts a '+++ exited with 1 +++' trace into an exit_group()
1169 self.handle_exit_group(match.group(1), None)
1170 return
1171
1172 match = self.RE_UNFINISHED.match(line)
1173 if match:
1174 if match.group(1) in self._pending_calls:
1175 raise TracingFailure(
1176 'Found two unfinished calls for the same function',
1177 None, None, None,
1178 self._pending_calls)
1179 self._pending_calls[match.group(1)] = (
1180 match.group(1) + match.group(2))
1181 return
1182
1183 match = self.RE_UNAVAILABLE.match(line)
1184 if match:
1185 # This usually means a process was killed and a pending call was
1186 # canceled.
1187 # TODO(maruel): Look up the last exit_group() trace just above and
1188 # make sure any self._pending_calls[anything] is properly flushed.
1189 return
1190
1191 match = self.RE_RESUMED.match(line)
1192 if match:
1193 if match.group(1) not in self._pending_calls:
1194 raise TracingFailure(
1195 'Found a resumed call that was not logged as unfinished',
1196 None, None, None,
1197 self._pending_calls)
1198 pending = self._pending_calls.pop(match.group(1))
1199 # Reconstruct the line.
1200 line = pending + match.group(2)
1201
1202 match = self.RE_HEADER.match(line)
1203 if not match:
1204 raise TracingFailure(
1205 'Found an invalid line: %s' % line,
1206 None, None, None)
1207 if match.group(1) == self.UNNAMED_FUNCTION:
1208 return
1209
1210 # It's a valid line, handle it.
1211 handler = getattr(self, 'handle_%s' % match.group(1), None)
1212 if not handler:
1213 self._handle_unknown(match.group(1), match.group(2), match.group(3))
1214 return handler(match.group(2), match.group(3))
1215 except TracingFailure, e:
1216 # Hack in the values since the handler could be a static function.
1217 e.pid = self.pid
1218 e.line = line
1219 e.line_number = self._line_number
1220 # Re-raise the modified exception.
1221 raise
1222 except (KeyError, NotImplementedError, ValueError), e:
1223 raise TracingFailure(
1224 'Trace generated a %s exception: %s' % (
1225 e.__class__.__name__, str(e)),
1226 self.pid,
1227 self._line_number,
1228 line,
1229 e)
1230
1231 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
1232 def handle_access(self, args, _result):
1233 self._handle_file(args[0], True)
1234
1235 @parse_args(r'^\"(.+?)\"$', True)
1236 def handle_chdir(self, args, _result):
1237 """Updates cwd."""
1238 self.cwd = self.RelativePath(self, args[0])
1239 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
1240
1241 def handle_clone(self, _args, result):
1242 """Transfers cwd."""
1243 if result.startswith(('?', '-1')):
1244 # The call failed.
1245 return
1246 # Update the other process right away.
1247 childpid = int(result)
1248 child = self._root().get_or_set_proc(childpid)
1249 if child.parentid is not None or childpid in self.children:
1250 raise TracingFailure(
1251 'Found internal inconsistency in process lifetime detection '
1252 'during a clone() call',
1253 None, None, None)
1254
1255 # Copy the cwd object.
1256 child.initial_cwd = self.get_cwd()
1257 child.parentid = self.pid
1258 # It is necessary because the logs are processed out of order.
1259 self.children.append(child)
1260
1261 def handle_close(self, _args, _result):
1262 pass
1263
1264 def handle_chmod(self, _args, _result):
1265 pass
1266
1267 def handle_creat(self, _args, _result):
1268 # Ignore files created, since they didn't need to exist.
1269 pass
1270
1271 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
1272 def handle_execve(self, args, _result):
1273 # Even if in practice execve() doesn't return when it succeeds, strace
1274 # still prints '0' as the result.
1275 filepath = args[0]
1276 self._handle_file(filepath, False)
1277 self.executable = self.RelativePath(self.get_cwd(), filepath)
1278 self.command = strace_process_quoted_arguments(args[1])
1279
1280 def handle_exit_group(self, _args, _result):
1281 """Removes cwd."""
1282 self.cwd = None
1283
1284 def handle_fork(self, args, result):
1285 self._handle_unknown('fork', args, result)
1286
1287 def handle_getcwd(self, _args, _result):
1288 pass
1289
1290 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1291 def handle_link(self, args, _result):
1292 self._handle_file(args[0], False)
1293 self._handle_file(args[1], False)
1294
1295 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1296 def handle_lstat(self, args, _result):
1297 self._handle_file(args[0], True)
1298
1299 def handle_mkdir(self, _args, _result):
1300 pass
1301
1302 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1303 def handle_open(self, args, _result):
1304 if 'O_DIRECTORY' in args[1]:
1305 return
1306 self._handle_file(args[0], False)
1307
1308 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
1309 def handle_openat(self, args, _result):
1310 if 'O_DIRECTORY' in args[2]:
1311 return
1312 if args[0] == 'AT_FDCWD':
1313 self._handle_file(args[1], False)
1314 else:
1315 # TODO(maruel): Implement relative open if necessary instead of the
1316 # AT_FDCWD flag, let's hope not since this means tracking all active
1317 # directory handles.
1318 raise Exception('Relative open via openat not implemented.')
1319
1320 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
1321 def handle_readlink(self, args, _result):
1322 self._handle_file(args[0], False)
1323
1324 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
1325 def handle_rename(self, args, _result):
1326 self._handle_file(args[0], False)
1327 self._handle_file(args[1], False)
1328
1329 def handle_rmdir(self, _args, _result):
1330 pass
1331
1332 def handle_setxattr(self, _args, _result):
1333 pass
1334
1335 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
1336 def handle_stat(self, args, _result):
1337 self._handle_file(args[0], True)
1338
1339 def handle_symlink(self, _args, _result):
1340 pass
1341
1342 def handle_unlink(self, _args, _result):
1343 # In theory, the file had to be created anyway.
1344 pass
1345
1346 def handle_statfs(self, _args, _result):
1347 pass
1348
1349 def handle_vfork(self, args, result):
1350 self._handle_unknown('vfork', args, result)
1351
1352 @staticmethod
1353 def _handle_unknown(function, args, result):
1354 raise TracingFailure(
1355 'Unexpected/unimplemented trace %s(%s)= %s' %
1356 (function, args, result),
1357 None, None, None)
1358
1359 def _handle_file(self, filepath, touch_only):
1360 filepath = self.RelativePath(self.get_cwd(), filepath)
1361 #assert not touch_only, unicode(filepath)
1362 self.add_file(filepath, touch_only)
1363
1364 def __init__(self, blacklist, initial_cwd):
1365 super(Strace.Context, self).__init__(blacklist)
1366 self.initial_cwd = initial_cwd
1367
1368 def render(self):
1369 """Returns the string value of the initial cwd of the root process.
1370
1371 Used by RelativePath.
1372 """
1373 return self.initial_cwd
1374
1375 def on_line(self, pid, line):
1376 """Transfers control into the Process.on_line() function."""
1377 self.get_or_set_proc(pid).on_line(line.strip())
1378
1379 def to_results(self):
1380 """Finds back the root process and verify consistency."""
1381 # TODO(maruel): Absolutely unecessary, fix me.
1382 root = [p for p in self._process_lookup.itervalues() if not p.parentid]
1383 if len(root) != 1:
1384 raise TracingFailure(
1385 'Found internal inconsistency in process lifetime detection '
1386 'while finding the root process',
1387 None,
1388 None,
1389 None,
1390 sorted(p.pid for p in root))
1391 self.root_process = root[0]
1392 process = self.root_process.to_results_process()
1393 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1394 raise TracingFailure(
1395 'Found internal inconsistency in process lifetime detection '
1396 'while looking for len(tree) == len(list)',
1397 None,
1398 None,
1399 None,
1400 sorted(self._process_lookup),
1401 sorted(p.pid for p in process.all))
1402 return Results(process)
1403
1404 def get_or_set_proc(self, pid):
1405 """Returns the Context.Process instance for this pid or creates a new one.
1406 """
1407 if not pid or not isinstance(pid, int):
1408 raise TracingFailure(
1409 'Unexpected value for pid: %r' % pid,
1410 pid,
1411 None,
1412 None,
1413 pid)
1414 if pid not in self._process_lookup:
1415 self._process_lookup[pid] = self.Process(self, pid)
1416 return self._process_lookup[pid]
1417
1418 @classmethod
1419 def traces(cls):
1420 """Returns the list of all handled traces to pass this as an argument to
1421 strace.
1422 """
1423 prefix = 'handle_'
1424 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
1425
1426 class Tracer(ApiBase.Tracer):
1427 MAX_LEN = 256
1428
1429 def trace(self, cmd, cwd, tracename, output):
1430 """Runs strace on an executable."""
1431 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
1432 assert os.path.isabs(cmd[0]), cmd[0]
1433 assert os.path.isabs(cwd), cwd
1434 assert os.path.normpath(cwd) == cwd, cwd
1435 with self._lock:
1436 if not self._initialized:
1437 raise TracingFailure(
1438 'Called Tracer.trace() on an uninitialized object',
1439 None, None, None, tracename)
1440 assert tracename not in (i['trace'] for i in self._traces)
1441 stdout = stderr = None
1442 if output:
1443 stdout = subprocess.PIPE
1444 stderr = subprocess.STDOUT
1445 # Ensure all file related APIs are hooked.
1446 traces = ','.join(Strace.Context.traces() + ['file'])
1447 trace_cmd = [
1448 'strace',
1449 '-ff',
1450 '-s', '%d' % self.MAX_LEN,
1451 '-e', 'trace=%s' % traces,
1452 '-o', self._logname + '.' + tracename,
1453 ]
1454 child = subprocess.Popen(
1455 trace_cmd + cmd,
1456 cwd=cwd,
1457 stdin=subprocess.PIPE,
1458 stdout=stdout,
1459 stderr=stderr)
1460 out = child.communicate()[0]
1461 # TODO(maruel): Walking the logs to figure out the root process would
1462 # simplify parsing the logs a *lot*.
1463 with self._lock:
1464 assert tracename not in (i['trace'] for i in self._traces)
1465 self._traces.append(
1466 {
1467 'cmd': cmd,
1468 'cwd': cwd,
1469 # The pid of strace process, not very useful.
1470 'pid': child.pid,
1471 'trace': tracename,
1472 'output': out,
1473 })
1474 return child.returncode, out
1475
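# As an example, trace(['/bin/ls', '/'], '/', 'ls', True) runs something
# equivalent to (the syscall list is abbreviated here):
#   strace -ff -s 256 -e trace=access,chdir,...,file -o <logname>.ls /bin/ls /
# and strace -ff then writes one '<logname>.ls.<pid>' file per traced process.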
1476 @staticmethod
1477 def clean_trace(logname):
1478 if os.path.isfile(logname):
1479 os.remove(logname)
1480 # Also delete any pid specific file from previous traces.
1481 for i in glob.iglob(logname + '.*'):
1482 if i.rsplit('.', 1)[1].isdigit():
1483 os.remove(i)
1484
1485 @classmethod
1486 def parse_log(cls, logname, blacklist):
1487 logging.info('parse_log(%s, %s)' % (logname, blacklist))
1488 data = read_json(logname)
1489 out = []
1490 for item in data['traces']:
1491 result = {
1492 'trace': item['trace'],
1493 'output': item['output'],
1494 }
1495 try:
1496 context = cls.Context(blacklist, item['cwd'])
1497 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
1498 pid = pidfile.rsplit('.', 1)[1]
1499 if pid.isdigit():
1500 pid = int(pid)
1501 # TODO(maruel): Load as utf-8
1502 for line in open(pidfile, 'rb'):
1503 context.on_line(pid, line)
1504 result['results'] = context.to_results()
1505 except TracingFailure, e:
1506 result['exception'] = e
1507 out.append(result)
1508 return out
1509
1510
1511 class Dtrace(ApiBase):
1512 """Uses DTrace framework through dtrace. Requires root access.
1513
1514 Implies Mac OSX.
1515
1516 dtruss can't be used because it has compatibility issues with Python.
1517
1518 Also, the pid->cwd handling needs to be done manually since OSX has no way to
1519 get the absolute path of the 'cwd' dtrace variable from the probe.
1520
1521 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
1522 https://discussions.apple.com/thread/1980539. So resort to handling execve()
1523 manually.
1524
1525 errno is not printed in the log since this implementation currently only cares
1526 about files that were successfully opened.
1527 """
1528 class Context(ApiBase.Context):
1529 # Format: index pid function(args)
1530 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
1531
1532 # Arguments parsing.
1533 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
1534 RE_CHDIR = re.compile(r'^\"(.+?)\"$')
1535 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
1536 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
1537 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
1538 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
1539
1540 O_DIRECTORY = 0x100000
1541
1542 class Process(ApiBase.Context.Process):
1543 def __init__(self, *args):
1544 super(Dtrace.Context.Process, self).__init__(*args)
1545 self.cwd = self.initial_cwd
1546
1547 def __init__(self, blacklist, tracer_pid, initial_cwd):
1548 logging.info(
1549 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))
1550 super(Dtrace.Context, self).__init__(blacklist)
1551 # Process ID of the temporary script created by create_thunk().
1552 self._tracer_pid = tracer_pid
1553 self._initial_cwd = initial_cwd
1554 self._line_number = 0
1555
1556 def on_line(self, line):
1557 self._line_number += 1
1558 match = self.RE_HEADER.match(line)
1559 if not match:
1560 raise TracingFailure(
1561 'Found malformed line: %s' % line,
1562 None,
1563 self._line_number,
1564 line)
1565 fn = getattr(
1566 self,
1567 'handle_%s' % match.group(2).replace('-', '_'),
1568 self._handle_ignored)
1569 # It is guaranteed to succeed because of the regexp. Or at least I thought
1570 # it would.
1571 pid = int(match.group(1))
1572 try:
1573 return fn(pid, match.group(3))
1574 except TracingFailure, e:
1575 # Hack in the values since the handler could be a static function.
1576 e.pid = pid
1577 e.line = line
1578 e.line_number = self._line_number
1579 # Re-raise the modified exception.
1580 raise
1581 except (KeyError, NotImplementedError, ValueError), e:
1582 raise TracingFailure(
1583 'Trace generated a %s exception: %s' % (
1584 e.__class__.__name__, str(e)),
1585 pid,
1586 self._line_number,
1587 line,
1588 e)
1589
1590 def to_results(self):
1591 process = self.root_process.to_results_process()
1592 # Internal consistency check.
1593 if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
1594 raise TracingFailure(
1595 'Found internal inconsistency in process lifetime detection '
1596 'while looking for len(tree) == len(list)',
1597 None,
1598 None,
1599 None,
1600 sorted(self._process_lookup),
1601 sorted(p.pid for p in process.all))
1602 return Results(process)
1603
1604 def handle_dtrace_BEGIN(self, _pid, args):
1605 if not self.RE_DTRACE_BEGIN.match(args):
1606 raise TracingFailure(
1607 'Found internal inconsistency in dtrace_BEGIN log line',
1608 None, None, None)
1609
1610 def handle_proc_start(self, pid, args):
1611 """Transfers cwd.
1612
1613 The dtrace script already takes care of only tracing the processes that
1614 are child of the traced processes so there is no need to verify the
1615 process hierarchy.
1616 """
1617 if pid in self._process_lookup:
1618 raise TracingFailure(
1619 'Found internal inconsistency in proc_start: %d started twice' %
1620 pid,
1621 None, None, None)
1622 match = self.RE_PROC_START.match(args)
1623 if not match:
1624 raise TracingFailure(
1625 'Failed to parse arguments: %s' % args,
1626 None, None, None)
1627 ppid = int(match.group(1))
1628 if ppid == self._tracer_pid and not self.root_process:
1629 proc = self.root_process = self.Process(
1630 self.blacklist, pid, self._initial_cwd)
1631 elif ppid in self._process_lookup:
1632 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
1633 self._process_lookup[ppid].children.append(proc)
1634 else:
1635 # Another process tree, ignore.
1636 return
1637 self._process_lookup[pid] = proc
1638 logging.debug(
1639 'New child: %s -> %d cwd:%s' %
1640 (ppid, pid, unicode(proc.initial_cwd)))
1641
1642 def handle_proc_exit(self, pid, _args):
1643 """Removes cwd."""
1644 if pid in self._process_lookup:
1645 # self._tracer_pid is not traced itself, and neither are other trace runs.
1646 self._process_lookup[pid].cwd = None
1647
1648 def handle_execve(self, pid, args):
1649 """Sets the process' executable.
1650
1651 TODO(maruel): Read command line arguments. See
1652 https://discussions.apple.com/thread/1980539 for an example.
1653 https://gist.github.com/1242279
1654
1655 Will have to put the answer at http://stackoverflow.com/questions/7556249.
1656 :)
1657 """
1658 if pid not in self._process_lookup:
1659 # Another process tree, ignore.
1660 return
1661 match = self.RE_EXECVE.match(args)
1662 if not match:
1663 raise TracingFailure(
1664 'Failed to parse arguments: %r' % args,
1665 None, None, None)
1666 proc = self._process_lookup[pid]
1667 proc.executable = match.group(1)
1668 proc.command = self.process_escaped_arguments(match.group(3))
1669 if int(match.group(2)) != len(proc.command):
1670 raise TracingFailure(
1671 'Failed to parse execve() arguments: %s' % args,
1672 None, None, None)
1673
1674 def handle_chdir(self, pid, args):
1675 """Updates cwd."""
1676 if pid not in self._process_lookup:
1677 # Another process tree, ignore.
1678 return
1679 cwd = self.RE_CHDIR.match(args).group(1)
1680 if not cwd.startswith('/'):
1681 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
1682 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
1683 else:
1684 logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
1685 cwd2 = cwd
1686 self._process_lookup[pid].cwd = cwd2
1687
1688 def handle_open_nocancel(self, pid, args):
1689 """Redirects to handle_open()."""
1690 return self.handle_open(pid, args)
1691
1692 def handle_open(self, pid, args):
1693 if pid not in self._process_lookup:
1694 # Another process tree, ignore.
1695 return
1696 match = self.RE_OPEN.match(args)
1697 if not match:
1698 raise TracingFailure(
1699 'Failed to parse arguments: %s' % args,
1700 None, None, None)
1701 flag = int(match.group(2), 16)
1702 if self.O_DIRECTORY & flag == self.O_DIRECTORY:
1703 # Ignore directories.
1704 return
1705 self._handle_file(pid, match.group(1))
1706
1707 def handle_rename(self, pid, args):
1708 if pid not in self._process_lookup:
1709 # Another process tree, ignore.
1710 return
1711 match = self.RE_RENAME.match(args)
1712 if not match:
1713 raise TracingFailure(
1714 'Failed to parse arguments: %s' % args,
1715 None, None, None)
1716 self._handle_file(pid, match.group(1))
1717 self._handle_file(pid, match.group(2))
1718
1719 def _handle_file(self, pid, filepath):
1720 if not filepath.startswith('/'):
1721 filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
1722 # We can get '..' in the path.
1723 filepath = os.path.normpath(filepath)
1724 # Sadly, still need to filter out directories here;
1725 # saw open_nocancel(".", 0, 0) = 0 lines.
1726 if os.path.isdir(filepath):
1727 return
1728 self._process_lookup[pid].add_file(filepath, False)
1729
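# Editor's illustrative sketch (not part of the original file): a relative
# path like "../data/file.txt" seen while the process cwd is "/src/out" is
# joined and normalized to "/src/data/file.txt" before being recorded with
# add_file().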
1730 def handle_ftruncate(self, pid, args):
1731 """Just used as a signal to kill dtrace, ignoring."""
1732 pass
1733
1734 @staticmethod
1735 def _handle_ignored(pid, args):
1736 """Is called for all the event traces that are not handled."""
1737 raise NotImplementedError('Please implement me')
1738
1739 @staticmethod
1740 def process_escaped_arguments(text):
1741 """Extracts escaped arguments on a string and return the arguments as a
1742 list.
1743
1744 Implemented as an automaton.
1745
1746 Example:
1747 With text = 'python2.7\\001-c\\001print(\\"hi\\")\\0', the
1748 function will return ['python2.7', '-c', 'print("hi")'].
1749 """
1750 if not text.endswith('\\0'):
1751 raise ValueError('String is not null terminated: %r' % text, text)
1752 text = text[:-2]
1753
1754 def unescape(x):
1755 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
1756 out = []
1757 escaped = False
1758 for i in x:
1759 if i == '\\' and not escaped:
1760 escaped = True
1761 continue
1762 escaped = False
1763 out.append(i)
1764 return ''.join(out)
1765
1766 return [unescape(i) for i in text.split('\\001')]
1767
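# Editor's illustrative round-trip (not part of the original file), matching
# the docstring example above:
#   process_escaped_arguments('python2.7\\001-c\\001print(\\"hi\\")\\0')
#   -> ['python2.7', '-c', 'print("hi")']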
1768 class Tracer(ApiBase.Tracer):
1769 # pylint: disable=C0301
1770 #
1771 # To understand the following code, you'll want to take a look at:
1772 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
1773 # https://wikis.oracle.com/display/DTrace/Variables
1774 # http://docs.oracle.com/cd/E19205-01/820-4221/
1775 #
1776 # 0. Dump all the valid probes into a text file. It is important to
1777 # redirect into a file so that you don't have to constantly 'sudo'.
1778 # $ sudo dtrace -l > probes.txt
1779 #
1780 # 1. Count the number of probes:
1781 # $ wc -l probes.txt
1782 # 81823 # On OSX 10.7, including 1 header line.
1783 #
1784 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
1785 # like, and skipping the header with NR>1:
1786 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
1787 # dtrace
1788 # fbt
1789 # io
1790 # ip
1791 # lockstat
1792 # mach_trap
1793 # proc
1794 # profile
1795 # sched
1796 # syscall
1797 # tcp
1798 # vminfo
1799 #
1800 # 3. List of valid probes:
1801 # $ grep syscall probes.txt | less
1802 # or use dtrace directly:
1803 # $ sudo dtrace -l -P syscall | less
1804 #
1805 # trackedpid is an associative array where its value can be 0, 1 or 2.
1806 # 0 is for untracked processes and is the default value for items not
1807 # in the associative array.
1808 # 1 is for tracked processes.
1809 # 2 is for the script created by create_thunk() only. It is not tracked
1810 # itself but all its descendants are.
1811 #
1812 # The script will kill itself only once waiting_to_die == 1 and
1813 # current_processes == 0, that is, once getlogin() has been called and
1814 # all traced processes have exited.
1815 #
1816 # TODO(maruel): Use cacheable predicates. See
1817 # https://wikis.oracle.com/display/DTrace/Performance+Considerations
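# Editor's illustrative state sketch (not part of the original file): with W
# the wrapper script created by create_thunk() and C any process W spawns,
#   trackedpid[W] == 2  (detected but not traced itself)
#   trackedpid[C] == 1  (fully traced)
#   any other pid -> 0  (default; untracked)
# and the script exits once waiting_to_die == 1 and current_processes == 0.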
1818 D_CODE = """
1819 dtrace:::BEGIN {
1820 waiting_to_die = 0;
1821 current_processes = 0;
1822 logindex = 0;
1823 printf("%d %d %s_%s(\\"%s\\")\\n",
1824 logindex, PID, probeprov, probename, SCRIPT);
1825 logindex++;
1826 }
1827
1828 proc:::start /trackedpid[ppid]/ {
1829 trackedpid[pid] = 1;
1830 current_processes += 1;
1831 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1832 logindex, pid, probeprov, probename,
1833 ppid,
1834 execname,
1835 current_processes);
1836 logindex++;
1837 }
1838 /* Should use SCRIPT but there is no access to this variable at that
1839 * point. */
1840 proc:::start /ppid == PID && execname == "Python"/ {
1841 trackedpid[pid] = 2;
1842 current_processes += 1;
1843 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
1844 logindex, pid, probeprov, probename,
1845 ppid,
1846 execname,
1847 current_processes);
1848 logindex++;
1849 }
1850 proc:::exit /trackedpid[pid] &&
1851 current_processes == 1 &&
1852 waiting_to_die == 1/ {
1853 trackedpid[pid] = 0;
1854 current_processes -= 1;
1855 printf("%d %d %s_%s(%d)\\n",
1856 logindex, pid, probeprov, probename,
1857 current_processes);
1858 logindex++;
1859 exit(0);
1860 }
1861 proc:::exit /trackedpid[pid]/ {
1862 trackedpid[pid] = 0;
1863 current_processes -= 1;
1864 printf("%d %d %s_%s(%d)\\n",
1865 logindex, pid, probeprov, probename,
1866 current_processes);
1867 logindex++;
1868 }
1869
1870 /* Use an arcane function to detect when we need to die */
1871 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
1872 waiting_to_die = 1;
1873 printf("%d %d %s()\\n", logindex, pid, probefunc);
1874 logindex++;
1875 }
1876 syscall::ftruncate:entry /
1877 pid == PID && arg0 == FILE_ID && current_processes == 0/ {
1878 exit(0);
1879 }
1880
1881 syscall::open*:entry /trackedpid[pid] == 1/ {
1882 self->open_arg0 = arg0;
1883 self->open_arg1 = arg1;
1884 self->open_arg2 = arg2;
1885 }
1886 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
1887 this->open_arg0 = copyinstr(self->open_arg0);
1888 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
1889 logindex, pid, probefunc,
1890 this->open_arg0,
1891 self->open_arg1,
1892 self->open_arg2);
1893 logindex++;
1894 this->open_arg0 = 0;
1895 }
1896 syscall::open*:return /trackedpid[pid] == 1/ {
1897 self->open_arg0 = 0;
1898 self->open_arg1 = 0;
1899 self->open_arg2 = 0;
1900 }
1901
1902 syscall::rename:entry /trackedpid[pid] == 1/ {
1903 self->rename_arg0 = arg0;
1904 self->rename_arg1 = arg1;
1905 }
1906 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
1907 this->rename_arg0 = copyinstr(self->rename_arg0);
1908 this->rename_arg1 = copyinstr(self->rename_arg1);
1909 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
1910 logindex, pid, probefunc,
1911 this->rename_arg0,
1912 this->rename_arg1);
1913 logindex++;
1914 this->rename_arg0 = 0;
1915 this->rename_arg1 = 0;
1916 }
1917 syscall::rename:return /trackedpid[pid] == 1/ {
1918 self->rename_arg0 = 0;
1919 self->rename_arg1 = 0;
1920 }
1921
1922 /* Track chdir; it's painful because it only receives relative paths.
1923 */
1924 syscall::chdir:entry /trackedpid[pid] == 1/ {
1925 self->chdir_arg0 = arg0;
1926 }
1927 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
1928 this->chdir_arg0 = copyinstr(self->chdir_arg0);
1929 printf("%d %d %s(\\"%s\\")\\n",
1930 logindex, pid, probefunc,
1931 this->chdir_arg0);
1932 logindex++;
1933 this->chdir_arg0 = 0;
1934 }
1935 syscall::chdir:return /trackedpid[pid] == 1/ {
1936 self->chdir_arg0 = 0;
1937 }
1938 """
1939
1940 # execve-specific code, tends to throw a lot of exceptions.
1941 D_CODE_EXECVE = """
1942 /* Finally what we care about! */
1943 syscall::exec*:entry /trackedpid[pid]/ {
1944 self->exec_arg0 = copyinstr(arg0);
1945 /* Incrementally probe for a NULL in the argv parameter of execve() to
1946 * figure out argc. */
1947 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
1948 * found. */
1949 self->exec_argc = 0;
1950 /* Probe for argc==1 */
1951 this->exec_argv = (user_addr_t*)copyin(
1952 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1953 self->exec_argc = this->exec_argv[self->exec_argc] ?
1954 (self->exec_argc + 1) : self->exec_argc;
1955
1956 /* Probe for argc==2 */
1957 this->exec_argv = (user_addr_t*)copyin(
1958 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1959 self->exec_argc = this->exec_argv[self->exec_argc] ?
1960 (self->exec_argc + 1) : self->exec_argc;
1961
1962 /* Probe for argc==3 */
1963 this->exec_argv = (user_addr_t*)copyin(
1964 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1965 self->exec_argc = this->exec_argv[self->exec_argc] ?
1966 (self->exec_argc + 1) : self->exec_argc;
1967
1968 /* Probe for argc==4 */
1969 this->exec_argv = (user_addr_t*)copyin(
1970 arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
1971 self->exec_argc = this->exec_argv[self->exec_argc] ?
1972 (self->exec_argc + 1) : self->exec_argc;
1973
1974 /* Copy the input strings since there is no guarantee they'll be
1975 * present after the call completed. */
1976 self->exec_argv0 = (self->exec_argc > 0) ?
1977 copyinstr(this->exec_argv[0]) : "";
1978 self->exec_argv1 = (self->exec_argc > 1) ?
1979 copyinstr(this->exec_argv[1]) : "";
1980 self->exec_argv2 = (self->exec_argc > 2) ?
1981 copyinstr(this->exec_argv[2]) : "";
1982 self->exec_argv3 = (self->exec_argc > 3) ?
1983 copyinstr(this->exec_argv[3]) : "";
1984 this->exec_argv = 0;
1985 }
1986 syscall::exec*:return /trackedpid[pid] && errno == 0/ {
1987 /* We need to join strings here, as using multiple printf() would
1988 * cause tearing when multiple threads/processes are traced.
1989 * Since it is impossible to escape a string and join it to another one,
1990 * like sprintf("%s%S", previous, more), use hackery.
1991 * The elements are separated with a \\1. \\0 cannot be used because
1992 * it is simply ignored. This will conflict with any program putting a
1993 * \\1 in their execve() string but this should be "rare enough" */
1994 this->args = "";
1995 /* Process exec_argv[0] */
1996 this->args = strjoin(
1997 this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
1998
1999 /* Process exec_argv[1] */
2000 this->args = strjoin(
2001 this->args, (self->exec_argc > 1) ? "\\1" : "");
2002 this->args = strjoin(
2003 this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
2004
2005 /* Process exec_argv[2] */
2006 this->args = strjoin(
2007 this->args, (self->exec_argc > 2) ? "\\1" : "");
2008 this->args = strjoin(
2009 this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
2010
2011 /* Process exec_argv[3] */
2012 this->args = strjoin(
2013 this->args, (self->exec_argc > 3) ? "\\1" : "");
2014 this->args = strjoin(
2015 this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
2016
2017 /* Prints self->exec_argc to permit verifying the internal
2018 * consistency since this code is quite fishy. */
2019 printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
2020 logindex, pid, probefunc,
2021 self->exec_arg0,
2022 self->exec_argc,
2023 this->args);
2024 logindex++;
2025 this->args = 0;
2026 }
2027 syscall::exec*:return /trackedpid[pid]/ {
2028 self->exec_arg0 = 0;
2029 self->exec_argc = 0;
2030 self->exec_argv0 = 0;
2031 self->exec_argv1 = 0;
2032 self->exec_argv2 = 0;
2033 self->exec_argv3 = 0;
2034 }
2035 """
2036
2037 # Code currently not used.
2038 D_EXTRANEOUS = """
2039 /* This is a good learning experience, since it traces a lot of things
2040 * related to the process and child processes.
2041 * Warning: it generates a gigantic log. For example, tracing
2042 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
2043 * several minutes to execute.
2044 */
2045 /*
2046 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2047 printf("%d %d %s_%s() = %d\\n",
2048 logindex, pid, probeprov, probefunc, errno);
2049 logindex++;
2050 }
2051 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2052 printf("%d %d %s_%s() = %d\\n",
2053 logindex, pid, probeprov, probefunc, errno);
2054 logindex++;
2055 }
2056 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2057 printf("%d %d %s_%s() = %d\\n",
2058 logindex, pid, probeprov, probefunc, errno);
2059 logindex++;
2060 }
2061 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2062 printf("%d %d %s_%s() = %d\\n",
2063 logindex, pid, probeprov, probefunc, errno);
2064 logindex++;
2065 }
2066 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
2067 printf("%d %d %s_%s() = %d\\n",
2068 logindex, pid, probeprov, probefunc, errno);
2069 logindex++;
2070 }
2071 */
2072 /* TODO(maruel): *stat* functions and friends
2073 syscall::access:return,
2074 syscall::chdir:return,
2075 syscall::chflags:return,
2076 syscall::chown:return,
2077 syscall::chroot:return,
2078 syscall::getattrlist:return,
2079 syscall::getxattr:return,
2080 syscall::lchown:return,
2081 syscall::lstat64:return,
2082 syscall::lstat:return,
2083 syscall::mkdir:return,
2084 syscall::pathconf:return,
2085 syscall::readlink:return,
2086 syscall::removexattr:return,
2087 syscall::setxattr:return,
2088 syscall::stat64:return,
2089 syscall::stat:return,
2090 syscall::truncate:return,
2091 syscall::unlink:return,
2092 syscall::utimes:return,
2093 */
2094 """
2095
2096 def __init__(self, logname):
2097 """Starts the log collection with dtrace.
2098
2099 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
2100 this needs to wait for dtrace to be "warmed up".
2101 """
2102 super(Dtrace.Tracer, self).__init__(logname)
2103 self._script = create_thunk()
2104 # This unique dummy temp file is used to signal the dtrace script that it
2105 # should stop as soon as all the child processes are done. A bit hackish
2106 # but it works well enough.
2107 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
2108 prefix='trace_signal_file')
2109
2110 # Note: do not use the -p flag. It's useless if the initial process quits
2111 # too fast, resulting in missing traces from the grand-children. The D
2112 # code manages the dtrace lifetime itself.
2113 trace_cmd = [
2114 'sudo',
2115 'dtrace',
2116 # Use a larger buffer if getting 'out of scratch space' errors.
2117 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
2118 '-b', '10m',
2119 '-x', 'dynvarsize=10m',
2120 #'-x', 'dtrace_global_maxsize=1m',
2121 '-x', 'evaltime=exec',
2122 '-o', '/dev/stderr',
2123 '-q',
2124 '-n', self._get_dtrace_code(),
2125 ]
2126 with open(self._logname + '.log', 'wb') as logfile:
2127 self._dtrace = subprocess.Popen(
2128 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
2129 logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
2130
2131 # Reads until one line is printed, which signifies dtrace is up and ready.
2132 with open(self._logname + '.log', 'rb') as logfile:
2133 while 'dtrace_BEGIN' not in logfile.readline():
2134 if self._dtrace.poll() is not None:
2135 # Do a busy wait. :/
2136 break
2137 logging.debug('dtrace started')
2138
2139 def _get_dtrace_code(self):
2140 """Setups the D code to implement child process tracking.
2141
2142 Injects the cookie in the script so it knows when to stop.
2143
2144 The script will detect any instance of the script created with
2145 create_thunk() and will start tracing it.
2146 """
2147 return (
2148 'inline int PID = %d;\n'
2149 'inline string SCRIPT = "%s";\n'
2150 'inline int FILE_ID = %d;\n'
2151 '\n'
2152 '%s') % (
2153 os.getpid(),
2154 self._script,
2155 self._dummy_file_id,
2156 self.D_CODE) + self.D_CODE_EXECVE
2157
2158 def trace(self, cmd, cwd, tracename, output):
2159 """Runs dtrace on an executable.
2160
2161 Since dtruss is broken when it starts the process itself or when tracing
2162 child processes, this code starts a wrapper process
2163 generated with create_thunk(), which starts the executable to trace.
2164 """
2165 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2166 assert os.path.isabs(cmd[0]), cmd[0]
2167 assert os.path.isabs(cwd), cwd
2168 assert os.path.normpath(cwd) == cwd, cwd
2169 with self._lock:
2170 if not self._initialized:
2171 raise TracingFailure(
2172 'Called Tracer.trace() on an uninitialized object',
2173 None, None, None, tracename)
2174 assert tracename not in (i['trace'] for i in self._traces)
2175
2176 # Starts the script wrapper to start the child process. This signals the
2177 # dtrace script that this process is to be traced.
2178 stdout = stderr = None
2179 if output:
2180 stdout = subprocess.PIPE
2181 stderr = subprocess.STDOUT
2182 child_cmd = [
2183 sys.executable,
2184 self._script,
2185 tracename,
2186 ]
2187 # Call a dummy function so that dtrace knows I'm about to launch a process
2188 # that needs to be traced.
2189 # Yummy.
2190 child = subprocess.Popen(
2191 child_cmd + fix_python_path(cmd),
2192 stdin=subprocess.PIPE,
2193 stdout=stdout,
2194 stderr=stderr,
2195 cwd=cwd)
2196 logging.debug('Started child pid: %d' % child.pid)
2197
2198 out = child.communicate()[0]
2199 # This doesn't mean tracing is done; one of the grand-child processes may
2200 # still be alive. It will be tracked by the dtrace script.
2201
2202 with self._lock:
2203 assert tracename not in (i['trace'] for i in self._traces)
2204 self._traces.append(
2205 {
2206 'cmd': cmd,
2207 'cwd': cwd,
2208 # The pid of the wrapper script process, not very useful.
2209 'pid': child.pid,
2210 'trace': tracename,
2211 'output': out,
2212 })
2213 return child.returncode, out
2214
2215 def close(self, timeout=None):
2216 """Terminates dtrace."""
2217 logging.debug('close(%s)' % timeout)
2218 try:
2219 try:
2220 super(Dtrace.Tracer, self).close(timeout)
2221 # Signal dtrace that it should stop now.
2222 os.ftruncate(self._dummy_file_id, 0)
2223 if timeout:
2224 start = time.time()
2225 # Use polling. :/
2226 while (self._dtrace.poll() is None and
2227 (time.time() - start) < timeout):
2228 time.sleep(0.1)
2229 self._dtrace.kill()
2230 self._dtrace.wait()
2231 finally:
2232 # Make sure to kill it in any case.
2233 if self._dtrace.poll() is None:
2234 try:
2235 self._dtrace.kill()
2236 self._dtrace.wait()
2237 except OSError:
2238 pass
2239
2240 if self._dtrace.returncode != 0:
2241 # Warn about any dtrace failure but basically ignore it.
2242 print 'dtrace failure: %s' % self._dtrace.returncode
2243 finally:
2244 os.close(self._dummy_file_id)
2245 os.remove(self._dummy_file_name)
2246 os.remove(self._script)
2247
2248 def post_process_log(self):
2249 """Sorts the log back in order when each call occured.
2250
2251 dtrace doesn't save the buffer in strict order since it keeps one buffer
2252 per CPU.
2253 """
2254 super(Dtrace.Tracer, self).post_process_log()
2255 logname = self._logname + '.log'
2256 with open(logname, 'rb') as logfile:
2257 lines = [l for l in logfile if l.strip()]
2258 errors = [l for l in lines if l.startswith('dtrace:')]
2259 if errors:
2260 raise TracingFailure(
2261 'Found errors in the trace: %s' % '\n'.join(errors),
2262 None, None, None, logname)
2263 try:
2264 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
2265 except ValueError:
2266 raise TracingFailure(
2267 'Found errors in the trace: %s' % '\n'.join(
2268 l for l in lines if l.split(' ', 1)[0].isdigit()),
2269 None, None, None, logname)
2270 with open(logname, 'wb') as logfile:
2271 logfile.write(''.join(lines))
2272
2273 @staticmethod
2274 def clean_trace(logname):
2275 for ext in ('', '.log'):
2276 if os.path.isfile(logname + ext):
2277 os.remove(logname + ext)
2278
2279 @classmethod
2280 def parse_log(cls, logname, blacklist):
2281 logging.info('parse_log(%s, ...)' % logname)
2282
2283 def blacklist_more(filepath):
2284 # All the HFS metadata is in the form /.vol/...
2285 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
2286
2287 data = read_json(logname)
2288 out = []
2289 for item in data['traces']:
2290 context = cls.Context(blacklist_more, item['pid'], item['cwd'])
2291 for line in open(logname + '.log', 'rb'):
2292 context.on_line(line)
2293 out.append(
2294 {
2295 'results': context.to_results(),
2296 'trace': item['trace'],
2297 'output': item['output'],
2298 })
2299 return out
2300
2301
2302 class LogmanTrace(ApiBase):
2303 """Uses the native Windows ETW based tracing functionality to trace a child
2304 process.
2305
2306 Caveat: this implementation doesn't track cwd or initial_cwd. That is because
2307 the Windows Kernel doesn't have a concept of 'current working directory' at
2308 all. A Win32 process has a map of current directories, one per drive letter
2309 and it is managed by the user mode kernel32.dll. In kernel, a file is always
2310 opened relative to another file_object or as an absolute path. All the current
2311 working directory logic is done in user mode.
2312 """
2313 class Context(ApiBase.Context):
2314 """Processes a ETW log line and keeps the list of existent and non
2315 existent files accessed.
2316
2317 Ignores directories.
2318 """
2319 # These indexes are for the stripped version in json.
2320 EVENT_NAME = 0
2321 TYPE = 1
2322 PID = 2
2323 TID = 3
2324 PROCESSOR_ID = 4
2325 TIMESTAMP = 5
2326 USER_DATA = 6
2327
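# Editor's illustrative sketch (assumed values, not part of the original
# file): after Tracer.post_process_log() trims the CSV, each json line looks
# roughly like
#   ['FileIo', 'Create', 1234, '0x9c4', '2', '129.0530', <event-specific
#    'User Data' columns...>]
# and is indexed with the constants above.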
2328 class Process(ApiBase.Context.Process):
2329 def __init__(self, *args):
2330 super(LogmanTrace.Context.Process, self).__init__(*args)
2331 # Handle file objects that succeeded.
2332 self.file_objects = {}
2333
2334 def __init__(self, blacklist, tracer_pid):
2335 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))
2336 super(LogmanTrace.Context, self).__init__(blacklist)
2337 self._drive_map = DosDriveMap()
2338 # Threads mapping to the corresponding process id.
2339 self._threads_active = {}
2340 # Process ID of the tracer, e.g. trace_inputs.py
2341 self._tracer_pid = tracer_pid
2342 self._line_number = 0
2343
2344 def on_line(self, line):
2345 """Processes a json Event line."""
2346 self._line_number += 1
2347 try:
2348 # By Opcode
2349 handler = getattr(
2350 self,
2351 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
2352 None)
2353 if not handler:
2354 raise TracingFailure(
2355 'Unexpected event %s_%s' % (
2356 line[self.EVENT_NAME], line[self.TYPE]),
2357 None, None, None)
2358 handler(line)
2359 except TracingFailure, e:
2360 # Hack in the values since the handler could be a static function.
2361 e.pid = line[self.PID]
2362 e.line = line
2363 e.line_number = self._line_number
2364 # Re-raise the modified exception.
2365 raise
2366 except (KeyError, NotImplementedError, ValueError), e:
2367 raise TracingFailure(
2368 'Trace generated a %s exception: %s' % (
2369 e.__class__.__name__, str(e)),
2370 line[self.PID],
2371 self._line_number,
2372 line,
2373 e)
2374
2375 def to_results(self):
2376 if not self.root_process:
2377 raise TracingFailure(
2378 'Failed to detect the initial process',
2379 None, None, None)
2380 process = self.root_process.to_results_process()
2381 return Results(process)
2382
2383 def _thread_to_process(self, tid):
2384 """Finds the process from the thread id."""
2385 tid = int(tid, 16)
2386 pid = self._threads_active.get(tid)
2387 if not pid or not self._process_lookup.get(pid):
2388 return
2389 return self._process_lookup[pid]
2390
2391 @classmethod
2392 def handle_EventTrace_Header(cls, line):
2393 """Verifies no event was dropped, e.g. no buffer overrun occured."""
2394 BUFFER_SIZE = cls.USER_DATA
2395 #VERSION = cls.USER_DATA + 1
2396 #PROVIDER_VERSION = cls.USER_DATA + 2
2397 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
2398 #END_TIME = cls.USER_DATA + 4
2399 #TIMER_RESOLUTION = cls.USER_DATA + 5
2400 #MAX_FILE_SIZE = cls.USER_DATA + 6
2401 #LOG_FILE_MODE = cls.USER_DATA + 7
2402 #BUFFERS_WRITTEN = cls.USER_DATA + 8
2403 #START_BUFFERS = cls.USER_DATA + 9
2404 #POINTER_SIZE = cls.USER_DATA + 10
2405 EVENTS_LOST = cls.USER_DATA + 11
2406 #CPU_SPEED = cls.USER_DATA + 12
2407 #LOGGER_NAME = cls.USER_DATA + 13
2408 #LOG_FILE_NAME = cls.USER_DATA + 14
2409 #BOOT_TIME = cls.USER_DATA + 15
2410 #PERF_FREQ = cls.USER_DATA + 16
2411 #START_TIME = cls.USER_DATA + 17
2412 #RESERVED_FLAGS = cls.USER_DATA + 18
2413 #BUFFERS_LOST = cls.USER_DATA + 19
2414 #SESSION_NAME_STRING = cls.USER_DATA + 20
2415 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
2416 if line[EVENTS_LOST] != '0':
2417 raise TracingFailure(
2418 ( '%s events were lost during trace, please increase the buffer '
2419 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
2420 None, None, None)
2421
2422 def handle_FileIo_Cleanup(self, line):
2423 """General wisdom: if a file is closed, it's because it was opened.
2424
2425 Note that FileIo_Close is not used since if a file was opened properly but
2426 not closed before the process exits, only Cleanup will be logged.
2427 """
2428 #IRP = self.USER_DATA
2429 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2430 FILE_OBJECT = self.USER_DATA + 2
2431 #FILE_KEY = self.USER_DATA + 3
2432 proc = self._thread_to_process(line[TTID])
2433 if not proc:
2434 # Not a process we care about.
2435 return
2436 file_object = line[FILE_OBJECT]
2437 if file_object in proc.file_objects:
2438 proc.add_file(proc.file_objects.pop(file_object), False)
2439
2440 def handle_FileIo_Create(self, line):
2441 """Handles a file open.
2442
2443 All FileIo events are described at
2444 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
2445 for some value of 'description'.
2446
2447 " (..) process and thread id values of the IO events (..) are not valid "
2448 http://msdn.microsoft.com/magazine/ee358703.aspx
2449
2450 The FileIo.Create event doesn't return if the CreateFile() call
2451 succeeded, so keep track of the file_object and check that it is
2452 eventually closed with FileIo_Cleanup.
2453 """
2454 #IRP = self.USER_DATA
2455 TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
2456 FILE_OBJECT = self.USER_DATA + 2
2457 #CREATE_OPTIONS = self.USER_DATA + 3
2458 #FILE_ATTRIBUTES = self.USER_DATA + 4
2459 #SHARE_ACCESS = self.USER_DATA + 5
2460 OPEN_PATH = self.USER_DATA + 6
2461
2462 proc = self._thread_to_process(line[TTID])
2463 if not proc:
2464 # Not a process we care about.
2465 return
2466
2467 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])
2468 raw_path = match.group(1)
2469 # Ignore directories and bare drive right away.
2470 if raw_path.endswith(os.path.sep):
2471 return
2472 filepath = self._drive_map.to_win32(raw_path)
2473 # Ignore bare drive right away. Some may still fall through with format
2474 # like '\\?\X:'
2475 if len(filepath) == 2:
2476 return
2477 file_object = line[FILE_OBJECT]
2478 if os.path.isdir(filepath):
2479 # There is no O_DIRECTORY equivalent on Windows. The closest is
2480 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
2481 # simply discard directories as they are found.
2482 return
2483 # Override any stale file object
2484 proc.file_objects[file_object] = filepath
2485
2486 def handle_FileIo_Rename(self, line):
2487 # TODO(maruel): Handle?
2488 pass
2489
2490 def handle_Process_End(self, line):
2491 pid = line[self.PID]
2492 if self._process_lookup.get(pid):
2493 logging.info('Terminated: %d' % pid)
2494 self._process_lookup[pid] = None
2495 else:
2496 logging.debug('Terminated: %d' % pid)
2497
2498 def handle_Process_Start(self, line):
2499 """Handles a new child process started by PID."""
2500 #UNIQUE_PROCESS_KEY = self.USER_DATA
2501 PROCESS_ID = self.USER_DATA + 1
2502 #PARENT_PID = self.USER_DATA + 2
2503 #SESSION_ID = self.USER_DATA + 3
2504 #EXIT_STATUS = self.USER_DATA + 4
2505 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
2506 #USER_SID = self.USER_DATA + 6
2507 IMAGE_FILE_NAME = self.USER_DATA + 7
2508 COMMAND_LINE = self.USER_DATA + 8
2509
2510 ppid = line[self.PID]
2511 pid = int(line[PROCESS_ID], 16)
2512 logging.debug(
2513 'New process %d->%d (%s) %s' %
2514 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))
2515
2516 if ppid == self._tracer_pid:
2517 # Need to ignore processes we don't know about because the log is
2518 # system-wide. self._tracer_pid shall start only one process.
2519 if self.root_process:
2520 raise TracingFailure(
2521 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '
2522 'already set') % (self._tracer_pid, self.root_process.pid),
2523 None, None, None)
2524 proc = self.Process(self.blacklist, pid, None)
2525 self.root_process = proc
2526 ppid = None
2527 elif self._process_lookup.get(ppid):
2528 proc = self.Process(self.blacklist, pid, None)
2529 self._process_lookup[ppid].children.append(proc)
2530 else:
2531 # Ignore
2532 return
2533 self._process_lookup[pid] = proc
2534
2535 if (not line[IMAGE_FILE_NAME].startswith('"') or
2536 not line[IMAGE_FILE_NAME].endswith('"')):
2537 raise TracingFailure(
2538 'Image file name is not properly quoted: %s' % line[IMAGE_FILE_NAME],
2539 None, None, None)
2540
2541 # TODO(maruel): Process escapes.
2542 if (not line[COMMAND_LINE].startswith('"') or
2543 not line[COMMAND_LINE].endswith('"')):
2544 raise TracingFailure(
2545 'Command line is not properly quoted: %s' % line[COMMAND_LINE],
2546 None, None, None)
2547 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])
2548 proc.executable = line[IMAGE_FILE_NAME][1:-1]
2549 # proc.command[0] may be the absolute path of 'executable' but it may be
2550 # anything else too. If it happens that command[0] ends with executable,
2551 # use it, otherwise default to the base name.
2552 cmd0 = proc.command[0].lower()
2553 if not cmd0.endswith('.exe'):
2554 # TODO(maruel): That's not strictly true either.
2555 cmd0 += '.exe'
2556 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
2557 # Fix the path.
2558 cmd0 = cmd0.replace('/', os.path.sep)
2559 cmd0 = os.path.normpath(cmd0)
2560 proc.executable = get_native_path_case(cmd0)
2561 logging.info(
2562 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
2563
2564 def handle_Thread_End(self, line):
2565 """Has the same parameters as Thread_Start."""
2566 tid = int(line[self.TID], 16)
2567 self._threads_active.pop(tid, None)
2568
2569 def handle_Thread_Start(self, line):
2570 """Handles a new thread created.
2571
2572 Do not use self.PID here since a process' initial thread is created by
2573 the parent process.
2574 """
2575 PROCESS_ID = self.USER_DATA
2576 TTHREAD_ID = self.USER_DATA + 1
2577 #STACK_BASE = self.USER_DATA + 2
2578 #STACK_LIMIT = self.USER_DATA + 3
2579 #USER_STACK_BASE = self.USER_DATA + 4
2580 #USER_STACK_LIMIT = self.USER_DATA + 5
2581 #AFFINITY = self.USER_DATA + 6
2582 #WIN32_START_ADDR = self.USER_DATA + 7
2583 #TEB_BASE = self.USER_DATA + 8
2584 #SUB_PROCESS_TAG = self.USER_DATA + 9
2585 #BASE_PRIORITY = self.USER_DATA + 10
2586 #PAGE_PRIORITY = self.USER_DATA + 11
2587 #IO_PRIORITY = self.USER_DATA + 12
2588 #THREAD_FLAGS = self.USER_DATA + 13
2589 # Do not use self.PID here since a process' initial thread is created by
2590 # the parent process.
2591 pid = int(line[PROCESS_ID], 16)
2592 tid = int(line[TTHREAD_ID], 16)
2593 logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
2594 self._threads_active[tid] = pid
2595
2596 @classmethod
2597 def supported_events(cls):
2598 """Returns all the procesed events."""
2599 out = []
2600 for member in dir(cls):
2601 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
2602 if match:
2603 out.append(match.groups())
2604 return out
2605
2606 class Tracer(ApiBase.Tracer):
2607 # The basic headers.
2608 EXPECTED_HEADER = [
2609 u'Event Name',
2610 u'Type',
2611 u'Event ID',
2612 u'Version',
2613 u'Channel',
2614 u'Level', # 5
2615 u'Opcode',
2616 u'Task',
2617 u'Keyword',
2618 u'PID',
2619 u'TID', # 10
2620 u'Processor Number',
2621 u'Instance ID',
2622 u'Parent Instance ID',
2623 u'Activity ID',
2624 u'Related Activity ID', # 15
2625 u'Clock-Time',
2626 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
2627 u'User(ms)', # pretty much useless.
2628 u'User Data', # Extra arguments that are event-specific.
2629 ]
2630 # Only the useful headers common to all entries are listed here. Any column
2631 # at 19 or higher is dependent on the specific event.
2632 EVENT_NAME = 0
2633 TYPE = 1
2634 PID = 9
2635 TID = 10
2636 PROCESSOR_ID = 11
2637 TIMESTAMP = 16
2638 NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
2639 USER_DATA = 19
2640
2641 def __init__(self, logname):
2642 """Starts the log collection.
2643
2644 Requires administrative access. logman.exe is synchronous so no need for a
2645 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
2646 instead. The GUID constant name is SystemTraceControlGuid. Lovely.
2647
2648 One can get the list of potentially interesting providers with:
2649 "logman query providers | findstr /i file"
2650 """
2651 super(LogmanTrace.Tracer, self).__init__(logname)
2652 self._script = create_thunk()
2653 cmd_start = [
2654 'logman.exe',
2655 'start',
2656 'NT Kernel Logger',
2657 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
2658 # splitio,fileiocompletion,syscall,file,cswitch,img
2659 '(process,fileio,thread)',
2660 '-o', self._logname + '.etl',
2661 '-ets', # Send directly to kernel
2662 # Values extracted out of thin air.
2663 # Event Trace Session buffer size in kb.
2664 '-bs', '10240',
2665 # Number of Event Trace Session buffers.
2666 '-nb', '16', '256',
2667 ]
2668 logging.debug('Running: %s' % cmd_start)
2669 try:
2670 subprocess.check_call(
2671 cmd_start,
2672 stdin=subprocess.PIPE,
2673 stdout=subprocess.PIPE,
2674 stderr=subprocess.STDOUT)
2675 except subprocess.CalledProcessError, e:
2676 if e.returncode == -2147024891:
2677 print >> sys.stderr, 'Please restart with an elevated admin prompt'
2678 elif e.returncode == -2144337737:
2679 print >> sys.stderr, (
2680 'A kernel trace was already running, stop it and try again')
2681 raise
2682
2683 def trace(self, cmd, cwd, tracename, output):
2684 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
2685 assert os.path.isabs(cmd[0]), cmd[0]
2686 assert os.path.isabs(cwd), cwd
2687 assert os.path.normpath(cwd) == cwd, cwd
2688 with self._lock:
2689 if not self._initialized:
2690 raise TracingFailure(
2691 'Called Tracer.trace() on an uninitialized object',
2692 None, None, None, tracename)
2693 assert tracename not in (i['trace'] for i in self._traces)
2694
2695 # Use "logman -?" for help.
2696
2697 stdout = stderr = None
2698 if output:
2699 stdout = subprocess.PIPE
2700 stderr = subprocess.STDOUT
2701
2702 # Run the child process.
2703 logging.debug('Running: %s' % cmd)
2704 # Use the temporary script generated with create_thunk() so we have a
2705 # clear pid owner. Since trace_inputs.py can be used as a library and
2706 # could trace multiple processes simultaneously, it would be more complex
2707 # if the executable to be traced were executed directly here. It also solves
2708 # issues related to logman.exe, which needs to be executed to control the
2709 # kernel trace.
2710 child_cmd = [
2711 sys.executable,
2712 self._script,
2713 tracename,
2714 ]
2715 child = subprocess.Popen(
2716 child_cmd + fix_python_path(cmd),
2717 cwd=cwd,
2718 stdin=subprocess.PIPE,
2719 stdout=stdout,
2720 stderr=stderr)
2721 logging.debug('Started child pid: %d' % child.pid)
2722 out = child.communicate()[0]
2723 # This doesn't mean all the grand-children are done. Sadly, we don't have
2724 # a good way to determine that.
2725
2726 with self._lock:
2727 assert tracename not in (i['trace'] for i in self._traces)
2728 self._traces.append({
2729 'command': cmd,
2730 'cwd': cwd,
2731 'pid': child.pid,
2732 'trace': tracename,
2733 'output': out,
2734 })
2735
2736 return child.returncode, out
2737
2738 def close(self, _timeout=None):
2739 """Stops the kernel log collection and converts the traces to text
2740 representation.
2741 """
2742 with self._lock:
2743 if not self._initialized:
2744 raise TracingFailure(
2745 'Called Tracer.close() on an uninitialized object',
2746 None, None, None)
2747 os.remove(self._script)
2748 # Save metadata, add 'format' key.
2749 data = {
2750 'format': 'csv',
2751 'traces': self._traces,
2752 }
2753 write_json(self._logname, data, False)
2754
2755 cmd_stop = [
2756 'logman.exe',
2757 'stop',
2758 'NT Kernel Logger',
2759 '-ets', # Sends the command directly to the kernel.
2760 ]
2761 logging.debug('Running: %s' % cmd_stop)
2762 subprocess.check_call(
2763 cmd_stop,
2764 stdin=subprocess.PIPE,
2765 stdout=subprocess.PIPE,
2766 stderr=subprocess.STDOUT)
2767 self._initialized = False
2768
2769 def post_process_log(self):
2770 """Converts the .etl file into .csv then into .json."""
2771 super(LogmanTrace.Tracer, self).post_process_log()
2772 logformat = 'csv'
2773 self._convert_log(logformat)
2774
2775 if logformat == 'csv_utf16':
2776 def load_file():
2777 def utf_8_encoder(unicode_csv_data):
2778 """Encodes the unicode object as utf-8 encoded str instance"""
2779 for line in unicode_csv_data:
2780 yield line.encode('utf-8')
2781
2782 def unicode_csv_reader(unicode_csv_data, **kwargs):
2783 """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
2784 """
2785 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
2786 for row in csv_reader:
2787 # Decode str utf-8 instances back to unicode instances, cell by
2788 # cell:
2789 yield [cell.decode('utf-8') for cell in row]
2790
2791 # The CSV file is UTF-16 so use codecs.open() to load the file into
2792 # the python internal unicode format (utf-8). Then explicitly
2793 # re-encode as utf8 as str instances so csv can parse it fine. Then
2794 # decode the utf-8 str back into python unicode instances. This
2795 # sounds about right.
2796 for line in unicode_csv_reader(
2797 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):
2798 # line is a list of unicode objects
2799 # So much white space!
2800 yield [i.strip() for i in line]
2801
2802 elif logformat == 'csv':
2803 def load_file():
2804 def ansi_csv_reader(ansi_csv_data, **kwargs):
2805 """Loads an 'ANSI' code page and returns unicode() objects."""
2806 assert sys.getfilesystemencoding() == 'mbcs'
2807 encoding = get_current_encoding()
2808 for row in csv.reader(ansi_csv_data, **kwargs):
2809 # Decode str 'ansi' instances to unicode instances, cell by cell:
2810 yield [cell.decode(encoding) for cell in row]
2811
2812 # The fastest and smallest format but only supports 'ANSI' file paths.
2813 # E.g. the filenames are encoded in the 'current' encoding.
2814 for line in ansi_csv_reader(open(self._logname + '.' + logformat)):
2815 # line is a list of unicode objects.
2816 yield [i.strip() for i in line]
2817
2818 supported_events = LogmanTrace.Context.supported_events()
2819
2820 def trim(generator):
2821 for index, line in enumerate(generator):
2822 if not index:
2823 if line != self.EXPECTED_HEADER:
2824 raise TracingFailure(
2825 'Found malformed header: %s' % ' '.join(line),
2826 None, None, None)
2827 continue
2828 # As you can see, the CSV is full of useful non-redundant information:
2829 if (line[2] != '0' or # Event ID
2830 line[3] not in ('2', '3') or # Version
2831 line[4] != '0' or # Channel
2832 line[5] != '0' or # Level
2833 line[7] != '0' or # Task
2834 line[8] != '0x0000000000000000' or # Keyword
2835 line[12] != '' or # Instance ID
2836 line[13] != '' or # Parent Instance ID
2837 line[14] != self.NULL_GUID or # Activity ID
2838 line[15] != ''): # Related Activity ID
2839 raise TracingFailure(
2840 'Found unexpected values in line: %s' % ' '.join(line),
2841 None, None, None)
2842
2843 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
2844 continue
2845
2846 # Convert the PID in-place from hex.
2847 line[self.PID] = int(line[self.PID], 16)
2848
2849 yield [
2850 line[self.EVENT_NAME],
2851 line[self.TYPE],
2852 line[self.PID],
2853 line[self.TID],
2854 line[self.PROCESSOR_ID],
2855 line[self.TIMESTAMP],
2856 ] + line[self.USER_DATA:]
2857
2858 write_json('%s.json' % self._logname, list(trim(load_file())), True)
2859
2860 def _convert_log(self, logformat):
2861 """Converts the ETL trace to text representation.
2862
2863 Normally, 'csv' is sufficient. If complex scripts are used (like eastern
2864 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
2865
2866 Arguments:
2867 - logformat: Text format to be generated, csv, csv_utf16 or xml.
2868
2869 Use "tracerpt -?" for help.
2870 """
2871 LOCALE_INVARIANT = 0x7F
2872 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
2873 cmd_convert = [
2874 'tracerpt.exe',
2875 '-l', self._logname + '.etl',
2876 '-o', self._logname + '.' + logformat,
2877 '-gmt', # Use UTC
2878 '-y', # No prompt
2879 # Use -of XML to get the header of each items after column 19, e.g. all
2880 # the actual headers of 'User Data'.
2881 ]
2882
2883 if logformat == 'csv':
2884 # tracerpt localizes the 'Type' column, for major brainfuck
2885 # entertainment. I can't imagine any sane reason to do that.
2886 cmd_convert.extend(['-of', 'CSV'])
2887 elif logformat == 'csv_utf16':
2888 # This causes it to use UTF-16, which doubles the log size but ensures
2889 # the log is readable for non-ASCII characters.
2890 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
2891 elif logformat == 'xml':
2892 cmd_convert.extend(['-of', 'XML'])
2893 else:
2894 raise ValueError('Unexpected log format \'%s\'' % logformat)
2895 logging.debug('Running: %s' % cmd_convert)
2896 # This can take tens of minutes for large logs.
2897 # Redirects all output to stderr.
2898 subprocess.check_call(
2899 cmd_convert,
2900 stdin=subprocess.PIPE,
2901 stdout=sys.stderr,
2902 stderr=sys.stderr)
2903
2904 @staticmethod
2905 def clean_trace(logname):
2906 for ext in ('', '.csv', '.etl', '.json', '.xml'):
2907 if os.path.isfile(logname + ext):
2908 os.remove(logname + ext)
2909
2910 @classmethod
2911 def parse_log(cls, logname, blacklist):
2912 logging.info('parse_log(%s, %s)' % (logname, blacklist))
2913
2914 def blacklist_more(filepath):
2915 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
2916 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
2917
2918 data = read_json(logname)
2919 lines = read_json(logname + '.json')
2920 out = []
2921 for item in data['traces']:
2922 context = cls.Context(blacklist_more, item['pid'])
2923 for line in lines:
2924 context.on_line(line)
2925 out.append(
2926 {
2927 'results': context.to_results(),
2928 'trace': item['trace'],
2929 'output': item['output'],
2930 })
2931 return out
2932
2933
2934 def get_api():
2935 """Returns the correct implementation for the current OS."""
2936 if sys.platform == 'cygwin':
2937 raise NotImplementedError(
2938 'Not implemented for cygwin, start the script from Win32 python')
2939 flavors = {
2940 'win32': LogmanTrace,
2941 'darwin': Dtrace,
2942 'sunos5': Dtrace,
2943 'freebsd7': Dtrace,
2944 'freebsd8': Dtrace,
2945 }
2946 # Defaults to strace.
2947 return flavors.get(sys.platform, Strace)()
2948
2949
2950 def extract_directories(root_dir, files, blacklist):
2951 """Detects if all the files in a directory are in |files| and if so, replace
2952 the individual files by a Results.Directory instance.
2953
2954 Takes a list of Results.File instances and returns a shorter list of
2955 Results.File and Results.Directory instances.
2956
2957 Arguments:
2958 - root_dir: Optional base directory that shouldn't be searched further.
2959 - files: list of Results.File instances.
2960 - blacklist: regexp of files to ignore, for example r'.+\.pyc'.
2961 """
2962 logging.info(
2963 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
2964 assert not (root_dir or '').endswith(os.path.sep), root_dir
2965 assert not root_dir or (get_native_path_case(root_dir) == root_dir)
2966 assert not any(isinstance(f, Results.Directory) for f in files)
2967 # Remove non existent files.
2968 files = [f for f in files if f.existent]
2969 if not files:
2970 return files
2971 # All files must share the same root, which can be None.
2972 assert len(set(f.root for f in files)) == 1, set(f.root for f in files)
2973
2974 # Creates a {directory: {filename: File}} mapping, up to root.
2975 buckets = {}
2976 if root_dir:
2977 buckets[root_dir] = {}
2978 for fileobj in files:
2979 path = fileobj.full_path
2980 directory = os.path.dirname(path)
2981 assert directory
2982 # Do not use os.path.basename() so trailing os.path.sep is kept.
2983 basename = path[len(directory)+1:]
2984 files_in_directory = buckets.setdefault(directory, {})
2985 files_in_directory[basename] = fileobj
2986 # Add all the directories recursively up to root.
2987 while True:
2988 old_d = directory
2989 directory = os.path.dirname(directory)
2990 if directory + os.path.sep == root_dir or directory == old_d:
2991 break
2992 buckets.setdefault(directory, {})
2993
2994 root_prefix = len(root_dir) + 1 if root_dir else 0
2995 for directory in sorted(buckets, reverse=True):
2996 actual = set(f for f in os.listdir(directory) if not blacklist(f))
2997 expected = set(buckets[directory])
2998 if not (actual - expected):
2999 parent = os.path.dirname(directory)
3000 buckets[parent][os.path.basename(directory)] = Results.Directory(
3001 root_dir,
3002 directory[root_prefix:],
3003 False,
3004 sum(f.size for f in buckets[directory].itervalues()),
3005 sum(f.nb_files for f in buckets[directory].itervalues()))
3006 # Remove the whole bucket.
3007 del buckets[directory]
3008
3009 # Reverse the mapping with what remains. The original instances are returned,
3010 # so the cached metadata is kept.
3011 files = sum((x.values() for x in buckets.itervalues()), [])
3012 return sorted(files, key=lambda x: x.path)
3013
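# Editor's illustrative sketch (not part of the original file): if every file
# actually present under <root>/data/ (minus blacklisted ones) shows up in
# |files|, the individual Results.File entries for data/ collapse into a
# single Results.Directory entry, e.g.:
#   extract_directories('/src/root', files, lambda f: f.endswith('.pyc'))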
3014
3015 def trace(logfile, cmd, cwd, api, output):
3016 """Traces an executable. Returns (returncode, output) from api.
3017
3018 Arguments:
3019 - logfile: file to write to.
3020 - cmd: command to run.
3021 - cwd: current directory to start the process in.
3022 - api: a tracing api instance.
3023 - output: if True, returns the output, otherwise prints it to the console.
3024 """
3025 cmd = fix_python_path(cmd)
3026 api.clean_trace(logfile)
3027 with api.get_tracer(logfile) as tracer:
3028 return tracer.trace(cmd, cwd, 'default', output)
3029
3030
3031 def load_trace(logfile, root_dir, api, blacklist):
3032 """Loads a trace file and returns the Results instance.
3033
3034 Arguments:
3035 - logfile: File to load.
3036 - root_dir: Root directory to use to determine if a file is relevant to the
3037 trace or not.
3038 - api: A tracing api instance.
3039 - blacklist: Optional blacklist function to filter out unimportant files.
3040 """
3041 data = api.parse_log(logfile, (blacklist or (lambda _: False)))
3042 assert len(data) == 1, 'More than one trace was detected!'
3043 if 'exception' in data[0]:
3044 # It got an exception, raise it.
3045 raise data[0]['exception']
3046 results = data[0]['results']
3047 if root_dir:
3048 results = results.strip_root(root_dir)
3049 return results
3050
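# Editor's minimal usage sketch (assumed, not part of the original file):
#   api = get_api()
#   returncode, _ = trace(
#       'trace.log', [sys.executable, 'foo.py'], os.getcwd(), api, False)
#   results = load_trace('trace.log', None, api, None)
#   print('\n'.join(f.path for f in results.existent))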
3051
3052 def CMDclean(args):
3053 """Cleans up traces."""
3054 parser = OptionParserTraceInputs(command='clean')
3055 options, args = parser.parse_args(args)
3056 api = get_api()
3057 api.clean_trace(options.log)
3058 return 0
3059
3060
3061 def CMDtrace(args):
3062 """Traces an executable."""
3063 parser = OptionParserTraceInputs(command='trace')
3064 parser.allow_interspersed_args = False
3065 parser.add_option(
3066 '-q', '--quiet', action='store_true',
3067 help='Redirects traced executable output to /dev/null')
3068 options, args = parser.parse_args(args)
3069
3070 if not args:
3071 parser.error('Please provide a command to run')
3072
3073 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
3074 args[0] = os.path.abspath(args[0])
3075
3076 api = get_api()
3077 return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
3078
3079
3080 def CMDread(args):
3081 """Reads the logs and prints the result."""
3082 parser = OptionParserTraceInputs(command='read')
3083 parser.add_option(
3084 '-V', '--variable',
3085 nargs=2,
3086 action='append',
3087 dest='variables',
3088 metavar='VAR_NAME directory',
3089 default=[],
3090 help=('Variables to replace relative directories against. Example: '
3091 '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of your '
3092 'home dir with $HOME') % getpass.getuser())
3093 parser.add_option(
3094 '--root-dir',
3095 help='Root directory to base everything off of. Anything outside of '
3096 'this directory will not be reported')
3097 parser.add_option(
3098 '-j', '--json', action='store_true',
3099 help='Outputs raw result data as json')
3100 parser.add_option(
3101 '-b', '--blacklist', action='append', default=[],
3102 help='List of regexp to use as blacklist filter')
3103 options, args = parser.parse_args(args)
3104
3105 if options.root_dir:
3106 options.root_dir = os.path.abspath(options.root_dir)
3107
3108 variables = dict(options.variables)
3109 api = get_api()
3110 def blacklist(f):
3111 return any(re.match(b, f) for b in options.blacklist)
3112 data = api.parse_log(options.log, blacklist)
3113 # Process each trace.
3114 output_as_json = []
3115 for item in data:
3116 if 'exception' in item:
3117 print >> sys.stderr, (
3118 'Trace %s: Got an exception: %s' % (item['trace'], item['exception']))
3119 continue
3120 results = item['results']
3121 if options.root_dir:
3122 results = results.strip_root(options.root_dir)
3123
3124 if options.json:
3125 output_as_json.append(results.flatten())
3126 else:
3127 simplified = extract_directories(
3128 options.root_dir, results.files, blacklist)
3129 simplified = [f.replace_variables(variables) for f in simplified]
3130 if len(data) > 1:
3131 print('Trace: %s' % item['trace'])
3132 print('Total: %d' % len(results.files))
3133 print('Non existent: %d' % len(results.non_existent))
3134 for f in results.non_existent:
3135 print(' %s' % f.path)
3136 print(
3137 'Interesting: %d reduced to %d' % (
3138 len(results.existent), len(simplified)))
3139 for f in simplified:
3140 print(' %s' % f.path)
3141
3142 if options.json:
3143 write_json(sys.stdout, output_as_json, False)
3144 return 0
3145
3146
3147 class OptionParserWithLogging(optparse.OptionParser):
3148 """Adds --verbose option."""
3149 def __init__(self, verbose=0, **kwargs):
3150 optparse.OptionParser.__init__(self, **kwargs)
3151 self.add_option(
3152 '-v', '--verbose',
3153 action='count',
3154 default=verbose,
3155 help='Use multiple times to increase verbosity')
3156
3157 def parse_args(self, *args, **kwargs):
3158 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
3159 levels = [logging.ERROR, logging.INFO, logging.DEBUG]
3160 logging.basicConfig(
3161 level=levels[min(len(levels)-1, options.verbose)],
3162 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
3163 return options, args
3164
3165
3166 class OptionParserWithNiceDescription(OptionParserWithLogging):
3167 """Generates the description with the command's docstring."""
3168 def __init__(self, **kwargs):
3169 """Sets 'description' and 'usage' if not already specified."""
3170 command = kwargs.pop('command', 'help')
3171 kwargs.setdefault(
3172 'description',
3173 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
3174 kwargs.setdefault('usage', '%%prog %s [options]' % command)
3175 OptionParserWithLogging.__init__(self, **kwargs)
3176
3177
3178 class OptionParserTraceInputs(OptionParserWithNiceDescription):
3179 """Adds automatic --log handling."""
3180 def __init__(self, **kwargs):
3181 OptionParserWithNiceDescription.__init__(self, **kwargs)
3182 self.add_option(
3183 '-l', '--log', help='Log file to generate or read, required')
3184
3185 def parse_args(self, *args, **kwargs):
3186 """Makes sure the paths make sense.
3187
3188 On Windows, / and \ are often mixed together in a path.
3189 """
3190 options, args = OptionParserWithNiceDescription.parse_args(
3191 self, *args, **kwargs)
3192 if not options.log:
3193 self.error('Must supply a log file with -l')
3194 options.log = os.path.abspath(options.log)
3195 return options, args
3196
3197
3198 def extract_documentation():
3199 """Returns a dict {command: description} for each of documented command."""
3200 commands = (
3201 fn[3:]
3202 for fn in dir(sys.modules['__main__'])
3203 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
3204 return dict((fn, get_command_handler(fn).__doc__) for fn in commands)
3205
3206
3207 def CMDhelp(args):
3208 """Prints list of commands or help for a specific command."""
3209 doc = extract_documentation()
3210 # Calculates the optimal offset.
3211 offset = max(len(cmd) for cmd in doc)
3212 format_str = ' %-' + str(offset + 2) + 's %s'
3213 # Generate one-line documentation for each command.
3214 commands_description = '\n'.join(
3215 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))
3216
3217 parser = OptionParserWithNiceDescription(
3218 usage='%prog <command> [options]',
3219 description='Commands are:\n%s\n' % commands_description)
3220 parser.format_description = lambda _: parser.description
3221
3222 # Strip out any -h or --help argument.
3223 _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])
3224 if len(args) == 1:
3225 if not get_command_handler(args[0]):
3226 parser.error('Unknown command %s' % args[0])
3227 # The command was "%prog help command"; replace ourselves with
3228 # "%prog command --help" so help is correctly printed out.
3229 return main(args + ['--help'])
3230 elif args:
3231 parser.error('Unknown argument "%s"' % ' '.join(args))
3232 parser.print_help()
3233 return 0
3234
3235
3236 def get_command_handler(name):
3237 """Returns the command handler or CMDhelp if it doesn't exist."""
3238 return getattr(sys.modules['__main__'], 'CMD%s' % name, None)
3239
3240
3241 def main_impl(argv):
3242 command = get_command_handler(argv[0] if argv else 'help')
3243 if not command:
3244 return CMDhelp(argv)
3245 return command(argv[1:])
3246
3247 def main(argv):
3248 try:
3249 return main_impl(argv)
3250 except TracingFailure, e:
3251 sys.stderr.write('\nError: ')
3252 sys.stderr.write(str(e))
3253 sys.stderr.write('\n')
3254 return 1
3255
3256
3257 if __name__ == '__main__':
3258 sys.exit(main(sys.argv[1:]))