trace_inputs.py - Issue 11048019: Add everything from src/tools/isolate r159537.

Unified Diff: trace_inputs.py

Issue 11048019: Add everything from src/tools/isolate r159537. (Closed) Base URL: https://git.chromium.org/chromium/tools/swarm_client.git@master

Patch Set: Ensure --similarity is sticky Created 8 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: trace_inputs.py

diff --git a/trace_inputs.py b/trace_inputs.py

new file mode 100755

index 0000000000000000000000000000000000000000..4e8df22757b15ef1672c50b6711df71ff068531c

--- /dev/null

+++ b/trace_inputs.py

@@ -0,0 +1,3258 @@

+#!/usr/bin/env python

+# coding=utf-8

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

+"""Traces an executable and its child processes and extract the files accessed

+by them.

+The implementation uses OS-specific API. The native Kernel logger and the ETL

+interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.

+The OS-specific implementation is hidden in an 'API' interface.

+The results are embedded in a Results instance. The tracing is done in two

+phases, the first is to do the actual trace and generate an

+implementation-specific log file. Then the log file is parsed to extract the

+information, including the individual child processes and the files accessed

+from the log.

+"""

+import codecs

+import csv

+import getpass

+import glob

+import json

+import logging

+import optparse

+import os

+import re

+import subprocess

+import sys

+import tempfile

+import threading

+import time

+import weakref

+## OS-specific imports

+if sys.platform == 'win32':

+ from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p

+ from ctypes.wintypes import windll, FormatError # pylint: disable=E0611

+ from ctypes.wintypes import GetLastError # pylint: disable=E0611

+elif sys.platform == 'darwin':

+ import Carbon.File # pylint: disable=F0401

+ import MacOS # pylint: disable=F0401

+BASE_DIR = os.path.dirname(os.path.abspath(__file__))

+ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))

+class TracingFailure(Exception):

+ """An exception occured during tracing."""

+ def __init__(self, description, pid, line_number, line, *args):

+ super(TracingFailure, self).__init__(

+ description, pid, line_number, line, *args)

+ self.description = description

+ self.pid = pid

+ self.line_number = line_number

+ self.line = line

+ self.extra = args

+ def __str__(self):

+ out = self.description

+ if self.pid:

+ out += '\npid: %d' % self.pid

+ if self.line_number:

+ out += '\nline: %d' % self.line_number

+ if self.line:

+ out += '\n%s' % self.line

+ if self.extra:

+ out += '\n' + ', '.join(map(str, filter(None, self.extra)))

+ return out

+## OS-specific functions

+if sys.platform == 'win32':

+ def QueryDosDevice(drive_letter):

+ """Returns the Windows 'native' path for a DOS drive letter."""

+ assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter

+ # Guesswork. QueryDosDeviceW never returns the required number of bytes.

+ chars = 1024

+ drive_letter = unicode(drive_letter)

+ p = create_unicode_buffer(chars)

+ if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):

+ err = GetLastError()

+ if err:

+ # pylint: disable=E0602

+ raise WindowsError(

+ err,

+ 'QueryDosDevice(%s): %s (%d)' % (

+ str(drive_letter), FormatError(err), err))

+ return p.value

+ def GetShortPathName(long_path):

+ """Returns the Windows short path equivalent for a 'long' path."""

+ long_path = unicode(long_path)

+ # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is

+ # not enforced.

+ if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):

+ long_path = '\\\\?\\' + long_path

+ chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)

+ if chars:

+ p = create_unicode_buffer(chars)

+ if windll.kernel32.GetShortPathNameW(long_path, p, chars):

+ return p.value

+ err = GetLastError()

+ if err:

+ # pylint: disable=E0602

+ raise WindowsError(

+ err,

+ 'GetShortPathName(%s): %s (%d)' % (

+ str(long_path), FormatError(err), err))

+ def GetLongPathName(short_path):

+ """Returns the Windows long path equivalent for a 'short' path."""

+ short_path = unicode(short_path)

+ # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is

+ # not enforced.

+ if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):

+ short_path = '\\\\?\\' + short_path

+ chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)

+ if chars:

+ p = create_unicode_buffer(chars)

+ if windll.kernel32.GetLongPathNameW(short_path, p, chars):

+ return p.value

+ err = GetLastError()

+ if err:

+ # pylint: disable=E0602

+ raise WindowsError(

+ err,

+ 'GetLongPathName(%s): %s (%d)' % (

+ str(short_path), FormatError(err), err))

+ def get_current_encoding():

+ """Returns the 'ANSI' code page associated to the process."""

+ return 'cp%d' % int(windll.kernel32.GetACP())

+ class DosDriveMap(object):

+ """Maps \Device\HarddiskVolumeN to N: on Windows."""

+ # Keep one global cache.

+ _MAPPING = {}

+ def __init__(self):

+ """Lazy loads the cache."""

+ if not self._MAPPING:

+ # This is related to UNC resolver on windows. Ignore that.

+ self._MAPPING['\\Device\\Mup'] = None

+ self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot']

+ for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):

+ try:

+ letter = '%s:' % letter

+ mapped = QueryDosDevice(letter)

+ if mapped in self._MAPPING:

+ logging.warn(

+ ('Two drives: \'%s\' and \'%s\', are mapped to the same disk'

+ '. Drive letters are a user-mode concept and the kernel '

+ 'traces only have NT path, so all accesses will be '

+ 'associated with the first drive letter, independent of the '

+ 'actual letter used by the code') % (

+ self._MAPPING[mapped], letter))

+ else:

+ self._MAPPING[mapped] = letter

+ except WindowsError: # pylint: disable=E0602

+ pass

+ def to_win32(self, path):

+ """Converts a native NT path to Win32/DOS compatible path."""

+ match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)

+ if not match:

+ raise ValueError(

+ 'Can\'t convert %s into a Win32 compatible path' % path,

+ path)

+ if not match.group(1) in self._MAPPING:

+ # Unmapped partitions may be accessed by windows for the

+ # fun of it while the test is running. Discard these.

+ return None

+ drive = self._MAPPING[match.group(1)]

+ if not drive or not match.group(2):

+ return drive

+ return drive + match.group(2)

+ def isabs(path):

+ """Accepts X: as an absolute path, unlike python's os.path.isabs()."""

+ return os.path.isabs(path) or len(path) == 2 and path[1] == ':'

+ def get_native_path_case(p):

+ """Returns the native path case for an existing file.

+ On Windows, removes any leading '\\?\'.

+ """

+ if not isabs(p):

+ raise ValueError(

+ 'Can\'t get native path case for a non-absolute path: %s' % p,

+ p)

+ # Windows used to have an option to turn on case sensitivity on non Win32

+ # subsystem but that's out of scope here and isn't supported anymore.

+ # Go figure why GetShortPathName() is needed.

+ try:

+ out = GetLongPathName(GetShortPathName(p))

+ except OSError, e:

+ if e.args[0] in (2, 3, 5):

+ # The path does not exist. Try to recurse and reconstruct the path.

+ base = os.path.dirname(p)

+ rest = os.path.basename(p)

+ return os.path.join(get_native_path_case(base), rest)

+ raise

+ if out.startswith('\\\\?\\'):

+ out = out[4:]

+ # Always upper case the first letter since GetLongPathName() will return the

+ # drive letter in the case it was given.

+ return out[0].upper() + out[1:]

+ def CommandLineToArgvW(command_line):

+ """Splits a commandline into argv using CommandLineToArgvW()."""

+ # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx

+ size = c_int()

+ ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size))

+ try:

+ return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]

+ finally:

+ windll.kernel32.LocalFree(ptr)

+elif sys.platform == 'darwin':

+ # On non-windows, keep the stdlib behavior.

+ isabs = os.path.isabs

+ def _find_item_native_case(root_path, item):

+ """Gets the native path case of a single item based at root_path.

+ There is no API to get the native path case of symlinks on OSX. So it

+ needs to be done the slow way.

+ """

+ item = item.lower()

+ for element in os.listdir(root_path):

+ if element.lower() == item:

+ return element

+ def _native_case(p):

+ """Gets the native path case. Warning: this function resolves symlinks."""

+ logging.debug('native_case(%s)' % p)

+ try:

+ rel_ref, _ = Carbon.File.FSPathMakeRef(p)

+ out = rel_ref.FSRefMakePath()

+ if p.endswith(os.path.sep) and not out.endswith(os.path.sep):

+ return out + os.path.sep

+ return out

+ except MacOS.Error, e:

+ if e.args[0] in (-43, -120):

+ # The path does not exist. Try to recurse and reconstruct the path.

+ # -43 means file not found.

+ # -120 means directory not found.

+ base = os.path.dirname(p)

+ rest = os.path.basename(p)

+ return os.path.join(_native_case(base), rest)

+ raise OSError(

+ e.args[0], 'Failed to get native path for %s' % p, p, e.args[1])

+ def _split_at_symlink_native(base_path, rest):

+ """Returns the native path for a symlink."""

+ base, symlink, rest = split_at_symlink(base_path, rest)

+ if symlink:

+ if not base_path:

+ base_path = base

+ else:

+ base_path = safe_join(base_path, base)

+ symlink = _find_item_native_case(base_path, symlink)

+ return base, symlink, rest

+ def get_native_path_case(path):

+ """Returns the native path case for an existing file.

+ Technically, it's only HFS+ on OSX that is case preserving and

+ insensitive. It's the default setting on HFS+ but can be changed.

+ """

+ if not isabs(path):

+ raise ValueError(

+ 'Can\'t get native path case for a non-absolute path: %s' % path,

+ path)

+ if path.startswith('/dev'):

+ # /dev is not visible from Carbon, causing an exception.

+ return path

+ # Starts assuming there is no symlink along the path.

+ resolved = _native_case(path)

+ if resolved.lower() == path.lower():

+ # This code path is incredibly faster.

+ return resolved

+ # There was a symlink, process it.

+ base, symlink, rest = _split_at_symlink_native(None, path)

+ assert symlink, (path, base, symlink, rest, resolved)

+ prev = base

+ base = safe_join(_native_case(base), symlink)

+ assert len(base) > len(prev)

+ while rest:

+ prev = base

+ relbase, symlink, rest = _split_at_symlink_native(base, rest)

+ base = safe_join(base, relbase)

+ assert len(base) > len(prev), (prev, base, symlink)

+ if symlink:

+ base = safe_join(base, symlink)

+ assert len(base) > len(prev), (prev, base, symlink)

+ # Make sure no symlink was resolved.

+ assert base.lower() == path.lower(), (base, path)

+ return base

+else: # OSes other than Windows and OSX.

+ # On non-windows, keep the stdlib behavior.

+ isabs = os.path.isabs

+ def get_native_path_case(path):

+ """Returns the native path case for an existing file.

+ On OSes other than OSX and Windows, assume the file system is

+ case-sensitive.

+ TODO(maruel): This is not strictly true. Implement if necessary.

+ """

+ if not isabs(path):

+ raise ValueError(

+ 'Can\'t get native path case for a non-absolute path: %s' % path,

+ path)

+ # Give up on cygwin, as GetLongPathName() can't be called.

+ # Linux traces tends to not be normalized so use this occasion to normalize

+ # it. This function implementation already normalizes the path on the other

+ # OS so this needs to be done here to be coherent between OSes.

+ out = os.path.normpath(path)

+ if path.endswith(os.path.sep) and not out.endswith(os.path.sep):

+ return out + os.path.sep

+ return out

+if sys.platform != 'win32': # All non-Windows OSes.

+ def safe_join(*args):

+ """Joins path elements like os.path.join() but doesn't abort on absolute

+ path.

+ os.path.join('foo', '/bar') == '/bar'

+ but safe_join('foo', '/bar') == 'foo/bar'.

+ """

+ out = ''

+ for element in args:

+ if element.startswith(os.path.sep):

+ if out.endswith(os.path.sep):

+ out += element[1:]

+ else:

+ out += element

+ else:

+ if out.endswith(os.path.sep):

+ out += element

+ else:

+ out += os.path.sep + element

+ return out

+ def split_at_symlink(base_dir, relfile):

+ """Scans each component of relfile and cut the string at the symlink if

+ there is any.

+ Returns a tuple (base_path, symlink, rest), with symlink == rest == None if

+ not symlink was found.

+ """

+ if base_dir:

+ assert relfile

+ assert os.path.isabs(base_dir)

+ index = 0

+ else:

+ assert os.path.isabs(relfile)

+ index = 1

+ def at_root(rest):

+ if base_dir:

+ return safe_join(base_dir, rest)

+ return rest

+ while True:

+ try:

+ index = relfile.index(os.path.sep, index)

+ except ValueError:

+ index = len(relfile)

+ full = at_root(relfile[:index])

+ if os.path.islink(full):

+ # A symlink!

+ base = os.path.dirname(relfile[:index])

+ symlink = os.path.basename(relfile[:index])

+ rest = relfile[index:]

+ logging.debug(

+ 'split_at_symlink(%s, %s) -> (%s, %s, %s)' %

+ (base_dir, relfile, base, symlink, rest))

+ return base, symlink, rest

+ if index == len(relfile):

+ break

+ index += 1

+ return relfile, None, None

+def fix_python_path(cmd):

+ """Returns the fixed command line to call the right python executable."""

+ out = cmd[:]

+ if out[0] == 'python':

+ out[0] = sys.executable

+ elif out[0].endswith('.py'):

+ out.insert(0, sys.executable)

+ return out

+def create_thunk():

+ handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')

+ os.write(

+ handle,

+ (

+ 'import subprocess\n'

+ 'import sys\n'

+ 'sys.exit(subprocess.call(sys.argv[2:]))\n'

+ ))

+ os.close(handle)

+ return name

+def strace_process_quoted_arguments(text):

+ """Extracts quoted arguments on a string and return the arguments as a list.

+ Implemented as an automaton. Supports incomplete strings in the form

+ '"foo"...'.

+ Example:

+ With text = '"foo", "bar"', the function will return ['foo', 'bar']

+ TODO(maruel): Implement escaping.

+ """

+ # All the possible states of the DFA.

+ ( NEED_QUOTE, # Begining of a new arguments.

+ INSIDE_STRING, # Inside an argument.

+ ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.

+ NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be

+ # a serie of 3 dots or a comma.

+ NEED_SPACE, # Right after a comma

+ NEED_DOT_2, # Found a dot, need a second one.

+ NEED_DOT_3, # Found second dot, need a third one.

+ NEED_COMMA, # Found third dot, need a comma.

+ ) = range(8)

+ state = NEED_QUOTE

+ out = []

+ for index, char in enumerate(text):

+ if char == '"':

+ if state == NEED_QUOTE:

+ state = INSIDE_STRING

+ # A new argument was found.

+ out.append('')

+ elif state == INSIDE_STRING:

+ # The argument is now closed.

+ state = NEED_COMMA_OR_DOT

+ elif state == ESCAPED:

+ out[-1] += char

+ state = INSIDE_STRING

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ elif char == ',':

+ if state in (NEED_COMMA_OR_DOT, NEED_COMMA):

+ state = NEED_SPACE

+ elif state == INSIDE_STRING:

+ out[-1] += char

+ elif state == ESCAPED:

+ out[-1] += char

+ state = INSIDE_STRING

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ elif char == ' ':

+ if state == NEED_SPACE:

+ state = NEED_QUOTE

+ elif state == INSIDE_STRING:

+ out[-1] += char

+ elif state == ESCAPED:

+ out[-1] += char

+ state = INSIDE_STRING

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ elif char == '.':

+ if state == NEED_COMMA_OR_DOT:

+ # The string is incomplete, this mean the strace -s flag should be

+ # increased.

+ state = NEED_DOT_2

+ elif state == NEED_DOT_2:

+ state = NEED_DOT_3

+ elif state == NEED_DOT_3:

+ state = NEED_COMMA

+ elif state == INSIDE_STRING:

+ out[-1] += char

+ elif state == ESCAPED:

+ out[-1] += char

+ state = INSIDE_STRING

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ elif char == '\\':

+ if state == ESCAPED:

+ out[-1] += char

+ state = INSIDE_STRING

+ elif state == INSIDE_STRING:

+ state = ESCAPED

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ else:

+ if state == INSIDE_STRING:

+ out[-1] += char

+ else:

+ raise ValueError(

+ 'Can\'t process char at column %d for: %r' % (index, text),

+ index,

+ text)

+ if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):

+ raise ValueError(

+ 'String is incorrectly terminated: %r' % text,

+ text)

+ return out

+def read_json(filepath):

+ with open(filepath, 'r') as f:

+ return json.load(f)

+def write_json(filepath_or_handle, data, dense):

+ """Writes data into filepath or file handle encoded as json.

+ If dense is True, the json is packed. Otherwise, it is human readable.

+ """

+ if hasattr(filepath_or_handle, 'write'):

+ if dense:

+ filepath_or_handle.write(

+ json.dumps(data, sort_keys=True, separators=(',',':')))

+ else:

+ filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))

+ else:

+ with open(filepath_or_handle, 'wb') as f:

+ if dense:

+ json.dump(data, f, sort_keys=True, separators=(',',':'))

+ else:

+ json.dump(data, f, sort_keys=True, indent=2)

+class Results(object):

+ """Results of a trace session."""

+ class _TouchedObject(object):

+ """Something, a file or a directory, that was accessed."""

+ def __init__(self, root, path, tainted, size, nb_files):

+ logging.debug(

+ '%s(%s, %s, %s, %s, %s)' %

+ (self.__class__.__name__, root, path, tainted, size, nb_files))

+ self.root = root

+ self.path = path

+ self.tainted = tainted

+ self.nb_files = nb_files

+ # Can be used as a cache or a default value, depending on context.

+ self._size = size

+ # These are cache only.

+ self._real_path = None

+ # Check internal consistency.

+ assert path, path

+ assert tainted or bool(root) != bool(isabs(path)), (root, path)

+ assert tainted or (

+ not os.path.exists(self.full_path) or

+ (self.full_path == get_native_path_case(self.full_path))), (

+ tainted, self.full_path, get_native_path_case(self.full_path))

+ @property

+ def existent(self):

+ return self.size != -1

+ @property

+ def full_path(self):

+ if self.root:

+ return os.path.join(self.root, self.path)

+ return self.path

+ @property

+ def real_path(self):

+ """Returns the path with symlinks resolved."""

+ if not self._real_path:

+ self._real_path = os.path.realpath(self.full_path)

+ return self._real_path

+ @property

+ def size(self):

+ """File's size. -1 is not existent."""

+ if self._size is None and not self.tainted:

+ try:

+ self._size = os.stat(self.full_path).st_size

+ except OSError:

+ self._size = -1

+ return self._size

+ def flatten(self):

+ """Returns a dict representing this object.

+ A 'size' of 0 means the file was only touched and not read.

+ """

+ return {

+ 'path': self.path,

+ 'size': self.size,

+ }

+ def replace_variables(self, variables):

+ """Replaces the root of this File with one of the variables if it matches.

+ If a variable replacement occurs, the cloned object becomes tainted.

+ """

+ for variable, root_path in variables.iteritems():

+ if self.path.startswith(root_path):

+ return self._clone(

+ self.root, variable + self.path[len(root_path):], True)

+ # No need to clone, returns ourself.

+ return self

+ def strip_root(self, root):

+ """Returns a clone of itself with 'root' stripped off."""

+ # Check internal consistency.

+ assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root

+ if not self.full_path.startswith(root):

+ # Now try to resolve the symlinks to see if it can be reached this way.

+ # Only try *after* trying without resolving symlink.

+ if not self.real_path.startswith(root):

+ return None

+ path = self.real_path

+ else:

+ path = self.full_path

+ return self._clone(root, path[len(root):], self.tainted)

+ def _clone(self, new_root, new_path, tainted):

+ raise NotImplementedError(self.__class__.__name__)

+ class File(_TouchedObject):

+ """A file that was accessed. May not be present anymore.

+ If tainted is true, it means it is not a real path anymore as a variable

+ replacement occured.

+ If touched_only is True, this means the file was probed for existence, and

+ it is existent, but was never _opened_. If touched_only is True, the file

+ must have existed.

+ """

+ def __init__(self, root, path, tainted, size):

+ super(Results.File, self).__init__(root, path, tainted, size, 1)

+ def _clone(self, new_root, new_path, tainted):

+ """Clones itself keeping meta-data."""

+ # Keep the self.size and self._real_path caches for performance reason. It

+ # is also important when the file becomes tainted (with a variable instead

+ # of the real path) since self.path is not an on-disk path anymore so

+ # out._size cannot be updated.

+ out = self.__class__(new_root, new_path, tainted, self.size)

+ out._real_path = self._real_path

+ return out

+ class Directory(_TouchedObject):

+ """A directory of files. Must exist."""

+ def __init__(self, root, path, tainted, size, nb_files):

+ """path='.' is a valid value and must be handled appropriately."""

+ assert not path.endswith(os.path.sep), path

+ super(Results.Directory, self).__init__(

+ root, path + os.path.sep, tainted, size, nb_files)

+ # For a Directory instance, self.size is not a cache, it's an actual value

+ # that is never modified and represents the total size of the files

+ # contained in this directory. It is possible that the directory is empty

+ # so that size == 0; this happens if there's only an invalid symlink in

+ # it.

+ def flatten(self):

+ out = super(Results.Directory, self).flatten()

+ out['nb_files'] = self.nb_files

+ return out

+ def _clone(self, new_root, new_path, tainted):

+ """Clones itself keeping meta-data."""

+ out = self.__class__(

+ new_root,

+ new_path.rstrip(os.path.sep),

+ tainted,

+ self.size,

+ self.nb_files)

+ out._real_path = self._real_path

+ return out

+ class Process(object):

+ """A process that was traced.

+ Contains references to the files accessed by this process and its children.

+ """

+ def __init__(self, pid, files, executable, command, initial_cwd, children):

+ logging.debug('Process(%s, %d, ...)' % (pid, len(files)))

+ self.pid = pid

+ self.files = sorted(files, key=lambda x: x.path)

+ self.children = children

+ self.executable = executable

+ self.command = command

+ self.initial_cwd = initial_cwd

+ # Check internal consistency.

+ assert len(set(f.path for f in self.files)) == len(self.files), sorted(

+ f.path for f in self.files)

+ assert isinstance(self.children, list)

+ assert isinstance(self.files, list)

+ @property

+ def all(self):

+ for child in self.children:

+ for i in child.all:

+ yield i

+ yield self

+ def flatten(self):

+ return {

+ 'children': [c.flatten() for c in self.children],

+ 'command': self.command,

+ 'executable': self.executable,

+ 'files': [f.flatten() for f in self.files],

+ 'initial_cwd': self.initial_cwd,

+ 'pid': self.pid,

+ }

+ def strip_root(self, root):

+ assert isabs(root) and root.endswith(os.path.sep), root

+ # Loads the files after since they are constructed as objects.

+ out = self.__class__(

+ self.pid,

+ filter(None, (f.strip_root(root) for f in self.files)),

+ self.executable,

+ self.command,

+ self.initial_cwd,

+ [c.strip_root(root) for c in self.children])

+ logging.debug(

+ 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))

+ return out

+ def __init__(self, process):

+ self.process = process

+ # Cache.

+ self._files = None

+ def flatten(self):

+ return {

+ 'root': self.process.flatten(),

+ }

+ @property

+ def files(self):

+ if self._files is None:

+ self._files = sorted(

+ sum((p.files for p in self.process.all), []),

+ key=lambda x: x.path)

+ return self._files

+ @property

+ def existent(self):

+ return [f for f in self.files if f.existent]

+ @property

+ def non_existent(self):

+ return [f for f in self.files if not f.existent]

+ def strip_root(self, root):

+ """Returns a clone with all the files outside the directory |root| removed

+ and converts all the path to be relative paths.

+ """

+ # Resolve any symlink

+ root = os.path.realpath(root)

+ root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep

+ logging.debug('strip_root(%s)' % root)

+ return Results(self.process.strip_root(root))

+class ApiBase(object):

+ """OS-agnostic API to trace a process and its children."""

+ class Context(object):

+ """Processes one log line at a time and keeps the list of traced processes.

+ The parsing is complicated by the fact that logs are traced out of order for

+ strace but in-order for dtrace and logman. In addition, on Windows it is

+ very frequent that processids are reused so a flat list cannot be used. But

+ at the same time, it is impossible to faithfully construct a graph when the

+ logs are processed out of order. So both a tree and a flat mapping are used,

+ the tree is the real process tree, while the flat mapping stores the last

+ valid process for the corresponding processid. For the strace case, the

+ tree's head is guessed at the last moment.

+ """

+ class Process(object):

+ """Keeps context for one traced child process.

+ Logs all the files this process touched. Ignores directories.

+ """

+ def __init__(self, blacklist, pid, initial_cwd):

+ # Check internal consistency.

+ assert isinstance(pid, int), repr(pid)

+ self.pid = pid

+ # children are Process instances.

+ self.children = []

+ self.initial_cwd = initial_cwd

+ self.cwd = None

+ self.files = set()

+ self.only_touched = set()

+ self.executable = None

+ self.command = None

+ self._blacklist = blacklist

+ def to_results_process(self):

+ """Resolves file case sensitivity and or late-bound strings."""

+ # When resolving files, it's normal to get dupe because a file could be

+ # opened multiple times with different case. Resolve the deduplication

+ # here.

+ def fix_path(x):

+ """Returns the native file path case.

+ Converts late-bound strings.

+ """

+ if not x:

+ # Do not convert None instance to 'None'.

+ return x

+ # TODO(maruel): Do not upconvert to unicode here, on linux we don't

+ # know the file path encoding so they must be treated as bytes.

+ x = unicode(x)

+ if os.path.isabs(x):

+ # If the path is not absolute, which tends to happen occasionally on

+ # Windows, it is not possible to get the native path case so ignore

+ # that trace. It mostly happens for 'executable' value.

+ x = get_native_path_case(x)

+ return x

+ def fix_and_blacklist_path(x):

+ x = fix_path(x)

+ if not x:

+ return

+ # The blacklist needs to be reapplied, since path casing could

+ # influence blacklisting.

+ if self._blacklist(x):

+ return

+ return x

+ # Filters out directories. Some may have passed through.

+ files = set(f for f in map(fix_and_blacklist_path, self.files) if f)

+ only_touched = set(

+ f for f in map(fix_and_blacklist_path, self.only_touched) if f)

+ only_touched -= files

+ files = [

+ Results.File(None, f, False, None) for f in files

+ if not os.path.isdir(f)

+ ]

+ # Using 0 as size means the file's content is ignored since the file was

+ # never opened for I/O.

+ files.extend(

+ Results.File(None, f, False, 0) for f in only_touched

+ if not os.path.isdir(f)

+ )

+ return Results.Process(

+ self.pid,

+ files,

+ fix_path(self.executable),

+ self.command,

+ fix_path(self.initial_cwd),

+ [c.to_results_process() for c in self.children])

+ def add_file(self, filepath, touch_only):

+ """Adds a file if it passes the blacklist."""

+ if self._blacklist(unicode(filepath)):

+ return

+ logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only))

+ # Note that filepath and not unicode(filepath) is added. It is because

+ # filepath could be something else than a string, like a RelativePath

+ # instance for dtrace logs.

+ if touch_only:

+ self.only_touched.add(filepath)

+ else:

+ self.files.add(filepath)

+ def __init__(self, blacklist):

+ self.blacklist = blacklist

+ # Initial process.

+ self.root_process = None

+ # dict to accelerate process lookup, to not have to lookup the whole graph

+ # each time.

+ self._process_lookup = {}

+ class Tracer(object):

+ """During it's lifetime, the tracing subsystem is enabled."""

+ def __init__(self, logname):

+ self._logname = logname

+ self._lock = threading.Lock()

+ self._traces = []

+ self._initialized = True

+ def trace(self, cmd, cwd, tracename, output):

+ """Runs the OS-specific trace program on an executable.

+ Arguments:

+ - cmd: The command (a list) to run.

+ - cwd: Current directory to start the child process in.

+ - tracename: Name of the trace in the logname file.

+ - output: If False, redirects output to PIPEs.

+ Returns a tuple (resultcode, output) and updates the internal trace

+ entries.

+ """

+ # The implementation adds an item to self._traces.

+ raise NotImplementedError(self.__class__.__name__)

+ def close(self, _timeout=None):

+ """Saves the meta-data in the logname file.

+ For kernel-based tracing, stops the tracing subsystem.

+ Must not be used manually when using 'with' construct.

+ """

+ with self._lock:

+ assert self._initialized

+ try:

+ data = {

+ 'traces': self._traces,

+ }

+ write_json(self._logname, data, False)

+ finally:

+ self._initialized = False

+ def post_process_log(self):

+ """Post-processes the log so it becomes faster to load afterward.

+ Must not be used manually when using 'with' construct.

+ """

+ assert not self._initialized, 'Must stop tracing first.'

+ def __enter__(self):

+ """Enables 'with' statement."""

+ return self

+ def __exit__(self, exc_type, exc_value, traceback):

+ """Enables 'with' statement."""

+ self.close()

+ # If an exception was thrown, do not process logs.

+ if not exc_type:

+ self.post_process_log()

+ def get_tracer(self, logname):

+ """Returns an ApiBase.Tracer instance.

+ Initializes the tracing subsystem, which is a requirement for kernel-based

+ tracers. Only one tracer instance should be live at a time!

+ logname is the filepath to the json file that will contain the meta-data

+ about the logs.

+ """

+ return self.Tracer(logname)

+ @staticmethod

+ def clean_trace(logname):

+ """Deletes an old log."""

+ raise NotImplementedError()

+ @classmethod

+ def parse_log(cls, logname, blacklist):

+ """Processes trace logs and returns the files opened and the files that do

+ not exist.

+ It does not track directories.

+ Most of the time, files that do not exist are temporary test files that

+ should be put in /tmp instead. See http://crbug.com/116251.

+ Returns a list of dict with keys:

+ - results: A Results instance.

+ - trace: The corresponding tracename parameter provided to

+ get_tracer().trace().

+ - output: Output gathered during execution, if get_tracer().trace(...,

+ output=False) was used.

+ """

+ raise NotImplementedError(cls.__class__.__name__)

+class Strace(ApiBase):

+ """strace implies linux."""

+ class Context(ApiBase.Context):

+ """Processes a strace log line and keeps the list of existent and non

+ existent files accessed.

+ Ignores directories.

+ Uses late-binding to processes the cwd of each process. The problem is that

+ strace generates one log file per process it traced but doesn't give any

+ information about which process was started when and by who. So we don't

+ even know which process is the initial one. So process the logs out of

+ order and use late binding with RelativePath to be able to deduce the

+ initial directory of each process once all the logs are parsed.

+ """

+ class Process(ApiBase.Context.Process):

+ """Represents the state of a process.

+ Contains all the information retrieved from the pid-specific log.

+ """

+ # Function names are using ([a-z_0-9]+)

+ # This is the most common format. function(args) = result

+ RE_HEADER = re.compile(r'^([a-z_0-9]+)$(.+?)$\s+= (.+)$')

+ # An interrupted function call, only grab the minimal header.

+ RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')

+ # A resumed function call.

+ RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')

+ # A process received a signal.

+ RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')

+ # A process didn't handle a signal. Ignore any junk appearing before,

+ # because the process was forcibly killed so it won't open any new file.

+ RE_KILLED = re.compile(

+ r'^.*\+\+\+ killed by ([A-Z]+)( $core dumped$)? \+\+\+$')

+ # The process has exited.

+ RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')

+ # A call was canceled. Ignore any prefix.

+ RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')

+ # Happens when strace fails to even get the function name.

+ UNNAMED_FUNCTION = '????'

+ # Corner-case in python, a class member function decorator must not be

+ # @staticmethod.

+ def parse_args(regexp, expect_zero): # pylint: disable=E0213

+ """Automatically convert the str 'args' into a list of processed

+ arguments.

+ Arguments:

+ - regexp is used to parse args.

+ - expect_zero: one of True, False or None.

+ - True: will check for result.startswith('0') first and will ignore

+ the trace line completely otherwise. This is important because for

+ many functions, the regexp will not process if the call failed.

+ - False: will check for not result.startswith(('?', '-1')) for the

+ same reason than with True.

+ - None: ignore result.

+ """

+ def meta_hook(function):

+ assert function.__name__.startswith('handle_')

+ def hook(self, args, result):

+ if expect_zero is True and not result.startswith('0'):

+ return

+ if expect_zero is False and result.startswith(('?', '-1')):

+ return

+ match = re.match(regexp, args)

+ if not match:

+ raise TracingFailure(

+ 'Failed to parse %s(%s) = %s' %

+ (function.__name__[len('handle_'):], args, result),

+ None, None, None)

+ return function(self, match.groups(), result)

+ return hook

+ return meta_hook

+ class RelativePath(object):

+ """A late-bound relative path."""

+ def __init__(self, parent, value):

+ self.parent = parent

+ self.value = value

+ def render(self):

+ """Returns the current directory this instance is representing.

+ This function is used to return the late-bound value.

+ """

+ if self.value and self.value.startswith(u'/'):

+ # An absolute path.

+ return self.value

+ parent = self.parent.render() if self.parent else u'<None>'

+ if self.value:

+ return os.path.normpath(os.path.join(parent, self.value))

+ return parent

+ def __unicode__(self):

+ """Acts as a string whenever needed."""

+ return unicode(self.render())

+ def __str__(self):

+ """Acts as a string whenever needed."""

+ return str(self.render())

+ def __init__(self, root, pid):

+ """Keeps enough information to be able to guess the original process

+ root.

+ strace doesn't store which process was the initial process. So more

+ information needs to be kept so the graph can be reconstructed from the

+ flat map.

+ """

+ logging.info('%s(%d)' % (self.__class__.__name__, pid))

+ super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)

+ assert isinstance(root, ApiBase.Context)

+ self._root = weakref.ref(root)

+ # The dict key is the function name of the pending call, like 'open'

+ # or 'execve'.

+ self._pending_calls = {}

+ self._line_number = 0

+ # Current directory when the process started.

+ self.initial_cwd = self.RelativePath(self._root(), None)

+ self.parentid = None

+ def get_cwd(self):

+ """Returns the best known value of cwd."""

+ return self.cwd or self.initial_cwd

+ def render(self):

+ """Returns the string value of the RelativePath() object.

+ Used by RelativePath. Returns the initial directory and not the

+ current one since the current directory 'cwd' validity is time-limited.

+ The validity is only guaranteed once all the logs are processed.

+ """

+ return self.initial_cwd.render()

+ def on_line(self, line):

+ self._line_number += 1

+ if self.RE_SIGNAL.match(line):

+ # Ignore signals.

+ return

+ try:

+ match = self.RE_KILLED.match(line)

+ if match:

+ # Converts a '+++ killed by Foo +++' trace into an exit_group().

+ self.handle_exit_group(match.group(1), None)

+ return

+ match = self.RE_PROCESS_EXITED.match(line)

+ if match:

+ # Converts a '+++ exited with 1 +++' trace into an exit_group()

+ self.handle_exit_group(match.group(1), None)

+ return

+ match = self.RE_UNFINISHED.match(line)

+ if match:

+ if match.group(1) in self._pending_calls:

+ raise TracingFailure(

+ 'Found two unfinished calls for the same function',

+ None, None, None,

+ self._pending_calls)

+ self._pending_calls[match.group(1)] = (

+ match.group(1) + match.group(2))

+ return

+ match = self.RE_UNAVAILABLE.match(line)

+ if match:

+ # This usually means a process was killed and a pending call was

+ # canceled.

+ # TODO(maruel): Look up the last exit_group() trace just above and

+ # make sure any self._pending_calls[anything] is properly flushed.

+ return

+ match = self.RE_RESUMED.match(line)

+ if match:

+ if match.group(1) not in self._pending_calls:

+ raise TracingFailure(

+ 'Found a resumed call that was not logged as unfinished',

+ None, None, None,

+ self._pending_calls)

+ pending = self._pending_calls.pop(match.group(1))

+ # Reconstruct the line.

+ line = pending + match.group(2)

+ match = self.RE_HEADER.match(line)

+ if not match:

+ raise TracingFailure(

+ 'Found an invalid line: %s' % line,

+ None, None, None)

+ if match.group(1) == self.UNNAMED_FUNCTION:

+ return

+ # It's a valid line, handle it.

+ handler = getattr(self, 'handle_%s' % match.group(1), None)

+ if not handler:

+ self._handle_unknown(match.group(1), match.group(2), match.group(3))

+ return handler(match.group(2), match.group(3))

+ except TracingFailure, e:

+ # Hack in the values since the handler could be a static function.

+ e.pid = self.pid

+ e.line = line

+ e.line_number = self._line_number

+ # Re-raise the modified exception.

+ raise

+ except (KeyError, NotImplementedError, ValueError), e:

+ raise TracingFailure(

+ 'Trace generated a %s exception: %s' % (

+ e.__class__.__name__, str(e)),

+ self.pid,

+ self._line_number,

+ line,

+ e)

+ @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)

+ def handle_access(self, args, _result):

+ self._handle_file(args[0], True)

+ @parse_args(r'^\"(.+?)\"$', True)

+ def handle_chdir(self, args, _result):

+ """Updates cwd."""

+ self.cwd = self.RelativePath(self, args[0])

+ logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))

+ def handle_clone(self, _args, result):

+ """Transfers cwd."""

+ if result.startswith(('?', '-1')):

+ # The call failed.

+ return

+ # Update the other process right away.

+ childpid = int(result)

+ child = self._root().get_or_set_proc(childpid)

+ if child.parentid is not None or childpid in self.children:

+ raise TracingFailure(

+ 'Found internal inconsitency in process lifetime detection '

+ 'during a clone() call',

+ None, None, None)

+ # Copy the cwd object.

+ child.initial_cwd = self.get_cwd()

+ child.parentid = self.pid

+ # It is necessary because the logs are processed out of order.

+ self.children.append(child)

+ def handle_close(self, _args, _result):

+ pass

+ def handle_chmod(self, _args, _result):

+ pass

+ def handle_creat(self, _args, _result):

+ # Ignore files created, since they didn't need to exist.

+ pass

+ @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)

+ def handle_execve(self, args, _result):

+ # Even if in practice execve() doesn't returns when it succeeds, strace

+ # still prints '0' as the result.

+ filepath = args[0]

+ self._handle_file(filepath, False)

+ self.executable = self.RelativePath(self.get_cwd(), filepath)

+ self.command = strace_process_quoted_arguments(args[1])

+ def handle_exit_group(self, _args, _result):

+ """Removes cwd."""

+ self.cwd = None

+ def handle_fork(self, args, result):

+ self._handle_unknown('fork', args, result)

+ def handle_getcwd(self, _args, _result):

+ pass

+ @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)

+ def handle_link(self, args, _result):

+ self._handle_file(args[0], False)

+ self._handle_file(args[1], False)

+ @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)

+ def handle_lstat(self, args, _result):

+ self._handle_file(args[0], True)

+ def handle_mkdir(self, _args, _result):

+ pass

+ @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)

+ def handle_open(self, args, _result):

+ if 'O_DIRECTORY' in args[1]:

+ return

+ self._handle_file(args[0], False)

+ @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)

+ def handle_openat(self, args, _result):

+ if 'O_DIRECTORY' in args[2]:

+ return

+ if args[1] == 'AT_FDCWD':

+ self._handle_file(args[1], False)

+ else:

+ # TODO(maruel): Implement relative open if necessary instead of the

+ # AT_FDCWD flag, let's hope not since this means tracking all active

+ # directory handles.

+ raise Exception('Relative open via openat not implemented.')

+ @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)

+ def handle_readlink(self, args, _result):

+ self._handle_file(args[0], False)

+ @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)

+ def handle_rename(self, args, _result):

+ self._handle_file(args[0], False)

+ self._handle_file(args[1], False)

+ def handle_rmdir(self, _args, _result):

+ pass

+ def handle_setxattr(self, _args, _result):

+ pass

+ @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)

+ def handle_stat(self, args, _result):

+ self._handle_file(args[0], True)

+ def handle_symlink(self, _args, _result):

+ pass

+ def handle_unlink(self, _args, _result):

+ # In theory, the file had to be created anyway.

+ pass

+ def handle_statfs(self, _args, _result):

+ pass

+ def handle_vfork(self, args, result):

+ self._handle_unknown('vfork', args, result)

+ @staticmethod

+ def _handle_unknown(function, args, result):

+ raise TracingFailure(

+ 'Unexpected/unimplemented trace %s(%s)= %s' %

+ (function, args, result),

+ None, None, None)

+ def _handle_file(self, filepath, touch_only):

+ filepath = self.RelativePath(self.get_cwd(), filepath)

+ #assert not touch_only, unicode(filepath)

+ self.add_file(filepath, touch_only)

+ def __init__(self, blacklist, initial_cwd):

+ super(Strace.Context, self).__init__(blacklist)

+ self.initial_cwd = initial_cwd

+ def render(self):

+ """Returns the string value of the initial cwd of the root process.

+ Used by RelativePath.

+ """

+ return self.initial_cwd

+ def on_line(self, pid, line):

+ """Transfers control into the Process.on_line() function."""

+ self.get_or_set_proc(pid).on_line(line.strip())

+ def to_results(self):

+ """Finds back the root process and verify consistency."""

+ # TODO(maruel): Absolutely unecessary, fix me.

+ root = [p for p in self._process_lookup.itervalues() if not p.parentid]

+ if len(root) != 1:

+ raise TracingFailure(

+ 'Found internal inconsitency in process lifetime detection '

+ 'while finding the root process',

+ None,

+ sorted(p.pid for p in root))

+ self.root_process = root[0]

+ process = self.root_process.to_results_process()

+ if sorted(self._process_lookup) != sorted(p.pid for p in process.all):

+ raise TracingFailure(

+ 'Found internal inconsitency in process lifetime detection '

+ 'while looking for len(tree) == len(list)',

+ None,

+ sorted(self._process_lookup),

+ sorted(p.pid for p in process.all))

+ return Results(process)

+ def get_or_set_proc(self, pid):

+ """Returns the Context.Process instance for this pid or creates a new one.

+ """

+ if not pid or not isinstance(pid, int):

+ raise TracingFailure(

+ 'Unpexpected value for pid: %r' % pid,

+ pid,

+ None,

+ pid)

+ if pid not in self._process_lookup:

+ self._process_lookup[pid] = self.Process(self, pid)

+ return self._process_lookup[pid]

+ @classmethod

+ def traces(cls):

+ """Returns the list of all handled traces to pass this as an argument to

+ strace.

+ """

+ prefix = 'handle_'

+ return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]

+ class Tracer(ApiBase.Tracer):

+ MAX_LEN = 256

+ def trace(self, cmd, cwd, tracename, output):

+ """Runs strace on an executable."""

+ logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))

+ assert os.path.isabs(cmd[0]), cmd[0]

+ assert os.path.isabs(cwd), cwd

+ assert os.path.normpath(cwd) == cwd, cwd

+ with self._lock:

+ if not self._initialized:

+ raise TracingFailure(

+ 'Called Tracer.trace() on an unitialized object',

+ None, None, None, tracename)

+ assert tracename not in (i['trace'] for i in self._traces)

+ stdout = stderr = None

+ if output:

+ stdout = subprocess.PIPE

+ stderr = subprocess.STDOUT

+ # Ensure all file related APIs are hooked.

+ traces = ','.join(Strace.Context.traces() + ['file'])

+ trace_cmd = [

+ 'strace',

+ '-ff',

+ '-s', '%d' % self.MAX_LEN,

+ '-e', 'trace=%s' % traces,

+ '-o', self._logname + '.' + tracename,

+ ]

+ child = subprocess.Popen(

+ trace_cmd + cmd,

+ cwd=cwd,

+ stdin=subprocess.PIPE,

+ stdout=stdout,

+ stderr=stderr)

+ out = child.communicate()[0]

+ # TODO(maruel): Walk the logs and figure out the root process would

+ # simplify parsing the logs a *lot*.

+ with self._lock:

+ assert tracename not in (i['trace'] for i in self._traces)

+ self._traces.append(

+ {

+ 'cmd': cmd,

+ 'cwd': cwd,

+ # The pid of strace process, not very useful.

+ 'pid': child.pid,

+ 'trace': tracename,

+ 'output': out,

+ })

+ return child.returncode, out

+ @staticmethod

+ def clean_trace(logname):

+ if os.path.isfile(logname):

+ os.remove(logname)

+ # Also delete any pid specific file from previous traces.

+ for i in glob.iglob(logname + '.*'):

+ if i.rsplit('.', 1)[1].isdigit():

+ os.remove(i)

+ @classmethod

+ def parse_log(cls, logname, blacklist):

+ logging.info('parse_log(%s, %s)' % (logname, blacklist))

+ data = read_json(logname)

+ out = []

+ for item in data['traces']:

+ result = {

+ 'trace': item['trace'],

+ 'output': item['output'],

+ }

+ try:

+ context = cls.Context(blacklist, item['cwd'])

+ for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):

+ pid = pidfile.rsplit('.', 1)[1]

+ if pid.isdigit():

+ pid = int(pid)

+ # TODO(maruel): Load as utf-8

+ for line in open(pidfile, 'rb'):

+ context.on_line(pid, line)

+ result['results'] = context.to_results()

+ except TracingFailure, e:

+ result['exception'] = e

+ out.append(result)

+ return out

+class Dtrace(ApiBase):

+ """Uses DTrace framework through dtrace. Requires root access.

+ Implies Mac OSX.

+ dtruss can't be used because it has compatibility issues with python.

+ Also, the pid->cwd handling needs to be done manually since OSX has no way to

+ get the absolute path of the 'cwd' dtrace variable from the probe.

+ Also, OSX doesn't populate curpsinfo->pr_psargs properly, see

+ https://discussions.apple.com/thread/1980539. So resort to handling execve()

+ manually.

+ errno is not printed in the log since this implementation currently only cares

+ about files that were successfully opened.

+ """

+ class Context(ApiBase.Context):

+ # Format: index pid function(args)

+ RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)$(.*?)$$')

+ # Arguments parsing.

+ RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')

+ RE_CHDIR = re.compile(r'^\"(.+?)\"$')

+ RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')

+ RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')

+ RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')

+ RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')

+ O_DIRECTORY = 0x100000

+ class Process(ApiBase.Context.Process):

+ def __init__(self, *args):

+ super(Dtrace.Context.Process, self).__init__(*args)

+ self.cwd = self.initial_cwd

+ def __init__(self, blacklist, tracer_pid, initial_cwd):

+ logging.info(

+ '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd))

+ super(Dtrace.Context, self).__init__(blacklist)

+ # Process ID of the temporary script created by create_thunk().

+ self._tracer_pid = tracer_pid

+ self._initial_cwd = initial_cwd

+ self._line_number = 0

+ def on_line(self, line):

+ self._line_number += 1

+ match = self.RE_HEADER.match(line)

+ if not match:

+ raise TracingFailure(

+ 'Found malformed line: %s' % line,

+ None,

+ self._line_number,

+ line)

+ fn = getattr(

+ self,

+ 'handle_%s' % match.group(2).replace('-', '_'),

+ self._handle_ignored)

+ # It is guaranteed to succeed because of the regexp. Or at least I thought

+ # it would.

+ pid = int(match.group(1))

+ try:

+ return fn(pid, match.group(3))

+ except TracingFailure, e:

+ # Hack in the values since the handler could be a static function.

+ e.pid = pid

+ e.line = line

+ e.line_number = self._line_number

+ # Re-raise the modified exception.

+ raise

+ except (KeyError, NotImplementedError, ValueError), e:

+ raise TracingFailure(

+ 'Trace generated a %s exception: %s' % (

+ e.__class__.__name__, str(e)),

+ pid,

+ self._line_number,

+ line,

+ e)

+ def to_results(self):

+ process = self.root_process.to_results_process()

+ # Internal concistency check.

+ if sorted(self._process_lookup) != sorted(p.pid for p in process.all):

+ raise TracingFailure(

+ 'Found internal inconsitency in process lifetime detection '

+ 'while looking for len(tree) == len(list)',

+ None,

+ sorted(self._process_lookup),

+ sorted(p.pid for p in process.all))

+ return Results(process)

+ def handle_dtrace_BEGIN(self, _pid, args):

+ if not self.RE_DTRACE_BEGIN.match(args):

+ raise TracingFailure(

+ 'Found internal inconsitency in dtrace_BEGIN log line',

+ None, None, None)

+ def handle_proc_start(self, pid, args):

+ """Transfers cwd.

+ The dtrace script already takes care of only tracing the processes that

+ are child of the traced processes so there is no need to verify the

+ process hierarchy.

+ """

+ if pid in self._process_lookup:

+ raise TracingFailure(

+ 'Found internal inconsitency in proc_start: %d started two times' %

+ pid,

+ None, None, None)

+ match = self.RE_PROC_START.match(args)

+ if not match:

+ raise TracingFailure(

+ 'Failed to parse arguments: %s' % args,

+ None, None, None)

+ ppid = int(match.group(1))

+ if ppid == self._tracer_pid and not self.root_process:

+ proc = self.root_process = self.Process(

+ self.blacklist, pid, self._initial_cwd)

+ elif ppid in self._process_lookup:

+ proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)

+ self._process_lookup[ppid].children.append(proc)

+ else:

+ # Another process tree, ignore.

+ return

+ self._process_lookup[pid] = proc

+ logging.debug(

+ 'New child: %s -> %d cwd:%s' %

+ (ppid, pid, unicode(proc.initial_cwd)))

+ def handle_proc_exit(self, pid, _args):

+ """Removes cwd."""

+ if pid in self._process_lookup:

+ # self._tracer_pid is not traced itself and other traces run neither.

+ self._process_lookup[pid].cwd = None

+ def handle_execve(self, pid, args):

+ """Sets the process' executable.

+ TODO(maruel): Read command line arguments. See

+ https://discussions.apple.com/thread/1980539 for an example.

+ https://gist.github.com/1242279

+ Will have to put the answer at http://stackoverflow.com/questions/7556249.

+ :)

+ """

+ if not pid in self._process_lookup:

+ # Another process tree, ignore.

+ return

+ match = self.RE_EXECVE.match(args)

+ if not match:

+ raise TracingFailure(

+ 'Failed to parse arguments: %r' % args,

+ None, None, None)

+ proc = self._process_lookup[pid]

+ proc.executable = match.group(1)

+ proc.command = self.process_escaped_arguments(match.group(3))

+ if int(match.group(2)) != len(proc.command):

+ raise TracingFailure(

+ 'Failed to parse execve() arguments: %s' % args,

+ None, None, None)

+ def handle_chdir(self, pid, args):

+ """Updates cwd."""

+ if pid not in self._process_lookup:

+ # Another process tree, ignore.

+ return

+ cwd = self.RE_CHDIR.match(args).group(1)

+ if not cwd.startswith('/'):

+ cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)

+ logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))

+ else:

+ logging.debug('handle_chdir(%d, %s)' % (pid, cwd))

+ cwd2 = cwd

+ self._process_lookup[pid].cwd = cwd2

+ def handle_open_nocancel(self, pid, args):

+ """Redirects to handle_open()."""

+ return self.handle_open(pid, args)

+ def handle_open(self, pid, args):

+ if pid not in self._process_lookup:

+ # Another process tree, ignore.

+ return

+ match = self.RE_OPEN.match(args)

+ if not match:

+ raise TracingFailure(

+ 'Failed to parse arguments: %s' % args,

+ None, None, None)

+ flag = int(match.group(2), 16)

+ if self.O_DIRECTORY & flag == self.O_DIRECTORY:

+ # Ignore directories.

+ return

+ self._handle_file(pid, match.group(1))

+ def handle_rename(self, pid, args):

+ if pid not in self._process_lookup:

+ # Another process tree, ignore.

+ return

+ match = self.RE_RENAME.match(args)

+ if not match:

+ raise TracingFailure(

+ 'Failed to parse arguments: %s' % args,

+ None, None, None)

+ self._handle_file(pid, match.group(1))

+ self._handle_file(pid, match.group(2))

+ def _handle_file(self, pid, filepath):

+ if not filepath.startswith('/'):

+ filepath = os.path.join(self._process_lookup[pid].cwd, filepath)

+ # We can get '..' in the path.

+ filepath = os.path.normpath(filepath)

+ # Sadly, still need to filter out directories here;

+ # saw open_nocancel(".", 0, 0) = 0 lines.

+ if os.path.isdir(filepath):

+ return

+ self._process_lookup[pid].add_file(filepath, False)

+ def handle_ftruncate(self, pid, args):

+ """Just used as a signal to kill dtrace, ignoring."""

+ pass

+ @staticmethod

+ def _handle_ignored(pid, args):

+ """Is called for all the event traces that are not handled."""

+ raise NotImplementedError('Please implement me')

+ @staticmethod

+ def process_escaped_arguments(text):

+ """Extracts escaped arguments on a string and return the arguments as a

+ list.

+ Implemented as an automaton.

+ Example:

+ With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the

+ function will return ['python2.7', '-c', 'print("hi")]

+ """

+ if not text.endswith('\\0'):

+ raise ValueError('String is not null terminated: %r' % text, text)

+ text = text[:-2]

+ def unescape(x):

+ """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""

+ out = []

+ escaped = False

+ for i in x:

+ if i == '\\' and not escaped:

+ escaped = True

+ continue

+ escaped = False

+ out.append(i)

+ return ''.join(out)

+ return [unescape(i) for i in text.split('\\001')]

+ class Tracer(ApiBase.Tracer):

+ # pylint: disable=C0301

+ #

+ # To understand the following code, you'll want to take a look at:

+ # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html

+ # https://wikis.oracle.com/display/DTrace/Variables

+ # http://docs.oracle.com/cd/E19205-01/820-4221/

+ #

+ # 0. Dump all the valid probes into a text file. It is important, you

+ # want to redirect into a file and you don't want to constantly 'sudo'.

+ # $ sudo dtrace -l > probes.txt

+ #

+ # 1. Count the number of probes:

+ # $ wc -l probes.txt

+ # 81823 # On OSX 10.7, including 1 header line.

+ #

+ # 2. List providers, intentionally skipping all the 'syspolicy10925' and the

+ # likes and skipping the header with NR>1:

+ # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'

+ # dtrace

+ # fbt

+ # io

+ # ip

+ # lockstat

+ # mach_trap

+ # proc

+ # profile

+ # sched

+ # syscall

+ # tcp

+ # vminfo

+ #

+ # 3. List of valid probes:

+ # $ grep syscall probes.txt | less

+ # or use dtrace directly:

+ # $ sudo dtrace -l -P syscall | less

+ #

+ # trackedpid is an associative array where its value can be 0, 1 or 2.

+ # 0 is for untracked processes and is the default value for items not

+ # in the associative array.

+ # 1 is for tracked processes.

+ # 2 is for the script created by create_thunk() only. It is not tracked

+ # itself but all its decendants are.

+ #

+ # The script will kill itself only once waiting_to_die == 1 and

+ # current_processes == 0, so that both getlogin() was called and that

+ # all traced processes exited.

+ #

+ # TODO(maruel): Use cacheable predicates. See

+ # https://wikis.oracle.com/display/DTrace/Performance+Considerations

+ D_CODE = """

+ dtrace:::BEGIN {

+ waiting_to_die = 0;

+ current_processes = 0;

+ logindex = 0;

+ printf("%d %d %s_%s(\\"%s\\")\\n",

+ logindex, PID, probeprov, probename, SCRIPT);

+ logindex++;

+ }

+ proc:::start /trackedpid[ppid]/ {

+ trackedpid[pid] = 1;

+ current_processes += 1;

+ printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",

+ logindex, pid, probeprov, probename,

+ ppid,

+ execname,

+ current_processes);

+ logindex++;

+ }

+ /* Should use SCRIPT but there is no access to this variable at that

+ * point. */

+ proc:::start /ppid == PID && execname == "Python"/ {

+ trackedpid[pid] = 2;

+ current_processes += 1;

+ printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",

+ logindex, pid, probeprov, probename,

+ ppid,

+ execname,

+ current_processes);

+ logindex++;

+ }

+ proc:::exit /trackedpid[pid] &&

+ current_processes == 1 &&

+ waiting_to_die == 1/ {

+ trackedpid[pid] = 0;

+ current_processes -= 1;

+ printf("%d %d %s_%s(%d)\\n",

+ logindex, pid, probeprov, probename,

+ current_processes);

+ logindex++;

+ exit(0);

+ }

+ proc:::exit /trackedpid[pid]/ {

+ trackedpid[pid] = 0;

+ current_processes -= 1;

+ printf("%d %d %s_%s(%d)\\n",

+ logindex, pid, probeprov, probename,

+ current_processes);

+ logindex++;

+ }

+ /* Use an arcane function to detect when we need to die */

+ syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {

+ waiting_to_die = 1;

+ printf("%d %d %s()\\n", logindex, pid, probefunc);

+ logindex++;

+ }

+ syscall::ftruncate:entry /

+ pid == PID && arg0 == FILE_ID && current_processes == 0/ {

+ exit(0);

+ }

+ syscall::open*:entry /trackedpid[pid] == 1/ {

+ self->open_arg0 = arg0;

+ self->open_arg1 = arg1;

+ self->open_arg2 = arg2;

+ }

+ syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {

+ this->open_arg0 = copyinstr(self->open_arg0);

+ printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",

+ logindex, pid, probefunc,

+ this->open_arg0,

+ self->open_arg1,

+ self->open_arg2);

+ logindex++;

+ this->open_arg0 = 0;

+ }

+ syscall::open*:return /trackedpid[pid] == 1/ {

+ self->open_arg0 = 0;

+ self->open_arg1 = 0;

+ self->open_arg2 = 0;

+ }

+ syscall::rename:entry /trackedpid[pid] == 1/ {

+ self->rename_arg0 = arg0;

+ self->rename_arg1 = arg1;

+ }

+ syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {

+ this->rename_arg0 = copyinstr(self->rename_arg0);

+ this->rename_arg1 = copyinstr(self->rename_arg1);

+ printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",

+ logindex, pid, probefunc,

+ this->rename_arg0,

+ this->rename_arg1);

+ logindex++;

+ this->rename_arg0 = 0;

+ this->rename_arg1 = 0;

+ }

+ syscall::rename:return /trackedpid[pid] == 1/ {

+ self->rename_arg0 = 0;

+ self->rename_arg1 = 0;

+ }

+ /* Track chdir, it's painful because it is only receiving relative path.

+ */

+ syscall::chdir:entry /trackedpid[pid] == 1/ {

+ self->chdir_arg0 = arg0;

+ }

+ syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {

+ this->chdir_arg0 = copyinstr(self->chdir_arg0);

+ printf("%d %d %s(\\"%s\\")\\n",

+ logindex, pid, probefunc,

+ this->chdir_arg0);

+ logindex++;

+ this->chdir_arg0 = 0;

+ }

+ syscall::chdir:return /trackedpid[pid] == 1/ {

+ self->chdir_arg0 = 0;

+ }

+ """

+ # execve-specific code, tends to throw a lot of exceptions.

+ D_CODE_EXECVE = """

+ /* Finally what we care about! */

+ syscall::exec*:entry /trackedpid[pid]/ {

+ self->exec_arg0 = copyinstr(arg0);

+ /* Incrementally probe for a NULL in the argv parameter of execve() to

+ * figure out argc. */

+ /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was

+ * found. */

+ self->exec_argc = 0;

+ /* Probe for argc==1 */

+ this->exec_argv = (user_addr_t*)copyin(

+ arg1, sizeof(user_addr_t) * (self->exec_argc + 1));

+ self->exec_argc = this->exec_argv[self->exec_argc] ?

+ (self->exec_argc + 1) : self->exec_argc;

+ /* Probe for argc==2 */

+ this->exec_argv = (user_addr_t*)copyin(

+ arg1, sizeof(user_addr_t) * (self->exec_argc + 1));

+ self->exec_argc = this->exec_argv[self->exec_argc] ?

+ (self->exec_argc + 1) : self->exec_argc;

+ /* Probe for argc==3 */

+ this->exec_argv = (user_addr_t*)copyin(

+ arg1, sizeof(user_addr_t) * (self->exec_argc + 1));

+ self->exec_argc = this->exec_argv[self->exec_argc] ?

+ (self->exec_argc + 1) : self->exec_argc;

+ /* Probe for argc==4 */

+ this->exec_argv = (user_addr_t*)copyin(

+ arg1, sizeof(user_addr_t) * (self->exec_argc + 1));

+ self->exec_argc = this->exec_argv[self->exec_argc] ?

+ (self->exec_argc + 1) : self->exec_argc;

+ /* Copy the inputs strings since there is no guarantee they'll be

+ * present after the call completed. */

+ self->exec_argv0 = (self->exec_argc > 0) ?

+ copyinstr(this->exec_argv[0]) : "";

+ self->exec_argv1 = (self->exec_argc > 1) ?

+ copyinstr(this->exec_argv[1]) : "";

+ self->exec_argv2 = (self->exec_argc > 2) ?

+ copyinstr(this->exec_argv[2]) : "";

+ self->exec_argv3 = (self->exec_argc > 3) ?

+ copyinstr(this->exec_argv[3]) : "";

+ this->exec_argv = 0;

+ }

+ syscall::exec*:return /trackedpid[pid] && errno == 0/ {

+ /* We need to join strings here, as using multiple printf() would

+ * cause tearing when multiple threads/processes are traced.

+ * Since it is impossible to escape a string and join it to another one,

+ * like sprintf("%s%S", previous, more), use hackery.

+ * Each of the elements are split with a \\1. \\0 cannot be used because

+ * it is simply ignored. This will conflict with any program putting a

+ * \\1 in their execve() string but this should be "rare enough" */

+ this->args = "";

+ /* Process exec_argv[0] */

+ this->args = strjoin(

+ this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");

+ /* Process exec_argv[1] */

+ this->args = strjoin(

+ this->args, (self->exec_argc > 1) ? "\\1" : "");

+ this->args = strjoin(

+ this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");

+ /* Process exec_argv[2] */

+ this->args = strjoin(

+ this->args, (self->exec_argc > 2) ? "\\1" : "");

+ this->args = strjoin(

+ this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");

+ /* Process exec_argv[3] */

+ this->args = strjoin(

+ this->args, (self->exec_argc > 3) ? "\\1" : "");

+ this->args = strjoin(

+ this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");

+ /* Prints self->exec_argc to permits verifying the internal

+ * consistency since this code is quite fishy. */

+ printf("%d %d %s(\\"%s\\", [%d, %S])\\n",

+ logindex, pid, probefunc,

+ self->exec_arg0,

+ self->exec_argc,

+ this->args);

+ logindex++;

+ this->args = 0;

+ }

+ syscall::exec*:return /trackedpid[pid]/ {

+ self->exec_arg0 = 0;

+ self->exec_argc = 0;

+ self->exec_argv0 = 0;

+ self->exec_argv1 = 0;

+ self->exec_argv2 = 0;

+ self->exec_argv3 = 0;

+ }

+ """

+ # Code currently not used.

+ D_EXTRANEOUS = """

+ /* This is a good learning experience, since it traces a lot of things

+ * related to the process and child processes.

+ * Warning: it generates a gigantic log. For example, tracing

+ * "data/trace_inputs/child1.py --child" generates a 2mb log and takes

+ * several minutes to execute.

+ */

+ /*

+ mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {

+ printf("%d %d %s_%s() = %d\\n",

+ logindex, pid, probeprov, probefunc, errno);

+ logindex++;

+ }

+ proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {

+ printf("%d %d %s_%s() = %d\\n",

+ logindex, pid, probeprov, probefunc, errno);

+ logindex++;

+ }

+ sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {

+ printf("%d %d %s_%s() = %d\\n",

+ logindex, pid, probeprov, probefunc, errno);

+ logindex++;

+ }

+ syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {

+ printf("%d %d %s_%s() = %d\\n",

+ logindex, pid, probeprov, probefunc, errno);

+ logindex++;

+ }

+ vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {

+ printf("%d %d %s_%s() = %d\\n",

+ logindex, pid, probeprov, probefunc, errno);

+ logindex++;

+ }

+ */

+ /* TODO(maruel): *stat* functions and friends

+ syscall::access:return,

+ syscall::chdir:return,

+ syscall::chflags:return,

+ syscall::chown:return,

+ syscall::chroot:return,

+ syscall::getattrlist:return,

+ syscall::getxattr:return,

+ syscall::lchown:return,

+ syscall::lstat64:return,

+ syscall::lstat:return,

+ syscall::mkdir:return,

+ syscall::pathconf:return,

+ syscall::readlink:return,

+ syscall::removexattr:return,

+ syscall::setxattr:return,

+ syscall::stat64:return,

+ syscall::stat:return,

+ syscall::truncate:return,

+ syscall::unlink:return,

+ syscall::utimes:return,

+ */

+ """

+ def __init__(self, logname):

+ """Starts the log collection with dtrace.

+ Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so

+ this needs to wait for dtrace to be "warmed up".

+ """

+ super(Dtrace.Tracer, self).__init__(logname)

+ self._script = create_thunk()

+ # This unique dummy temp file is used to signal the dtrace script that it

+ # should stop as soon as all the child processes are done. A bit hackish

+ # but works fine enough.

+ self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(

+ prefix='trace_signal_file')

+ # Note: do not use the -p flag. It's useless if the initial process quits

+ # too fast, resulting in missing traces from the grand-children. The D

+ # code manages the dtrace lifetime itself.

+ trace_cmd = [

+ 'sudo',

+ 'dtrace',

+ # Use a larger buffer if getting 'out of scratch space' errors.

+ # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables

+ '-b', '10m',

+ '-x', 'dynvarsize=10m',

+ #'-x', 'dtrace_global_maxsize=1m',

+ '-x', 'evaltime=exec',

+ '-o', '/dev/stderr',

+ '-q',

+ '-n', self._get_dtrace_code(),

+ ]

+ with open(self._logname + '.log', 'wb') as logfile:

+ self._dtrace = subprocess.Popen(

+ trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)

+ logging.debug('Started dtrace pid: %d' % self._dtrace.pid)

+ # Reads until one line is printed, which signifies dtrace is up and ready.

+ with open(self._logname + '.log', 'rb') as logfile:

+ while 'dtrace_BEGIN' not in logfile.readline():

+ if self._dtrace.poll() is not None:

+ # Do a busy wait. :/

+ break

+ logging.debug('dtrace started')

+ def _get_dtrace_code(self):

+ """Setups the D code to implement child process tracking.

+ Injects the cookie in the script so it knows when to stop.

+ The script will detect any instance of the script created with

+ create_thunk() and will start tracing it.

+ """

+ return (

+ 'inline int PID = %d;\n'

+ 'inline string SCRIPT = "%s";\n'

+ 'inline int FILE_ID = %d;\n'

+ '\n'

+ '%s') % (

+ os.getpid(),

+ self._script,

+ self._dummy_file_id,

+ self.D_CODE) + self.D_CODE_EXECVE

+ def trace(self, cmd, cwd, tracename, output):

+ """Runs dtrace on an executable.

+ This dtruss is broken when it starts the process itself or when tracing

+ child processes, this code starts a wrapper process

+ generated with create_thunk() which starts the executable to trace.

+ """

+ logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))

+ assert os.path.isabs(cmd[0]), cmd[0]

+ assert os.path.isabs(cwd), cwd

+ assert os.path.normpath(cwd) == cwd, cwd

+ with self._lock:

+ if not self._initialized:

+ raise TracingFailure(

+ 'Called Tracer.trace() on an unitialized object',

+ None, None, None, tracename)

+ assert tracename not in (i['trace'] for i in self._traces)

+ # Starts the script wrapper to start the child process. This signals the

+ # dtrace script that this process is to be traced.

+ stdout = stderr = None

+ if output:

+ stdout = subprocess.PIPE

+ stderr = subprocess.STDOUT

+ child_cmd = [

+ sys.executable,

+ self._script,

+ tracename,

+ ]

+ # Call a dummy function so that dtrace knows I'm about to launch a process

+ # that needs to be traced.

+ # Yummy.

+ child = subprocess.Popen(

+ child_cmd + fix_python_path(cmd),

+ stdin=subprocess.PIPE,

+ stdout=stdout,

+ stderr=stderr,

+ cwd=cwd)

+ logging.debug('Started child pid: %d' % child.pid)

+ out = child.communicate()[0]

+ # This doesn't mean tracing is done, one of the grand-child process may

+ # still be alive. It will be tracked with the dtrace script.

+ with self._lock:

+ assert tracename not in (i['trace'] for i in self._traces)

+ self._traces.append(

+ {

+ 'cmd': cmd,

+ 'cwd': cwd,

+ # The pid of strace process, not very useful.

+ 'pid': child.pid,

+ 'trace': tracename,

+ 'output': out,

+ })

+ return child.returncode, out

+ def close(self, timeout=None):

+ """Terminates dtrace."""

+ logging.debug('close(%s)' % timeout)

+ try:

+ super(Dtrace.Tracer, self).close(timeout)

+ # Signal dtrace that it should stop now.

+ os.ftruncate(self._dummy_file_id, 0)

+ if timeout:

+ start = time.time()

+ # Use polling. :/

+ while (self._dtrace.poll() is None and

+ (time.time() - start) < timeout):

+ time.sleep(0.1)

+ self._dtrace.kill()

+ self._dtrace.wait()

+ finally:

+ # Make sure to kill it in any case.

+ if self._dtrace.poll() is None:

+ try:

+ self._dtrace.kill()

+ self._dtrace.wait()

+ except OSError:

+ pass

+ if self._dtrace.returncode != 0:

+ # Warn about any dtrace failure but basically ignore it.

+ print 'dtrace failure: %s' % self._dtrace.returncode

+ finally:

+ os.close(self._dummy_file_id)

+ os.remove(self._dummy_file_name)

+ os.remove(self._script)

+ def post_process_log(self):

+ """Sorts the log back in order when each call occured.

+ dtrace doesn't save the buffer in strict order since it keeps one buffer

+ per CPU.

+ """

+ super(Dtrace.Tracer, self).post_process_log()

+ logname = self._logname + '.log'

+ with open(logname, 'rb') as logfile:

+ lines = [l for l in logfile if l.strip()]

+ errors = [l for l in lines if l.startswith('dtrace:')]

+ if errors:

+ raise TracingFailure(

+ 'Found errors in the trace: %s' % '\n'.join(errors),

+ None, None, None, logname)

+ try:

+ lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))

+ except ValueError:

+ raise TracingFailure(

+ 'Found errors in the trace: %s' % '\n'.join(

+ l for l in lines if l.split(' ', 1)[0].isdigit()),

+ None, None, None, logname)

+ with open(logname, 'wb') as logfile:

+ logfile.write(''.join(lines))

+ @staticmethod

+ def clean_trace(logname):

+ for ext in ('', '.log'):

+ if os.path.isfile(logname + ext):

+ os.remove(logname + ext)

+ @classmethod

+ def parse_log(cls, logname, blacklist):

+ logging.info('parse_log(%s, ...)' % logname)

+ def blacklist_more(filepath):

+ # All the HFS metadata is in the form /.vol/...

+ return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)

+ data = read_json(logname)

+ out = []

+ for item in data['traces']:

+ context = cls.Context(blacklist_more, item['pid'], item['cwd'])

+ for line in open(logname + '.log', 'rb'):

+ context.on_line(line)

+ out.append(

+ {

+ 'results': context.to_results(),

+ 'trace': item['trace'],

+ 'output': item['output'],

+ })

+ return out

+class LogmanTrace(ApiBase):

+ """Uses the native Windows ETW based tracing functionality to trace a child

+ process.

+ Caveat: this implementations doesn't track cwd or initial_cwd. It is because

+ the Windows Kernel doesn't have a concept of 'current working directory' at

+ all. A Win32 process has a map of current directories, one per drive letter

+ and it is managed by the user mode kernel32.dll. In kernel, a file is always

+ opened relative to another file_object or as an absolute path. All the current

+ working directory logic is done in user mode.

+ """

+ class Context(ApiBase.Context):

+ """Processes a ETW log line and keeps the list of existent and non

+ existent files accessed.

+ Ignores directories.

+ """

+ # These indexes are for the stripped version in json.

+ EVENT_NAME = 0

+ TYPE = 1

+ PID = 2

+ TID = 3

+ PROCESSOR_ID = 4

+ TIMESTAMP = 5

+ USER_DATA = 6

+ class Process(ApiBase.Context.Process):

+ def __init__(self, *args):

+ super(LogmanTrace.Context.Process, self).__init__(*args)

+ # Handle file objects that succeeded.

+ self.file_objects = {}

+ def __init__(self, blacklist, tracer_pid):

+ logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid))

+ super(LogmanTrace.Context, self).__init__(blacklist)

+ self._drive_map = DosDriveMap()

+ # Threads mapping to the corresponding process id.

+ self._threads_active = {}

+ # Process ID of the tracer, e.g. tracer_inputs.py

+ self._tracer_pid = tracer_pid

+ self._line_number = 0

+ def on_line(self, line):

+ """Processes a json Event line."""

+ self._line_number += 1

+ try:

+ # By Opcode

+ handler = getattr(

+ self,

+ 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),

+ None)

+ if not handler:

+ raise TracingFailure(

+ 'Unexpected event %s_%s' % (

+ line[self.EVENT_NAME], line[self.TYPE]),

+ None, None, None)

+ handler(line)

+ except TracingFailure, e:

+ # Hack in the values since the handler could be a static function.

+ e.pid = line[self.PID]

+ e.line = line

+ e.line_number = self._line_number

+ # Re-raise the modified exception.

+ raise

+ except (KeyError, NotImplementedError, ValueError), e:

+ raise TracingFailure(

+ 'Trace generated a %s exception: %s' % (

+ e.__class__.__name__, str(e)),

+ line[self.PID],

+ self._line_number,

+ line,

+ e)

+ def to_results(self):

+ if not self.root_process:

+ raise TracingFailure(

+ 'Failed to detect the initial process',

+ None, None, None)

+ process = self.root_process.to_results_process()

+ return Results(process)

+ def _thread_to_process(self, tid):

+ """Finds the process from the thread id."""

+ tid = int(tid, 16)

+ pid = self._threads_active.get(tid)

+ if not pid or not self._process_lookup.get(pid):

+ return

+ return self._process_lookup[pid]

+ @classmethod

+ def handle_EventTrace_Header(cls, line):

+ """Verifies no event was dropped, e.g. no buffer overrun occured."""

+ BUFFER_SIZE = cls.USER_DATA

+ #VERSION = cls.USER_DATA + 1

+ #PROVIDER_VERSION = cls.USER_DATA + 2

+ #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3

+ #END_TIME = cls.USER_DATA + 4

+ #TIMER_RESOLUTION = cls.USER_DATA + 5

+ #MAX_FILE_SIZE = cls.USER_DATA + 6

+ #LOG_FILE_MODE = cls.USER_DATA + 7

+ #BUFFERS_WRITTEN = cls.USER_DATA + 8

+ #START_BUFFERS = cls.USER_DATA + 9

+ #POINTER_SIZE = cls.USER_DATA + 10

+ EVENTS_LOST = cls.USER_DATA + 11

+ #CPU_SPEED = cls.USER_DATA + 12

+ #LOGGER_NAME = cls.USER_DATA + 13

+ #LOG_FILE_NAME = cls.USER_DATA + 14

+ #BOOT_TIME = cls.USER_DATA + 15

+ #PERF_FREQ = cls.USER_DATA + 16

+ #START_TIME = cls.USER_DATA + 17

+ #RESERVED_FLAGS = cls.USER_DATA + 18

+ #BUFFERS_LOST = cls.USER_DATA + 19

+ #SESSION_NAME_STRING = cls.USER_DATA + 20

+ #LOG_FILE_NAME_STRING = cls.USER_DATA + 21

+ if line[EVENTS_LOST] != '0':

+ raise TracingFailure(

+ ( '%s events were lost during trace, please increase the buffer '

+ 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),

+ None, None, None)

+ def handle_FileIo_Cleanup(self, line):

+ """General wisdom: if a file is closed, it's because it was opened.

+ Note that FileIo_Close is not used since if a file was opened properly but

+ not closed before the process exits, only Cleanup will be logged.

+ """

+ #IRP = self.USER_DATA

+ TTID = self.USER_DATA + 1 # Thread ID, that's what we want.

+ FILE_OBJECT = self.USER_DATA + 2

+ #FILE_KEY = self.USER_DATA + 3

+ proc = self._thread_to_process(line[TTID])

+ if not proc:

+ # Not a process we care about.

+ return

+ file_object = line[FILE_OBJECT]

+ if file_object in proc.file_objects:

+ proc.add_file(proc.file_objects.pop(file_object), False)

+ def handle_FileIo_Create(self, line):

+ """Handles a file open.

+ All FileIo events are described at

+ http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx

+ for some value of 'description'.

+ " (..) process and thread id values of the IO events (..) are not valid "

+ http://msdn.microsoft.com/magazine/ee358703.aspx

+ The FileIo.Create event doesn't return if the CreateFile() call

+ succeeded, so keep track of the file_object and check that it is

+ eventually closed with FileIo_Cleanup.

+ """

+ #IRP = self.USER_DATA

+ TTID = self.USER_DATA + 1 # Thread ID, that's what we want.

+ FILE_OBJECT = self.USER_DATA + 2

+ #CREATE_OPTIONS = self.USER_DATA + 3

+ #FILE_ATTRIBUTES = self.USER_DATA + 4

+ #self.USER_DATA + SHARE_ACCESS = 5

+ OPEN_PATH = self.USER_DATA + 6

+ proc = self._thread_to_process(line[TTID])

+ if not proc:

+ # Not a process we care about.

+ return

+ match = re.match(r'^\"(.+)\"$', line[OPEN_PATH])

+ raw_path = match.group(1)

+ # Ignore directories and bare drive right away.

+ if raw_path.endswith(os.path.sep):

+ return

+ filepath = self._drive_map.to_win32(raw_path)

+ # Ignore bare drive right away. Some may still fall through with format

+ # like '\\?\X:'

+ if len(filepath) == 2:

+ return

+ file_object = line[FILE_OBJECT]

+ if os.path.isdir(filepath):

+ # There is no O_DIRECTORY equivalent on Windows. The closed is

+ # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So

+ # simply discard directories are they are found.

+ return

+ # Override any stale file object

+ proc.file_objects[file_object] = filepath

+ def handle_FileIo_Rename(self, line):

+ # TODO(maruel): Handle?

+ pass

+ def handle_Process_End(self, line):

+ pid = line[self.PID]

+ if self._process_lookup.get(pid):

+ logging.info('Terminated: %d' % pid)

+ self._process_lookup[pid] = None

+ else:

+ logging.debug('Terminated: %d' % pid)

+ def handle_Process_Start(self, line):

+ """Handles a new child process started by PID."""

+ #UNIQUE_PROCESS_KEY = self.USER_DATA

+ PROCESS_ID = self.USER_DATA + 1

+ #PARENT_PID = self.USER_DATA + 2

+ #SESSION_ID = self.USER_DATA + 3

+ #EXIT_STATUS = self.USER_DATA + 4

+ #DIRECTORY_TABLE_BASE = self.USER_DATA + 5

+ #USER_SID = self.USER_DATA + 6

+ IMAGE_FILE_NAME = self.USER_DATA + 7

+ COMMAND_LINE = self.USER_DATA + 8

+ ppid = line[self.PID]

+ pid = int(line[PROCESS_ID], 16)

+ logging.debug(

+ 'New process %d->%d (%s) %s' %

+ (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE]))

+ if ppid == self._tracer_pid:

+ # Need to ignore processes we don't know about because the log is

+ # system-wide. self._tracer_pid shall start only one process.

+ if self.root_process:

+ raise TracingFailure(

+ ( 'Parent process is _tracer_pid(%d) but root_process(%d) is '

+ 'already set') % (self._tracer_pid, self.root_process.pid),

+ None, None, None)

+ proc = self.Process(self.blacklist, pid, None)

+ self.root_process = proc

+ ppid = None

+ elif self._process_lookup.get(ppid):

+ proc = self.Process(self.blacklist, pid, None)

+ self._process_lookup[ppid].children.append(proc)

+ else:

+ # Ignore

+ return

+ self._process_lookup[pid] = proc

+ if (not line[IMAGE_FILE_NAME].startswith('"') or

+ not line[IMAGE_FILE_NAME].endswith('"')):

+ raise TracingFailure(

+ 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME],

+ None, None, None)

+ # TODO(maruel): Process escapes.

+ if (not line[COMMAND_LINE].startswith('"') or

+ not line[COMMAND_LINE].endswith('"')):

+ raise TracingFailure(

+ 'Command line is not properly quoted: %s' % line[COMMAND_LINE],

+ None, None, None)

+ proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1])

+ proc.executable = line[IMAGE_FILE_NAME][1:-1]

+ # proc.command[0] may be the absolute path of 'executable' but it may be

+ # anything else too. If it happens that command[0] ends with executable,

+ # use it, otherwise defaults to the base name.

+ cmd0 = proc.command[0].lower()

+ if not cmd0.endswith('.exe'):

+ # TODO(maruel): That's not strictly true either.

+ cmd0 += '.exe'

+ if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):

+ # Fix the path.

+ cmd0 = cmd0.replace('/', os.path.sep)

+ cmd0 = os.path.normpath(cmd0)

+ proc.executable = get_native_path_case(cmd0)

+ logging.info(

+ 'New child: %s -> %d %s' % (ppid, pid, proc.executable))

+ def handle_Thread_End(self, line):

+ """Has the same parameters as Thread_Start."""

+ tid = int(line[self.TID], 16)

+ self._threads_active.pop(tid, None)

+ def handle_Thread_Start(self, line):

+ """Handles a new thread created.

+ Do not use self.PID here since a process' initial thread is created by

+ the parent process.

+ """

+ PROCESS_ID = self.USER_DATA

+ TTHREAD_ID = self.USER_DATA + 1

+ #STACK_BASE = self.USER_DATA + 2

+ #STACK_LIMIT = self.USER_DATA + 3

+ #USER_STACK_BASE = self.USER_DATA + 4

+ #USER_STACK_LIMIT = self.USER_DATA + 5

+ #AFFINITY = self.USER_DATA + 6

+ #WIN32_START_ADDR = self.USER_DATA + 7

+ #TEB_BASE = self.USER_DATA + 8

+ #SUB_PROCESS_TAG = self.USER_DATA + 9

+ #BASE_PRIORITY = self.USER_DATA + 10

+ #PAGE_PRIORITY = self.USER_DATA + 11

+ #IO_PRIORITY = self.USER_DATA + 12

+ #THREAD_FLAGS = self.USER_DATA + 13

+ # Do not use self.PID here since a process' initial thread is created by

+ # the parent process.

+ pid = int(line[PROCESS_ID], 16)

+ tid = int(line[TTHREAD_ID], 16)

+ logging.debug('New thread pid:%d, tid:%d' % (pid, tid))

+ self._threads_active[tid] = pid

+ @classmethod

+ def supported_events(cls):

+ """Returns all the procesed events."""

+ out = []

+ for member in dir(cls):

+ match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)

+ if match:

+ out.append(match.groups())

+ return out

+ class Tracer(ApiBase.Tracer):

+ # The basic headers.

+ EXPECTED_HEADER = [

+ u'Event Name',

+ u'Type',

+ u'Event ID',

+ u'Version',

+ u'Channel',

+ u'Level', # 5

+ u'Opcode',

+ u'Task',

+ u'Keyword',

+ u'PID',

+ u'TID', # 10

+ u'Processor Number',

+ u'Instance ID',

+ u'Parent Instance ID',

+ u'Activity ID',

+ u'Related Activity ID', # 15

+ u'Clock-Time',

+ u'Kernel(ms)', # Both have a resolution of ~15ms which makes them

+ u'User(ms)', # pretty much useless.

+ u'User Data', # Extra arguments that are event-specific.

+ ]

+ # Only the useful headers common to all entries are listed there. Any column

+ # at 19 or higher is dependent on the specific event.

+ EVENT_NAME = 0

+ TYPE = 1

+ PID = 9

+ TID = 10

+ PROCESSOR_ID = 11

+ TIMESTAMP = 16

+ NULL_GUID = '{00000000-0000-0000-0000-000000000000}'

+ USER_DATA = 19

+ def __init__(self, logname):

+ """Starts the log collection.

+ Requires administrative access. logman.exe is synchronous so no need for a

+ "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID

+ instead. The GUID constant name is SystemTraceControlGuid. Lovely.

+ One can get the list of potentially interesting providers with:

+ "logman query providers | findstr /i file"

+ """

+ super(LogmanTrace.Tracer, self).__init__(logname)

+ self._script = create_thunk()

+ cmd_start = [

+ 'logman.exe',

+ 'start',

+ 'NT Kernel Logger',

+ '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',

+ # splitio,fileiocompletion,syscall,file,cswitch,img

+ '(process,fileio,thread)',

+ '-o', self._logname + '.etl',

+ '-ets', # Send directly to kernel

+ # Values extracted out of thin air.

+ # Event Trace Session buffer size in kb.

+ '-bs', '10240',

+ # Number of Event Trace Session buffers.

+ '-nb', '16', '256',

+ ]

+ logging.debug('Running: %s' % cmd_start)

+ try:

+ subprocess.check_call(

+ cmd_start,

+ stdin=subprocess.PIPE,

+ stdout=subprocess.PIPE,

+ stderr=subprocess.STDOUT)

+ except subprocess.CalledProcessError, e:

+ if e.returncode == -2147024891:

+ print >> sys.stderr, 'Please restart with an elevated admin prompt'

+ elif e.returncode == -2144337737:

+ print >> sys.stderr, (

+ 'A kernel trace was already running, stop it and try again')

+ raise

+ def trace(self, cmd, cwd, tracename, output):

+ logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))

+ assert os.path.isabs(cmd[0]), cmd[0]

+ assert os.path.isabs(cwd), cwd

+ assert os.path.normpath(cwd) == cwd, cwd

+ with self._lock:

+ if not self._initialized:

+ raise TracingFailure(

+ 'Called Tracer.trace() on an unitialized object',

+ None, None, None, tracename)

+ assert tracename not in (i['trace'] for i in self._traces)

+ # Use "logman -?" for help.

+ stdout = stderr = None

+ if output:

+ stdout = subprocess.PIPE

+ stderr = subprocess.STDOUT

+ # Run the child process.

+ logging.debug('Running: %s' % cmd)

+ # Use the temporary script generated with create_thunk() so we have a

+ # clear pid owner. Since trace_inputs.py can be used as a library and

+ # could trace multiple processes simultaneously, it makes it more complex

+ # if the executable to be traced is executed directly here. It also solves

+ # issues related to logman.exe that needs to be executed to control the

+ # kernel trace.

+ child_cmd = [

+ sys.executable,

+ self._script,

+ tracename,

+ ]

+ child = subprocess.Popen(

+ child_cmd + fix_python_path(cmd),

+ cwd=cwd,

+ stdin=subprocess.PIPE,

+ stdout=stdout,

+ stderr=stderr)

+ logging.debug('Started child pid: %d' % child.pid)

+ out = child.communicate()[0]

+ # This doesn't mean all the grand-children are done. Sadly, we don't have

+ # a good way to determine that.

+ with self._lock:

+ assert tracename not in (i['trace'] for i in self._traces)

+ self._traces.append({

+ 'command': cmd,

+ 'cwd': cwd,

+ 'pid': child.pid,

+ 'trace': tracename,

+ 'output': out,

+ })

+ return child.returncode, out

+ def close(self, _timeout=None):

+ """Stops the kernel log collection and converts the traces to text

+ representation.

+ """

+ with self._lock:

+ if not self._initialized:

+ raise TracingFailure(

+ 'Called Tracer.close() on an unitialized object',

+ None, None, None)

+ os.remove(self._script)

+ # Save metadata, add 'format' key..

+ data = {

+ 'format': 'csv',

+ 'traces': self._traces,

+ }

+ write_json(self._logname, data, False)

+ cmd_stop = [

+ 'logman.exe',

+ 'stop',

+ 'NT Kernel Logger',

+ '-ets', # Sends the command directly to the kernel.

+ ]

+ logging.debug('Running: %s' % cmd_stop)

+ subprocess.check_call(

+ cmd_stop,

+ stdin=subprocess.PIPE,

+ stdout=subprocess.PIPE,

+ stderr=subprocess.STDOUT)

+ self._initialized = False

+ def post_process_log(self):

+ """Converts the .etl file into .csv then into .json."""

+ super(LogmanTrace.Tracer, self).post_process_log()

+ logformat = 'csv'

+ self._convert_log(logformat)

+ if logformat == 'csv_utf16':

+ def load_file():

+ def utf_8_encoder(unicode_csv_data):

+ """Encodes the unicode object as utf-8 encoded str instance"""

+ for line in unicode_csv_data:

+ yield line.encode('utf-8')

+ def unicode_csv_reader(unicode_csv_data, **kwargs):

+ """Encodes temporarily as UTF-8 since csv module doesn't do unicode.

+ """

+ csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)

+ for row in csv_reader:

+ # Decode str utf-8 instances back to unicode instances, cell by

+ # cell:

+ yield [cell.decode('utf-8') for cell in row]

+ # The CSV file is UTF-16 so use codecs.open() to load the file into

+ # the python internal unicode format (utf-8). Then explicitly

+ # re-encode as utf8 as str instances so csv can parse it fine. Then

+ # decode the utf-8 str back into python unicode instances. This

+ # sounds about right.

+ for line in unicode_csv_reader(

+ codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')):

+ # line is a list of unicode objects

+ # So much white space!

+ yield [i.strip() for i in line]

+ elif logformat == 'csv':

+ def load_file():

+ def ansi_csv_reader(ansi_csv_data, **kwargs):

+ """Loads an 'ANSI' code page and returns unicode() objects."""

+ assert sys.getfilesystemencoding() == 'mbcs'

+ encoding = get_current_encoding()

+ for row in csv.reader(ansi_csv_data, **kwargs):

+ # Decode str 'ansi' instances to unicode instances, cell by cell:

+ yield [cell.decode(encoding) for cell in row]

+ # The fastest and smallest format but only supports 'ANSI' file paths.

+ # E.g. the filenames are encoding in the 'current' encoding.

+ for line in ansi_csv_reader(open(self._logname + '.' + logformat)):

+ # line is a list of unicode objects.

+ yield [i.strip() for i in line]

+ supported_events = LogmanTrace.Context.supported_events()

+ def trim(generator):

+ for index, line in enumerate(generator):

+ if not index:

+ if line != self.EXPECTED_HEADER:

+ raise TracingFailure(

+ 'Found malformed header: %s' % ' '.join(line),

+ None, None, None)

+ continue

+ # As you can see, the CSV is full of useful non-redundant information:

+ if (line[2] != '0' or # Event ID

+ line[3] not in ('2', '3') or # Version

+ line[4] != '0' or # Channel

+ line[5] != '0' or # Level

+ line[7] != '0' or # Task

+ line[8] != '0x0000000000000000' or # Keyword

+ line[12] != '' or # Instance ID

+ line[13] != '' or # Parent Instance ID

+ line[14] != self.NULL_GUID or # Activity ID

+ line[15] != ''): # Related Activity ID

+ raise TracingFailure(

+ 'Found unexpected values in line: %s' % ' '.join(line),

+ None, None, None)

+ if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:

+ continue

+ # Convert the PID in-place from hex.

+ line[self.PID] = int(line[self.PID], 16)

+ yield [

+ line[self.EVENT_NAME],

+ line[self.TYPE],

+ line[self.PID],

+ line[self.TID],

+ line[self.PROCESSOR_ID],

+ line[self.TIMESTAMP],

+ ] + line[self.USER_DATA:]

+ write_json('%s.json' % self._logname, list(trim(load_file())), True)

+ def _convert_log(self, logformat):

+ """Converts the ETL trace to text representation.

+ Normally, 'csv' is sufficient. If complex scripts are used (like eastern

+ languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.

+ Arguments:

+ - logformat: Text format to be generated, csv, csv_utf16 or xml.

+ Use "tracerpt -?" for help.

+ """

+ LOCALE_INVARIANT = 0x7F

+ windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)

+ cmd_convert = [

+ 'tracerpt.exe',

+ '-l', self._logname + '.etl',

+ '-o', self._logname + '.' + logformat,

+ '-gmt', # Use UTC

+ '-y', # No prompt

+ # Use -of XML to get the header of each items after column 19, e.g. all

+ # the actual headers of 'User Data'.

+ ]

+ if logformat == 'csv':

+ # tracerpt localizes the 'Type' column, for major brainfuck

+ # entertainment. I can't imagine any sane reason to do that.

+ cmd_convert.extend(['-of', 'CSV'])

+ elif logformat == 'csv_utf16':

+ # This causes it to use UTF-16, which doubles the log size but ensures

+ # the log is readable for non-ASCII characters.

+ cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])

+ elif logformat == 'xml':

+ cmd_convert.extend(['-of', 'XML'])

+ else:

+ raise ValueError('Unexpected log format \'%s\'' % logformat)

+ logging.debug('Running: %s' % cmd_convert)

+ # This can takes tens of minutes for large logs.

+ # Redirects all output to stderr.

+ subprocess.check_call(

+ cmd_convert,

+ stdin=subprocess.PIPE,

+ stdout=sys.stderr,

+ stderr=sys.stderr)

+ @staticmethod

+ def clean_trace(logname):

+ for ext in ('', '.csv', '.etl', '.json', '.xml'):

+ if os.path.isfile(logname + ext):

+ os.remove(logname + ext)

+ @classmethod

+ def parse_log(cls, logname, blacklist):

+ logging.info('parse_log(%s, %s)' % (logname, blacklist))

+ def blacklist_more(filepath):

+ # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.

+ return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)

+ data = read_json(logname)

+ lines = read_json(logname + '.json')

+ out = []

+ for item in data['traces']:

+ context = cls.Context(blacklist_more, item['pid'])

+ for line in lines:

+ context.on_line(line)

+ out.append(

+ {

+ 'results': context.to_results(),

+ 'trace': item['trace'],

+ 'output': item['output'],

+ })

+ return out

+def get_api():

+ """Returns the correct implementation for the current OS."""

+ if sys.platform == 'cygwin':

+ raise NotImplementedError(

+ 'Not implemented for cygwin, start the script from Win32 python')

+ flavors = {

+ 'win32': LogmanTrace,

+ 'darwin': Dtrace,

+ 'sunos5': Dtrace,

+ 'freebsd7': Dtrace,

+ 'freebsd8': Dtrace,

+ }

+ # Defaults to strace.

+ return flavors.get(sys.platform, Strace)()

+def extract_directories(root_dir, files, blacklist):

+ """Detects if all the files in a directory are in |files| and if so, replace

+ the individual files by a Results.Directory instance.

+ Takes a list of Results.File instances and returns a shorter list of

+ Results.File and Results.Directory instances.

+ Arguments:

+ - root_dir: Optional base directory that shouldn't be search further.

+ - files: list of Results.File instances.

+ - blacklist: regexp of files to ignore, for example r'.+\.pyc'.

+ """

+ logging.info(

+ 'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))

+ assert not (root_dir or '').endswith(os.path.sep), root_dir

+ assert not root_dir or (get_native_path_case(root_dir) == root_dir)

+ assert not any(isinstance(f, Results.Directory) for f in files)

+ # Remove non existent files.

+ files = [f for f in files if f.existent]

+ if not files:

+ return files

+ # All files must share the same root, which can be None.

+ assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

+ # Creates a {directory: {filename: File}} mapping, up to root.

+ buckets = {}

+ if root_dir:

+ buckets[root_dir] = {}

+ for fileobj in files:

+ path = fileobj.full_path

+ directory = os.path.dirname(path)

+ assert directory

+ # Do not use os.path.basename() so trailing os.path.sep is kept.

+ basename = path[len(directory)+1:]

+ files_in_directory = buckets.setdefault(directory, {})

+ files_in_directory[basename] = fileobj

+ # Add all the directories recursively up to root.

+ while True:

+ old_d = directory

+ directory = os.path.dirname(directory)

+ if directory + os.path.sep == root_dir or directory == old_d:

+ break

+ buckets.setdefault(directory, {})

+ root_prefix = len(root_dir) + 1 if root_dir else 0

+ for directory in sorted(buckets, reverse=True):

+ actual = set(f for f in os.listdir(directory) if not blacklist(f))

+ expected = set(buckets[directory])

+ if not (actual - expected):

+ parent = os.path.dirname(directory)

+ buckets[parent][os.path.basename(directory)] = Results.Directory(

+ root_dir,

+ directory[root_prefix:],

+ False,

+ sum(f.size for f in buckets[directory].itervalues()),

+ sum(f.nb_files for f in buckets[directory].itervalues()))

+ # Remove the whole bucket.

+ del buckets[directory]

+ # Reverse the mapping with what remains. The original instances are returned,

+ # so the cached meta data is kept.

+ files = sum((x.values() for x in buckets.itervalues()), [])

+ return sorted(files, key=lambda x: x.path)

+def trace(logfile, cmd, cwd, api, output):

+ """Traces an executable. Returns (returncode, output) from api.

+ Arguments:

+ - logfile: file to write to.

+ - cmd: command to run.

+ - cwd: current directory to start the process in.

+ - api: a tracing api instance.

+ - output: if True, returns output, otherwise prints it at the console.

+ """

+ cmd = fix_python_path(cmd)

+ api.clean_trace(logfile)

+ with api.get_tracer(logfile) as tracer:

+ return tracer.trace(cmd, cwd, 'default', output)

+def load_trace(logfile, root_dir, api, blacklist):

+ """Loads a trace file and returns the Results instance.

+ Arguments:

+ - logfile: File to load.

+ - root_dir: Root directory to use to determine if a file is relevant to the

+ trace or not.

+ - api: A tracing api instance.

+ - blacklist: Optional blacklist function to filter out unimportant files.

+ """

+ data = api.parse_log(logfile, (blacklist or (lambda _: False)))

+ assert len(data) == 1, 'More than one trace was detected!'

+ if 'exception' in data[0]:

+ # It got an exception, raise it.

+ raise data[0]['exception']

+ results = data[0]['results']

+ if root_dir:

+ results = results.strip_root(root_dir)

+ return results

+def CMDclean(args):

+ """Cleans up traces."""

+ parser = OptionParserTraceInputs(command='clean')

+ options, args = parser.parse_args(args)

+ api = get_api()

+ api.clean_trace(options.log)

+ return 0

+def CMDtrace(args):

+ """Traces an executable."""

+ parser = OptionParserTraceInputs(command='trace')

+ parser.allow_interspersed_args = False

+ parser.add_option(

+ '-q', '--quiet', action='store_true',

+ help='Redirects traced executable output to /dev/null')

+ options, args = parser.parse_args(args)

+ if not args:

+ parser.error('Please provide a command to run')

+ if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):

+ args[0] = os.path.abspath(args[0])

+ api = get_api()

+ return trace(options.log, args, os.getcwd(), api, options.quiet)[0]

+def CMDread(args):

+ """Reads the logs and prints the result."""

+ parser = OptionParserTraceInputs(command='read')

+ parser.add_option(

+ '-V', '--variable',

+ nargs=2,

+ action='append',

+ dest='variables',

+ metavar='VAR_NAME directory',

+ default=[],

+ help=('Variables to replace relative directories against. Example: '

+ '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your '

+ 'home dir with $HOME') % getpass.getuser())

+ parser.add_option(

+ '--root-dir',

+ help='Root directory to base everything off it. Anything outside of this '

+ 'this directory will not be reported')

+ parser.add_option(

+ '-j', '--json', action='store_true',

+ help='Outputs raw result data as json')

+ parser.add_option(

+ '-b', '--blacklist', action='append', default=[],

+ help='List of regexp to use as blacklist filter')

+ options, args = parser.parse_args(args)

+ if options.root_dir:

+ options.root_dir = os.path.abspath(options.root_dir)

+ variables = dict(options.variables)

+ api = get_api()

+ def blacklist(f):

+ return any(re.match(b, f) for b in options.blacklist)

+ data = api.parse_log(options.log, blacklist)

+ # Process each trace.

+ output_as_json = []

+ for item in data:

+ if 'exception' in item:

+ print >> sys.stderr, (

+ 'Trace %s: Got an exception: %s' % (item['trace'], item['exception']))

+ continue

+ results = item['results']

+ if options.root_dir:

+ results = results.strip_root(options.root_dir)

+ if options.json:

+ output_as_json.append(results.flatten())

+ else:

+ simplified = extract_directories(

+ options.root_dir, results.files, blacklist)

+ simplified = [f.replace_variables(variables) for f in simplified]

+ if len(data) > 1:

+ print('Trace: %s' % item['trace'])

+ print('Total: %d' % len(results.files))

+ print('Non existent: %d' % len(results.non_existent))

+ for f in results.non_existent:

+ print(' %s' % f.path)

+ print(

+ 'Interesting: %d reduced to %d' % (

+ len(results.existent), len(simplified)))

+ for f in simplified:

+ print(' %s' % f.path)

+ if options.json:

+ write_json(sys.stdout, output_as_json, False)

+ return 0

+class OptionParserWithLogging(optparse.OptionParser):

+ """Adds --verbose option."""

+ def __init__(self, verbose=0, **kwargs):

+ optparse.OptionParser.__init__(self, **kwargs)

+ self.add_option(

+ '-v', '--verbose',

+ action='count',

+ default=verbose,

+ help='Use multiple times to increase verbosity')

+ def parse_args(self, *args, **kwargs):

+ options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)

+ levels = [logging.ERROR, logging.INFO, logging.DEBUG]

+ logging.basicConfig(

+ level=levels[min(len(levels)-1, options.verbose)],

+ format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')

+ return options, args

+class OptionParserWithNiceDescription(OptionParserWithLogging):

+ """Generates the description with the command's docstring."""

+ def __init__(self, **kwargs):

+ """Sets 'description' and 'usage' if not already specified."""

+ command = kwargs.pop('command', 'help')

+ kwargs.setdefault(

+ 'description',

+ re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))

+ kwargs.setdefault('usage', '%%prog %s [options]' % command)

+ OptionParserWithLogging.__init__(self, **kwargs)

+class OptionParserTraceInputs(OptionParserWithNiceDescription):

+ """Adds automatic --log handling."""

+ def __init__(self, **kwargs):

+ OptionParserWithNiceDescription.__init__(self, **kwargs)

+ self.add_option(

+ '-l', '--log', help='Log file to generate or read, required')

+ def parse_args(self, *args, **kwargs):

+ """Makes sure the paths make sense.

+ On Windows, / and \ are often mixed together in a path.

+ """

+ options, args = OptionParserWithNiceDescription.parse_args(

+ self, *args, **kwargs)

+ if not options.log:

+ self.error('Must supply a log file with -l')

+ options.log = os.path.abspath(options.log)

+ return options, args

+def extract_documentation():

+ """Returns a dict {command: description} for each of documented command."""

+ commands = (

+ fn[3:]

+ for fn in dir(sys.modules['__main__'])

+ if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)

+ return dict((fn, get_command_handler(fn).__doc__) for fn in commands)

+def CMDhelp(args):

+ """Prints list of commands or help for a specific command."""

+ doc = extract_documentation()

+ # Calculates the optimal offset.

+ offset = max(len(cmd) for cmd in doc)

+ format_str = ' %-' + str(offset + 2) + 's %s'

+ # Generate a one-liner documentation of each commands.

+ commands_description = '\n'.join(

+ format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))

+ parser = OptionParserWithNiceDescription(

+ usage='%prog <command> [options]',

+ description='Commands are:\n%s\n' % commands_description)

+ parser.format_description = lambda _: parser.description

+ # Strip out any -h or --help argument.

+ _, args = parser.parse_args([i for i in args if not i in ('-h', '--help')])

+ if len(args) == 1:

+ if not get_command_handler(args[0]):

+ parser.error('Unknown command %s' % args[0])

+ # The command was "%prog help command", replaces ourself with

+ # "%prog command --help" so help is correctly printed out.

+ return main(args + ['--help'])

+ elif args:

+ parser.error('Unknown argument "%s"' % ' '.join(args))

+ parser.print_help()

+ return 0

+def get_command_handler(name):

+ """Returns the command handler or CMDhelp if it doesn't exist."""

+ return getattr(sys.modules['__main__'], 'CMD%s' % name, None)

+def main_impl(argv):

+ command = get_command_handler(argv[0] if argv else 'help')

+ if not command:

+ return CMDhelp(argv)

+ return command(argv[1:])

+def main(argv):

+ try:

+ main_impl(argv)

+ except TracingFailure, e:

+ sys.stderr.write('\nError: ')

+ sys.stderr.write(str(e))

+ sys.stderr.write('\n')

+ return 1

+if __name__ == '__main__':

+ sys.exit(main(sys.argv[1:]))

« no previous file with comments | « tests/trace_test_cases_smoke_test.py ('k') | trace_test_cases.py » ('j') | no next file with comments »