| Index: swarm_client/trace_inputs.py
|
| ===================================================================
|
| --- swarm_client/trace_inputs.py (revision 235167)
|
| +++ swarm_client/trace_inputs.py (working copy)
|
| @@ -1,3402 +0,0 @@
|
| -#!/usr/bin/env python
|
| -# coding=utf-8
|
| -# Copyright (c) 2012 The Chromium Authors. All rights reserved.
|
| -# Use of this source code is governed by a BSD-style license that can be
|
| -# found in the LICENSE file.
|
| -
|
| -"""Traces an executable and its child processes and extract the files accessed
|
| -by them.
|
| -
|
| -The implementation uses OS-specific API. The native Kernel logger and the ETL
|
| -interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise.
|
| -The OS-specific implementation is hidden in an 'API' interface.
|
| -
|
| -The results are embedded in a Results instance. The tracing is done in two
|
| -phases, the first is to do the actual trace and generate an
|
| -implementation-specific log file. Then the log file is parsed to extract the
|
| -information, including the individual child processes and the files accessed
|
| -from the log.
|
| -"""
|
| -
|
| -import codecs
|
| -import csv
|
| -import errno
|
| -import getpass
|
| -import glob
|
| -import json
|
| -import logging
|
| -import os
|
| -import re
|
| -import stat
|
| -import subprocess
|
| -import sys
|
| -import tempfile
|
| -import threading
|
| -import time
|
| -import weakref
|
| -
|
| -from third_party import colorama
|
| -from third_party.depot_tools import fix_encoding
|
| -from third_party.depot_tools import subcommand
|
| -
|
| -from utils import file_path
|
| -from utils import tools
|
| -
|
| -## OS-specific imports
|
| -
|
| -if sys.platform == 'win32':
|
| - from ctypes.wintypes import byref, c_int, c_wchar_p
|
| - from ctypes.wintypes import windll # pylint: disable=E0611
|
| -
|
| -
|
| -__version__ = '0.1'
|
| -
|
| -
|
| -BASE_DIR = os.path.dirname(os.path.abspath(__file__))
|
| -ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))
|
| -
|
| -
|
| -class TracingFailure(Exception):
|
| - """An exception occured during tracing."""
|
| - def __init__(self, description, pid, line_number, line, *args):
|
| - super(TracingFailure, self).__init__(
|
| - description, pid, line_number, line, *args)
|
| - self.description = description
|
| - self.pid = pid
|
| - self.line_number = line_number
|
| - self.line = line
|
| - self.extra = args
|
| -
|
| - def __str__(self):
|
| - out = self.description
|
| - if self.pid:
|
| - out += '\npid: %d' % self.pid
|
| - if self.line_number:
|
| - out += '\nline: %d' % self.line_number
|
| - if self.line:
|
| - out += '\n%s' % self.line
|
| - if self.extra:
|
| - out += '\n' + ', '.join(map(str, filter(None, self.extra)))
|
| - return out
|
| -
|
| -
|
| -## OS-specific functions
|
| -
|
| -if sys.platform == 'win32':
|
| - def get_current_encoding():
|
| - """Returns the 'ANSI' code page associated to the process."""
|
| - return 'cp%d' % int(windll.kernel32.GetACP())
|
| -
|
| -
|
| - def CommandLineToArgvW(command_line):
|
| - """Splits a commandline into argv using CommandLineToArgvW()."""
|
| - # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx
|
| - size = c_int()
|
| - assert isinstance(command_line, unicode)
|
| - ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size))
|
| - try:
|
| - return [arg for arg in (c_wchar_p * size.value).from_address(ptr)]
|
| - finally:
|
| - windll.kernel32.LocalFree(ptr)
|
| -
|
| -
|
| -
|
| -def gen_blacklist(regexes):
|
| - """Returns a lambda to be used as a blacklist."""
|
| - compiled = [re.compile(i) for i in regexes]
|
| - def match(f):
|
| - return any(j.match(f) for j in compiled)
|
| - return match
|
| -
|
| -
|
| -def create_subprocess_thunk():
|
| - """Creates a small temporary script to start the child process.
|
| -
|
| - This thunk doesn't block, its unique name is used to identify it as the
|
| - parent.
|
| - """
|
| - handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
|
| - try:
|
| - os.write(
|
| - handle,
|
| - (
|
| - 'import subprocess, sys\n'
|
| - 'sys.exit(subprocess.call(sys.argv[2:]))\n'
|
| - ))
|
| - finally:
|
| - os.close(handle)
|
| - return name
|
| -
|
| -
|
| -def create_exec_thunk():
|
| - """Creates a small temporary script to start the child executable.
|
| -
|
| - Reads from the file handle provided as the fisrt argument to block, then
|
| - execv() the command to be traced.
|
| - """
|
| - handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py')
|
| - try:
|
| - os.write(
|
| - handle,
|
| - (
|
| - 'import os, sys\n'
|
| - 'fd = int(sys.argv[1])\n'
|
| - # This will block until the controlling process writes a byte on the
|
| - # pipe. It will do so once the tracing tool, e.g. strace, is ready to
|
| - # trace.
|
| - 'os.read(fd, 1)\n'
|
| - 'os.close(fd)\n'
|
| - 'os.execv(sys.argv[2], sys.argv[2:])\n'
|
| - ))
|
| - finally:
|
| - os.close(handle)
|
| - return name
|
| -
|
| -
|
| -def strace_process_quoted_arguments(text):
|
| - """Extracts quoted arguments on a string and return the arguments as a list.
|
| -
|
| - Implemented as an automaton. Supports incomplete strings in the form
|
| - '"foo"...'.
|
| -
|
| - Example:
|
| - With text = '"foo", "bar"', the function will return ['foo', 'bar']
|
| -
|
| - TODO(maruel): Implement escaping.
|
| - """
|
| - # All the possible states of the DFA.
|
| - ( NEED_QUOTE, # Begining of a new arguments.
|
| - INSIDE_STRING, # Inside an argument.
|
| - ESCAPED, # Found a '\' inside a quote. Treat the next char as-is.
|
| - NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be
|
| - # a serie of 3 dots or a comma.
|
| - NEED_SPACE, # Right after a comma
|
| - NEED_DOT_2, # Found a dot, need a second one.
|
| - NEED_DOT_3, # Found second dot, need a third one.
|
| - NEED_COMMA, # Found third dot, need a comma.
|
| - ) = range(8)
|
| -
|
| - state = NEED_QUOTE
|
| - out = []
|
| - for index, char in enumerate(text):
|
| - if char == '"':
|
| - if state == NEED_QUOTE:
|
| - state = INSIDE_STRING
|
| - # A new argument was found.
|
| - out.append('')
|
| - elif state == INSIDE_STRING:
|
| - # The argument is now closed.
|
| - state = NEED_COMMA_OR_DOT
|
| - elif state == ESCAPED:
|
| - out[-1] += char
|
| - state = INSIDE_STRING
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - elif char == ',':
|
| - if state in (NEED_COMMA_OR_DOT, NEED_COMMA):
|
| - state = NEED_SPACE
|
| - elif state == INSIDE_STRING:
|
| - out[-1] += char
|
| - elif state == ESCAPED:
|
| - out[-1] += char
|
| - state = INSIDE_STRING
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - elif char == ' ':
|
| - if state == NEED_SPACE:
|
| - state = NEED_QUOTE
|
| - elif state == INSIDE_STRING:
|
| - out[-1] += char
|
| - elif state == ESCAPED:
|
| - out[-1] += char
|
| - state = INSIDE_STRING
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - elif char == '.':
|
| - if state in (NEED_QUOTE, NEED_COMMA_OR_DOT):
|
| - # The string is incomplete, this mean the strace -s flag should be
|
| - # increased.
|
| - # For NEED_QUOTE, the input string would look like '"foo", ...'.
|
| - # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...'
|
| - state = NEED_DOT_2
|
| - elif state == NEED_DOT_2:
|
| - state = NEED_DOT_3
|
| - elif state == NEED_DOT_3:
|
| - state = NEED_COMMA
|
| - elif state == INSIDE_STRING:
|
| - out[-1] += char
|
| - elif state == ESCAPED:
|
| - out[-1] += char
|
| - state = INSIDE_STRING
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - elif char == '\\':
|
| - if state == ESCAPED:
|
| - out[-1] += char
|
| - state = INSIDE_STRING
|
| - elif state == INSIDE_STRING:
|
| - state = ESCAPED
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - else:
|
| - if state == INSIDE_STRING:
|
| - out[-1] += char
|
| - else:
|
| - raise ValueError(
|
| - 'Can\'t process char \'%s\' at column %d for: %r' % (
|
| - char, index, text),
|
| - index,
|
| - text)
|
| - if state not in (NEED_COMMA, NEED_COMMA_OR_DOT):
|
| - raise ValueError(
|
| - 'String is incorrectly terminated: %r' % text,
|
| - text)
|
| - return out
|
| -
|
| -
|
| -def read_json(filepath):
|
| - with open(filepath, 'r') as f:
|
| - return json.load(f)
|
| -
|
| -
|
| -def write_json(filepath_or_handle, data, dense):
|
| - """Writes data into filepath or file handle encoded as json.
|
| -
|
| - If dense is True, the json is packed. Otherwise, it is human readable.
|
| - """
|
| - if hasattr(filepath_or_handle, 'write'):
|
| - if dense:
|
| - filepath_or_handle.write(
|
| - json.dumps(data, sort_keys=True, separators=(',',':')))
|
| - else:
|
| - filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2))
|
| - else:
|
| - with open(filepath_or_handle, 'wb') as f:
|
| - if dense:
|
| - json.dump(data, f, sort_keys=True, separators=(',',':'))
|
| - else:
|
| - json.dump(data, f, sort_keys=True, indent=2)
|
| -
|
| -
|
| -def assert_is_renderable(pseudo_string):
|
| - """Asserts the input is a valid object to be processed by render()."""
|
| - assert (
|
| - pseudo_string is None or
|
| - isinstance(pseudo_string, unicode) or
|
| - hasattr(pseudo_string, 'render')), repr(pseudo_string)
|
| -
|
| -
|
| -def render(pseudo_string):
|
| - """Converts the pseudo-string to an unicode string."""
|
| - if pseudo_string is None or isinstance(pseudo_string, unicode):
|
| - return pseudo_string
|
| - return pseudo_string.render()
|
| -
|
| -
|
| -class Results(object):
|
| - """Results of a trace session."""
|
| -
|
| - class _TouchedObject(object):
|
| - """Something, a file or a directory, that was accessed."""
|
| - def __init__(self, root, path, tainted, size, nb_files):
|
| - logging.debug(
|
| - '%s(%s, %s, %s, %s, %s)' %
|
| - (self.__class__.__name__, root, path, tainted, size, nb_files))
|
| - assert_is_renderable(root)
|
| - assert_is_renderable(path)
|
| - self.root = root
|
| - self.path = path
|
| - self.tainted = tainted
|
| - self.nb_files = nb_files
|
| - # Can be used as a cache or a default value, depending on context. In
|
| - # particular, once self.tainted is True, because the path was replaced
|
| - # with a variable, it is not possible to look up the file size.
|
| - self._size = size
|
| - # These are cache only.
|
| - self._real_path = None
|
| -
|
| - # Check internal consistency.
|
| - assert path, path
|
| - assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path)
|
| - assert tainted or (
|
| - not os.path.exists(self.full_path) or
|
| - (self.full_path == file_path.get_native_path_case(self.full_path))), (
|
| - tainted,
|
| - self.full_path,
|
| - file_path.get_native_path_case(self.full_path))
|
| -
|
| - @property
|
| - def existent(self):
|
| - return self.size != -1
|
| -
|
| - @property
|
| - def full_path(self):
|
| - if self.root:
|
| - return os.path.join(self.root, self.path)
|
| - return self.path
|
| -
|
| - @property
|
| - def real_path(self):
|
| - """Returns the path with symlinks resolved."""
|
| - if not self._real_path:
|
| - self._real_path = os.path.realpath(self.full_path)
|
| - return self._real_path
|
| -
|
| - @property
|
| - def size(self):
|
| - """File's size. -1 is not existent.
|
| -
|
| - Once tainted, it is not possible the retrieve the file size anymore since
|
| - the path is composed of variables.
|
| - """
|
| - if self._size is None and not self.tainted:
|
| - try:
|
| - self._size = os.stat(self.full_path).st_size
|
| - except OSError:
|
| - self._size = -1
|
| - return self._size
|
| -
|
| - def flatten(self):
|
| - """Returns a dict representing this object.
|
| -
|
| - A 'size' of 0 means the file was only touched and not read.
|
| - """
|
| - return {
|
| - 'path': self.path,
|
| - 'size': self.size,
|
| - }
|
| -
|
| - def replace_variables(self, variables):
|
| - """Replaces the root of this File with one of the variables if it matches.
|
| -
|
| - If a variable replacement occurs, the cloned object becomes tainted.
|
| - """
|
| - for variable, root_path in variables.iteritems():
|
| - if self.path.startswith(root_path):
|
| - return self._clone(
|
| - self.root, variable + self.path[len(root_path):], True)
|
| - # No need to clone, returns ourself.
|
| - return self
|
| -
|
| - def strip_root(self, root):
|
| - """Returns a clone of itself with 'root' stripped off.
|
| -
|
| - Note that the file is kept if it is either accessible from a symlinked
|
| - path that was used to access the file or through the real path.
|
| - """
|
| - # Check internal consistency.
|
| - assert (
|
| - self.tainted or
|
| - (file_path.isabs(root) and root.endswith(os.path.sep))), root
|
| - if not self.full_path.startswith(root):
|
| - # Now try to resolve the symlinks to see if it can be reached this way.
|
| - # Only try *after* trying without resolving symlink.
|
| - if not self.real_path.startswith(root):
|
| - return None
|
| - path = self.real_path
|
| - else:
|
| - path = self.full_path
|
| - return self._clone(root, path[len(root):], self.tainted)
|
| -
|
| - def _clone(self, new_root, new_path, tainted):
|
| - raise NotImplementedError(self.__class__.__name__)
|
| -
|
| - class File(_TouchedObject):
|
| - """A file that was accessed. May not be present anymore.
|
| -
|
| - If tainted is true, it means it is not a real path anymore as a variable
|
| - replacement occured.
|
| -
|
| - |mode| can be one of None, TOUCHED, READ or WRITE.
|
| - """
|
| - # Was probed for existence, and it is existent, but was never _opened_.
|
| - TOUCHED = 't'
|
| - # Opened for read only and guaranteed to not have been written to.
|
| - READ = 'r'
|
| - # Opened for write.
|
| - WRITE = 'w'
|
| -
|
| - # They are listed in order of priority. E.g. if a file is traced as TOUCHED
|
| - # then as WRITE, only keep WRITE. None means no idea, which is a problem on
|
| - # Windows.
|
| - ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE)
|
| -
|
| - def __init__(self, root, path, tainted, size, mode):
|
| - assert mode in self.ACCEPTABLE_MODES
|
| - super(Results.File, self).__init__(root, path, tainted, size, 1)
|
| - self.mode = mode
|
| -
|
| - def _clone(self, new_root, new_path, tainted):
|
| - """Clones itself keeping meta-data."""
|
| - # Keep the self.size and self._real_path caches for performance reason. It
|
| - # is also important when the file becomes tainted (with a variable instead
|
| - # of the real path) since self.path is not an on-disk path anymore so
|
| - # out._size cannot be updated.
|
| - out = self.__class__(new_root, new_path, tainted, self.size, self.mode)
|
| - out._real_path = self._real_path
|
| - return out
|
| -
|
| - def flatten(self):
|
| - out = super(Results.File, self).flatten()
|
| - out['mode'] = self.mode
|
| - return out
|
| -
|
| - class Directory(_TouchedObject):
|
| - """A directory of files. Must exist.
|
| -
|
| - For a Directory instance, self.size is not a cache, it's an actual value
|
| - that is never modified and represents the total size of the files contained
|
| - in this directory. It is possible that the directory is empty so that
|
| - size==0; this happens if there's only an invalid symlink in it.
|
| - """
|
| - def __init__(self, root, path, tainted, size, nb_files):
|
| - """path='.' is a valid value and must be handled appropriately."""
|
| - assert not path.endswith(os.path.sep), path
|
| - super(Results.Directory, self).__init__(
|
| - root, path + os.path.sep, tainted, size, nb_files)
|
| -
|
| - def flatten(self):
|
| - out = super(Results.Directory, self).flatten()
|
| - out['nb_files'] = self.nb_files
|
| - return out
|
| -
|
| - def _clone(self, new_root, new_path, tainted):
|
| - """Clones itself keeping meta-data."""
|
| - out = self.__class__(
|
| - new_root,
|
| - new_path.rstrip(os.path.sep),
|
| - tainted,
|
| - self.size,
|
| - self.nb_files)
|
| - out._real_path = self._real_path
|
| - return out
|
| -
|
| - class Process(object):
|
| - """A process that was traced.
|
| -
|
| - Contains references to the files accessed by this process and its children.
|
| - """
|
| - def __init__(self, pid, files, executable, command, initial_cwd, children):
|
| - logging.debug('Process(%s, %d, ...)' % (pid, len(files)))
|
| - self.pid = pid
|
| - self.files = sorted(files, key=lambda x: x.path)
|
| - self.children = children
|
| - self.executable = executable
|
| - self.command = command
|
| - self.initial_cwd = initial_cwd
|
| -
|
| - # Check internal consistency.
|
| - assert len(set(f.path for f in self.files)) == len(self.files), sorted(
|
| - f.path for f in self.files)
|
| - assert isinstance(self.children, list)
|
| - assert isinstance(self.files, list)
|
| -
|
| - @property
|
| - def all(self):
|
| - for child in self.children:
|
| - for i in child.all:
|
| - yield i
|
| - yield self
|
| -
|
| - def flatten(self):
|
| - return {
|
| - 'children': [c.flatten() for c in self.children],
|
| - 'command': self.command,
|
| - 'executable': self.executable,
|
| - 'files': [f.flatten() for f in self.files],
|
| - 'initial_cwd': self.initial_cwd,
|
| - 'pid': self.pid,
|
| - }
|
| -
|
| - def strip_root(self, root):
|
| - assert file_path.isabs(root) and root.endswith(os.path.sep), root
|
| - # Loads the files after since they are constructed as objects.
|
| - out = self.__class__(
|
| - self.pid,
|
| - filter(None, (f.strip_root(root) for f in self.files)),
|
| - self.executable,
|
| - self.command,
|
| - self.initial_cwd,
|
| - [c.strip_root(root) for c in self.children])
|
| - logging.debug(
|
| - 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files)))
|
| - return out
|
| -
|
| - def __init__(self, process):
|
| - self.process = process
|
| - # Cache.
|
| - self._files = None
|
| -
|
| - def flatten(self):
|
| - return {
|
| - 'root': self.process.flatten(),
|
| - }
|
| -
|
| - @property
|
| - def files(self):
|
| - if self._files is None:
|
| - self._files = sorted(
|
| - sum((p.files for p in self.process.all), []),
|
| - key=lambda x: x.path)
|
| - return self._files
|
| -
|
| - @property
|
| - def existent(self):
|
| - return [f for f in self.files if f.existent]
|
| -
|
| - @property
|
| - def non_existent(self):
|
| - return [f for f in self.files if not f.existent]
|
| -
|
| - def strip_root(self, root):
|
| - """Returns a clone with all the files outside the directory |root| removed
|
| - and converts all the path to be relative paths.
|
| -
|
| - It keeps files accessible through the |root| directory or that have been
|
| - accessed through any symlink which points to the same directory.
|
| - """
|
| - # Resolve any symlink
|
| - root = os.path.realpath(root)
|
| - root = (
|
| - file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep)
|
| - logging.debug('strip_root(%s)' % root)
|
| - return Results(self.process.strip_root(root))
|
| -
|
| -
|
| -class ApiBase(object):
|
| - """OS-agnostic API to trace a process and its children."""
|
| - class Context(object):
|
| - """Processes one log line at a time and keeps the list of traced processes.
|
| -
|
| - The parsing is complicated by the fact that logs are traced out of order for
|
| - strace but in-order for dtrace and logman. In addition, on Windows it is
|
| - very frequent that processids are reused so a flat list cannot be used. But
|
| - at the same time, it is impossible to faithfully construct a graph when the
|
| - logs are processed out of order. So both a tree and a flat mapping are used,
|
| - the tree is the real process tree, while the flat mapping stores the last
|
| - valid process for the corresponding processid. For the strace case, the
|
| - tree's head is guessed at the last moment.
|
| - """
|
| - class Process(object):
|
| - """Keeps context for one traced child process.
|
| -
|
| - Logs all the files this process touched. Ignores directories.
|
| - """
|
| - def __init__(self, blacklist, pid, initial_cwd):
|
| - # Check internal consistency.
|
| - assert isinstance(pid, int), repr(pid)
|
| - assert_is_renderable(initial_cwd)
|
| - self.pid = pid
|
| - # children are Process instances.
|
| - self.children = []
|
| - self.initial_cwd = initial_cwd
|
| - self.cwd = None
|
| - self.files = {}
|
| - self.executable = None
|
| - self.command = None
|
| - self._blacklist = blacklist
|
| -
|
| - def to_results_process(self):
|
| - """Resolves file case sensitivity and or late-bound strings."""
|
| - # When resolving files, it's normal to get dupe because a file could be
|
| - # opened multiple times with different case. Resolve the deduplication
|
| - # here.
|
| - def fix_path(x):
|
| - """Returns the native file path case.
|
| -
|
| - Converts late-bound strings.
|
| - """
|
| - if not x:
|
| - # Do not convert None instance to 'None'.
|
| - return x
|
| - x = render(x)
|
| - if os.path.isabs(x):
|
| - # If the path is not absolute, which tends to happen occasionally on
|
| - # Windows, it is not possible to get the native path case so ignore
|
| - # that trace. It mostly happens for 'executable' value.
|
| - x = file_path.get_native_path_case(x)
|
| - return x
|
| -
|
| - def fix_and_blacklist_path(x, m):
|
| - """Receives a tuple (filepath, mode) and processes filepath."""
|
| - x = fix_path(x)
|
| - if not x:
|
| - return
|
| - # The blacklist needs to be reapplied, since path casing could
|
| - # influence blacklisting.
|
| - if self._blacklist(x):
|
| - return
|
| - # Filters out directories. Some may have passed through.
|
| - if os.path.isdir(x):
|
| - return
|
| - return x, m
|
| -
|
| - # Renders all the files as strings, as some could be RelativePath
|
| - # instances. It is important to do it first since there could still be
|
| - # multiple entries with the same path but different modes.
|
| - rendered = (
|
| - fix_and_blacklist_path(f, m) for f, m in self.files.iteritems())
|
| - files = sorted(
|
| - (f for f in rendered if f),
|
| - key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1])))
|
| - # Then converting into a dict will automatically clean up lesser
|
| - # important values.
|
| - files = [
|
| - Results.File(None, f, False, None, m)
|
| - for f, m in dict(files).iteritems()
|
| - ]
|
| - return Results.Process(
|
| - self.pid,
|
| - files,
|
| - fix_path(self.executable),
|
| - self.command,
|
| - fix_path(self.initial_cwd),
|
| - [c.to_results_process() for c in self.children])
|
| -
|
| - def add_file(self, filepath, mode):
|
| - """Adds a file if it passes the blacklist."""
|
| - if self._blacklist(render(filepath)):
|
| - return
|
| - logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode)
|
| - # Note that filepath and not render(filepath) is added. It is because
|
| - # filepath could be something else than a string, like a RelativePath
|
| - # instance for dtrace logs.
|
| - modes = Results.File.ACCEPTABLE_MODES
|
| - old_mode = self.files.setdefault(filepath, mode)
|
| - if old_mode != mode and modes.index(old_mode) < modes.index(mode):
|
| - # Take the highest value.
|
| - self.files[filepath] = mode
|
| -
|
| - def __init__(self, blacklist):
|
| - self.blacklist = blacklist
|
| - # Initial process.
|
| - self.root_process = None
|
| - # dict to accelerate process lookup, to not have to lookup the whole graph
|
| - # each time.
|
| - self._process_lookup = {}
|
| -
|
| - class Tracer(object):
|
| - """During it's lifetime, the tracing subsystem is enabled."""
|
| - def __init__(self, logname):
|
| - self._logname = logname
|
| - self._lock = threading.RLock()
|
| - self._traces = []
|
| - self._initialized = True
|
| - self._scripts_to_cleanup = []
|
| -
|
| - def trace(self, cmd, cwd, tracename, output):
|
| - """Runs the OS-specific trace program on an executable.
|
| -
|
| - Arguments:
|
| - - cmd: The command (a list) to run.
|
| - - cwd: Current directory to start the child process in.
|
| - - tracename: Name of the trace in the logname file.
|
| - - output: If False, redirects output to PIPEs.
|
| -
|
| - Returns a tuple (resultcode, output) and updates the internal trace
|
| - entries.
|
| - """
|
| - # The implementation adds an item to self._traces.
|
| - raise NotImplementedError(self.__class__.__name__)
|
| -
|
| - def close(self, _timeout=None):
|
| - """Saves the meta-data in the logname file.
|
| -
|
| - For kernel-based tracing, stops the tracing subsystem.
|
| -
|
| - Must not be used manually when using 'with' construct.
|
| - """
|
| - with self._lock:
|
| - if not self._initialized:
|
| - raise TracingFailure(
|
| - 'Called %s.close() on an unitialized object' %
|
| - self.__class__.__name__,
|
| - None, None, None)
|
| - try:
|
| - while self._scripts_to_cleanup:
|
| - try:
|
| - os.remove(self._scripts_to_cleanup.pop())
|
| - except OSError as e:
|
| - logging.error('Failed to delete a temporary script: %s', e)
|
| - write_json(self._logname, self._gen_logdata(), False)
|
| - finally:
|
| - self._initialized = False
|
| -
|
| - def post_process_log(self):
|
| - """Post-processes the log so it becomes faster to load afterward.
|
| -
|
| - Must not be used manually when using 'with' construct.
|
| - """
|
| - assert not self._initialized, 'Must stop tracing first.'
|
| -
|
| - def _gen_logdata(self):
|
| - """Returns the data to be saved in the trace file."""
|
| - return {
|
| - 'traces': self._traces,
|
| - }
|
| -
|
| - def __enter__(self):
|
| - """Enables 'with' statement."""
|
| - return self
|
| -
|
| - def __exit__(self, exc_type, exc_value, traceback):
|
| - """Enables 'with' statement."""
|
| - self.close()
|
| - # If an exception was thrown, do not process logs.
|
| - if not exc_type:
|
| - self.post_process_log()
|
| -
|
| - def get_tracer(self, logname):
|
| - """Returns an ApiBase.Tracer instance.
|
| -
|
| - Initializes the tracing subsystem, which is a requirement for kernel-based
|
| - tracers. Only one tracer instance should be live at a time!
|
| -
|
| - logname is the filepath to the json file that will contain the meta-data
|
| - about the logs.
|
| - """
|
| - return self.Tracer(logname)
|
| -
|
| - @staticmethod
|
| - def clean_trace(logname):
|
| - """Deletes an old log."""
|
| - raise NotImplementedError()
|
| -
|
| - @classmethod
|
| - def parse_log(cls, logname, blacklist, trace_name):
|
| - """Processes trace logs and returns the files opened and the files that do
|
| - not exist.
|
| -
|
| - It does not track directories.
|
| -
|
| - Arguments:
|
| - - logname: must be an absolute path.
|
| - - blacklist: must be a lambda.
|
| - - trace_name: optional trace to read, defaults to reading all traces.
|
| -
|
| - Most of the time, files that do not exist are temporary test files that
|
| - should be put in /tmp instead. See http://crbug.com/116251.
|
| -
|
| - Returns a list of dict with keys:
|
| - - results: A Results instance.
|
| - - trace: The corresponding tracename parameter provided to
|
| - get_tracer().trace().
|
| - - output: Output gathered during execution, if get_tracer().trace(...,
|
| - output=False) was used.
|
| - """
|
| - raise NotImplementedError(cls.__class__.__name__)
|
| -
|
| -
|
| -class Strace(ApiBase):
|
| - """strace implies linux."""
|
| - @staticmethod
|
| - def load_filename(filename):
|
| - """Parses a filename in a log."""
|
| - # TODO(maruel): Be compatible with strace -x.
|
| - assert isinstance(filename, str)
|
| - out = ''
|
| - i = 0
|
| - while i < len(filename):
|
| - c = filename[i]
|
| - if c == '\\':
|
| - out += chr(int(filename[i+1:i+4], 8))
|
| - i += 4
|
| - else:
|
| - out += c
|
| - i += 1
|
| - # TODO(maruel): That's not necessarily true that the current code page is
|
| - # utf-8.
|
| - return out.decode('utf-8')
|
| -
|
| - class Context(ApiBase.Context):
|
| - """Processes a strace log line and keeps the list of existent and non
|
| - existent files accessed.
|
| -
|
| - Ignores directories.
|
| -
|
| - Uses late-binding to processes the cwd of each process. The problem is that
|
| - strace generates one log file per process it traced but doesn't give any
|
| - information about which process was started when and by who. So process the
|
| - logs out of order and use late binding with RelativePath to be able to
|
| - deduce the initial directory of each process once all the logs are parsed.
|
| -
|
| - TODO(maruel): Use the script even in the non-sudo case, so log parsing can
|
| - be done in two phase: first find the root process, then process the child
|
| - processes in order. With that, it should be possible to not use RelativePath
|
| - anymore. This would significantly simplify the code!
|
| - """
|
| - class Process(ApiBase.Context.Process):
|
| - """Represents the state of a process.
|
| -
|
| - Contains all the information retrieved from the pid-specific log.
|
| - """
|
| - # Function names are using ([a-z_0-9]+)
|
| - # This is the most common format. function(args) = result
|
| - RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$')
|
| - # An interrupted function call, only grab the minimal header.
|
| - RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$')
|
| - # A resumed function call.
|
| - RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$')
|
| - # A process received a signal.
|
| - RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---')
|
| - # A process didn't handle a signal. Ignore any junk appearing before,
|
| - # because the process was forcibly killed so it won't open any new file.
|
| - RE_KILLED = re.compile(
|
| - r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$')
|
| - # The process has exited.
|
| - RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+')
|
| - # A call was canceled. Ignore any prefix.
|
| - RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$')
|
| - # Happens when strace fails to even get the function name.
|
| - UNNAMED_FUNCTION = '????'
|
| -
|
| - # Corner-case in python, a class member function decorator must not be
|
| - # @staticmethod.
|
| - def parse_args(regexp, expect_zero): # pylint: disable=E0213
|
| - """Automatically convert the str 'args' into a list of processed
|
| - arguments.
|
| -
|
| - Arguments:
|
| - - regexp is used to parse args.
|
| - - expect_zero: one of True, False or None.
|
| - - True: will check for result.startswith('0') first and will ignore
|
| - the trace line completely otherwise. This is important because for
|
| - many functions, the regexp will not process if the call failed.
|
| - - False: will check for not result.startswith(('?', '-1')) for the
|
| - same reason than with True.
|
| - - None: ignore result.
|
| - """
|
| - def meta_hook(function):
|
| - assert function.__name__.startswith('handle_')
|
| - def hook(self, args, result):
|
| - if expect_zero is True and not result.startswith('0'):
|
| - return
|
| - if expect_zero is False and result.startswith(('?', '-1')):
|
| - return
|
| - match = re.match(regexp, args)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Failed to parse %s(%s) = %s' %
|
| - (function.__name__[len('handle_'):], args, result),
|
| - None, None, None)
|
| - return function(self, match.groups(), result)
|
| - return hook
|
| - return meta_hook
|
| -
|
| - class RelativePath(object):
|
| - """A late-bound relative path."""
|
| - def __init__(self, parent, value):
|
| - assert_is_renderable(parent)
|
| - self.parent = parent
|
| - assert (
|
| - value is None or
|
| - (isinstance(value, unicode) and not os.path.isabs(value)))
|
| - self.value = value
|
| - if self.value:
|
| - # TODO(maruel): On POSIX, '\\' is a valid character so remove this
|
| - # assert.
|
| - assert '\\' not in self.value, value
|
| - assert '\\' not in self.value, (repr(value), repr(self.value))
|
| -
|
| - def render(self):
|
| - """Returns the current directory this instance is representing.
|
| -
|
| - This function is used to return the late-bound value.
|
| - """
|
| - assert self.parent is not None
|
| - parent = render(self.parent)
|
| - if self.value:
|
| - return os.path.normpath(os.path.join(parent, self.value))
|
| - return parent
|
| -
|
| - def __init__(self, root, pid):
|
| - """Keeps enough information to be able to guess the original process
|
| - root.
|
| -
|
| - strace doesn't store which process was the initial process. So more
|
| - information needs to be kept so the graph can be reconstructed from the
|
| - flat map.
|
| - """
|
| - logging.info('%s(%d)' % (self.__class__.__name__, pid))
|
| - super(Strace.Context.Process, self).__init__(root.blacklist, pid, None)
|
| - assert isinstance(root, ApiBase.Context)
|
| - self._root = weakref.ref(root)
|
| - # The dict key is the function name of the pending call, like 'open'
|
| - # or 'execve'.
|
| - self._pending_calls = {}
|
| - self._line_number = 0
|
| - # Current directory when the process started.
|
| - if isinstance(self._root(), unicode):
|
| - self.initial_cwd = self._root()
|
| - else:
|
| - self.initial_cwd = self.RelativePath(self._root(), None)
|
| - self.parentid = None
|
| - self._done = False
|
| -
|
| - def get_cwd(self):
|
| - """Returns the best known value of cwd."""
|
| - return self.cwd or self.initial_cwd
|
| -
|
| - def render(self):
|
| - """Returns the string value of the RelativePath() object.
|
| -
|
| - Used by RelativePath. Returns the initial directory and not the
|
| - current one since the current directory 'cwd' validity is time-limited.
|
| -
|
| - The validity is only guaranteed once all the logs are processed.
|
| - """
|
| - return self.initial_cwd.render()
|
| -
|
| - def on_line(self, line):
|
| - assert isinstance(line, str)
|
| - self._line_number += 1
|
| - try:
|
| - if self._done:
|
| - raise TracingFailure(
|
| - 'Found a trace for a terminated process or corrupted log',
|
| - None, None, None)
|
| -
|
| - if self.RE_SIGNAL.match(line):
|
| - # Ignore signals.
|
| - return
|
| -
|
| - match = self.RE_KILLED.match(line)
|
| - if match:
|
| - # Converts a '+++ killed by Foo +++' trace into an exit_group().
|
| - self.handle_exit_group(match.group(1), None)
|
| - return
|
| -
|
| - match = self.RE_PROCESS_EXITED.match(line)
|
| - if match:
|
| - # Converts a '+++ exited with 1 +++' trace into an exit_group()
|
| - self.handle_exit_group(match.group(1), None)
|
| - return
|
| -
|
| - match = self.RE_UNFINISHED.match(line)
|
| - if match:
|
| - if match.group(1) in self._pending_calls:
|
| - raise TracingFailure(
|
| - 'Found two unfinished calls for the same function',
|
| - None, None, None,
|
| - self._pending_calls)
|
| - self._pending_calls[match.group(1)] = (
|
| - match.group(1) + match.group(2))
|
| - return
|
| -
|
| - match = self.RE_UNAVAILABLE.match(line)
|
| - if match:
|
| - # This usually means a process was killed and a pending call was
|
| - # canceled.
|
| - # TODO(maruel): Look up the last exit_group() trace just above and
|
| - # make sure any self._pending_calls[anything] is properly flushed.
|
| - return
|
| -
|
| - match = self.RE_RESUMED.match(line)
|
| - if match:
|
| - if match.group(1) not in self._pending_calls:
|
| - raise TracingFailure(
|
| - 'Found a resumed call that was not logged as unfinished',
|
| - None, None, None,
|
| - self._pending_calls)
|
| - pending = self._pending_calls.pop(match.group(1))
|
| - # Reconstruct the line.
|
| - line = pending + match.group(2)
|
| -
|
| - match = self.RE_HEADER.match(line)
|
| - if not match:
|
| - # The line is corrupted. It happens occasionally when a process is
|
| - # killed forcibly with activity going on. Assume the process died.
|
| - # No other line can be processed afterward.
|
| - logging.debug('%d is done: %s', self.pid, line)
|
| - self._done = True
|
| - return
|
| -
|
| - if match.group(1) == self.UNNAMED_FUNCTION:
|
| - return
|
| -
|
| - # It's a valid line, handle it.
|
| - handler = getattr(self, 'handle_%s' % match.group(1), None)
|
| - if not handler:
|
| - self._handle_unknown(match.group(1), match.group(2), match.group(3))
|
| - return handler(match.group(2), match.group(3))
|
| - except TracingFailure, e:
|
| - # Hack in the values since the handler could be a static function.
|
| - e.pid = self.pid
|
| - e.line = line
|
| - e.line_number = self._line_number
|
| - # Re-raise the modified exception.
|
| - raise
|
| - except (KeyError, NotImplementedError, ValueError), e:
|
| - raise TracingFailure(
|
| - 'Trace generated a %s exception: %s' % (
|
| - e.__class__.__name__, str(e)),
|
| - self.pid,
|
| - self._line_number,
|
| - line,
|
| - e)
|
| -
|
| - @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True)
|
| - def handle_access(self, args, _result):
|
| - self._handle_file(args[0], Results.File.TOUCHED)
|
| -
|
| - @parse_args(r'^\"(.+?)\"$', True)
|
| - def handle_chdir(self, args, _result):
|
| - """Updates cwd."""
|
| - self.cwd = self._mangle(args[0])
|
| - logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
|
| -
|
| - @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False)
|
| - def handle_chown(self, args, _result):
|
| - # TODO(maruel): Look at result?
|
| - self._handle_file(args[0], Results.File.WRITE)
|
| -
|
| - def handle_clone(self, _args, result):
|
| - self._handling_forking('clone', result)
|
| -
|
| - def handle_close(self, _args, _result):
|
| - pass
|
| -
|
| - @parse_args(r'^\"(.+?)\", (\d+)$', False)
|
| - def handle_chmod(self, args, _result):
|
| - self._handle_file(args[0], Results.File.WRITE)
|
| -
|
| - @parse_args(r'^\"(.+?)\", (\d+)$', False)
|
| - def handle_creat(self, args, _result):
|
| - self._handle_file(args[0], Results.File.WRITE)
|
| -
|
| - @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True)
|
| - def handle_execve(self, args, _result):
|
| - # Even if in practice execve() doesn't returns when it succeeds, strace
|
| - # still prints '0' as the result.
|
| - filepath = args[0]
|
| - self._handle_file(filepath, Results.File.READ)
|
| - self.executable = self._mangle(filepath)
|
| - try:
|
| - self.command = strace_process_quoted_arguments(args[1])
|
| - except ValueError as e:
|
| - raise TracingFailure(
|
| - 'Failed to process command line argument:\n%s' % e.args[0],
|
| - None, None, None)
|
| -
|
| - def handle_exit_group(self, _args, _result):
|
| - """Removes cwd."""
|
| - self.cwd = None
|
| -
|
| - @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True)
|
| - def handle_faccessat(self, args, _results):
|
| - if args[0] == 'AT_FDCWD':
|
| - self._handle_file(args[1], Results.File.TOUCHED)
|
| - else:
|
| - raise Exception('Relative faccess not implemented.')
|
| -
|
| - def handle_fallocate(self, _args, result):
|
| - pass
|
| -
|
| - def handle_fork(self, args, result):
|
| - self._handle_unknown('fork', args, result)
|
| -
|
| - def handle_futex(self, _args, _result):
|
| - pass
|
| -
|
| - @parse_args(r'^\"(.+?)\", (\d+)$', False)
|
| - def handle_getcwd(self, args, _result):
|
| - if os.path.isabs(args[0]):
|
| - logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd))
|
| - if not isinstance(self.cwd, unicode):
|
| - # Take the occasion to reset the path.
|
| - self.cwd = self._mangle(args[0])
|
| - else:
|
| - # It should always match.
|
| - assert self.cwd == Strace.load_filename(args[0]), (
|
| - self.cwd, args[0])
|
| -
|
| - @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
|
| - def handle_link(self, args, _result):
|
| - self._handle_file(args[0], Results.File.READ)
|
| - self._handle_file(args[1], Results.File.WRITE)
|
| -
|
| - @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
|
| - def handle_lstat(self, args, _result):
|
| - self._handle_file(args[0], Results.File.TOUCHED)
|
| -
|
| - def handle_mkdir(self, _args, _result):
|
| - # We track content, not directories.
|
| - pass
|
| -
|
| - @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
|
| - def handle_open(self, args, _result):
|
| - if 'O_DIRECTORY' in args[1]:
|
| - return
|
| - self._handle_file(
|
| - args[0],
|
| - Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE)
|
| -
|
| - @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False)
|
| - def handle_openat(self, args, _result):
|
| - if 'O_DIRECTORY' in args[2]:
|
| - return
|
| - if args[0] == 'AT_FDCWD':
|
| - self._handle_file(
|
| - args[1],
|
| - Results.File.READ if 'O_RDONLY' in args[2]
|
| - else Results.File.WRITE)
|
| - else:
|
| - # TODO(maruel): Implement relative open if necessary instead of the
|
| - # AT_FDCWD flag, let's hope not since this means tracking all active
|
| - # directory handles.
|
| - raise NotImplementedError('Relative open via openat not implemented.')
|
| -
|
| - @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False)
|
| - def handle_readlink(self, args, _result):
|
| - self._handle_file(args[0], Results.File.READ)
|
| -
|
| - @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
|
| - def handle_rename(self, args, _result):
|
| - self._handle_file(args[0], Results.File.READ)
|
| - self._handle_file(args[1], Results.File.WRITE)
|
| -
|
| - def handle_rmdir(self, _args, _result):
|
| - # We track content, not directories.
|
| - pass
|
| -
|
| - def handle_setxattr(self, _args, _result):
|
| - # TODO(maruel): self._handle_file(args[0], Results.File.WRITE)
|
| - pass
|
| -
|
| - @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True)
|
| - def handle_stat(self, args, _result):
|
| - self._handle_file(args[0], Results.File.TOUCHED)
|
| -
|
| - @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True)
|
| - def handle_symlink(self, args, _result):
|
| - self._handle_file(args[0], Results.File.TOUCHED)
|
| - self._handle_file(args[1], Results.File.WRITE)
|
| -
|
| - @parse_args(r'^\"(.+?)\", \d+', True)
|
| - def handle_truncate(self, args, _result):
|
| - self._handle_file(args[0], Results.File.WRITE)
|
| -
|
| - def handle_unlink(self, _args, _result):
|
| - # In theory, the file had to be created anyway.
|
| - pass
|
| -
|
| - def handle_unlinkat(self, _args, _result):
|
| - # In theory, the file had to be created anyway.
|
| - pass
|
| -
|
| - def handle_statfs(self, _args, _result):
|
| - pass
|
| -
|
| - def handle_utimensat(self, _args, _result):
|
| - pass
|
| -
|
| - def handle_vfork(self, _args, result):
|
| - self._handling_forking('vfork', result)
|
| -
|
| - @staticmethod
|
| - def _handle_unknown(function, args, result):
|
| - raise TracingFailure(
|
| - 'Unexpected/unimplemented trace %s(%s)= %s' %
|
| - (function, args, result),
|
| - None, None, None)
|
| -
|
| - def _handling_forking(self, name, result):
|
| - """Transfers cwd."""
|
| - if result.startswith(('?', '-1')):
|
| - # The call failed.
|
| - return
|
| - # Update the other process right away.
|
| - childpid = int(result)
|
| - child = self._root().get_or_set_proc(childpid)
|
| - if child.parentid is not None or childpid in self.children:
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in process lifetime detection '
|
| - 'during a %s() call' % name,
|
| - None, None, None)
|
| -
|
| - # Copy the cwd object.
|
| - child.initial_cwd = self.get_cwd()
|
| - child.parentid = self.pid
|
| - # It is necessary because the logs are processed out of order.
|
| - self.children.append(child)
|
| -
|
| - def _handle_file(self, filepath, mode):
|
| - filepath = self._mangle(filepath)
|
| - self.add_file(filepath, mode)
|
| -
|
| - def _mangle(self, filepath):
|
| - """Decodes a filepath found in the log and convert it to a late-bound
|
| - path if necessary.
|
| -
|
| - |filepath| is an strace 'encoded' string and the returned value is
|
| - either an unicode string if the path was absolute or a late bound path
|
| - otherwise.
|
| - """
|
| - filepath = Strace.load_filename(filepath)
|
| - if os.path.isabs(filepath):
|
| - return filepath
|
| - else:
|
| - if isinstance(self.get_cwd(), unicode):
|
| - return os.path.normpath(os.path.join(self.get_cwd(), filepath))
|
| - return self.RelativePath(self.get_cwd(), filepath)
|
| -
|
| - def __init__(self, blacklist, root_pid, initial_cwd):
|
| - """|root_pid| may be None when the root process is not known.
|
| -
|
| - In that case, a search is done after reading all the logs to figure out
|
| - the root process.
|
| - """
|
| - super(Strace.Context, self).__init__(blacklist)
|
| - assert_is_renderable(initial_cwd)
|
| - self.root_pid = root_pid
|
| - self.initial_cwd = initial_cwd
|
| -
|
| - def render(self):
|
| - """Returns the string value of the initial cwd of the root process.
|
| -
|
| - Used by RelativePath.
|
| - """
|
| - return self.initial_cwd
|
| -
|
| - def on_line(self, pid, line):
|
| - """Transfers control into the Process.on_line() function."""
|
| - self.get_or_set_proc(pid).on_line(line.strip())
|
| -
|
| - def to_results(self):
|
| - """If necessary, finds back the root process and verify consistency."""
|
| - if not self.root_pid:
|
| - # The non-sudo case. The traced process was started by strace itself,
|
| - # so the pid of the traced process is not known.
|
| - root = [p for p in self._process_lookup.itervalues() if not p.parentid]
|
| - if len(root) == 1:
|
| - self.root_process = root[0]
|
| - # Save it for later.
|
| - self.root_pid = self.root_process.pid
|
| - else:
|
| - # The sudo case. The traced process was started manually so its pid is
|
| - # known.
|
| - self.root_process = self._process_lookup.get(self.root_pid)
|
| - if not self.root_process:
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in process lifetime detection '
|
| - 'while finding the root process',
|
| - None,
|
| - None,
|
| - None,
|
| - self.root_pid,
|
| - sorted(self._process_lookup))
|
| - process = self.root_process.to_results_process()
|
| - if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in process lifetime detection '
|
| - 'while looking for len(tree) == len(list)',
|
| - None,
|
| - None,
|
| - None,
|
| - sorted(self._process_lookup),
|
| - sorted(p.pid for p in process.all))
|
| - return Results(process)
|
| -
|
| - def get_or_set_proc(self, pid):
|
| - """Returns the Context.Process instance for this pid or creates a new one.
|
| - """
|
| - if not pid or not isinstance(pid, int):
|
| - raise TracingFailure(
|
| - 'Unpexpected value for pid: %r' % pid,
|
| - pid,
|
| - None,
|
| - None,
|
| - pid)
|
| - if pid not in self._process_lookup:
|
| - self._process_lookup[pid] = self.Process(self, pid)
|
| - return self._process_lookup[pid]
|
| -
|
| - @classmethod
|
| - def traces(cls):
|
| - """Returns the list of all handled traces to pass this as an argument to
|
| - strace.
|
| - """
|
| - prefix = 'handle_'
|
| - return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)]
|
| -
|
| - class Tracer(ApiBase.Tracer):
|
| - MAX_LEN = 256
|
| -
|
| - def __init__(self, logname, use_sudo):
|
| - super(Strace.Tracer, self).__init__(logname)
|
| - self.use_sudo = use_sudo
|
| - if use_sudo:
|
| - # TODO(maruel): Use the jump script systematically to make it easy to
|
| - # figure out the root process, so RelativePath is not necessary anymore.
|
| - self._child_script = create_exec_thunk()
|
| - self._scripts_to_cleanup.append(self._child_script)
|
| -
|
| - def trace(self, cmd, cwd, tracename, output):
|
| - """Runs strace on an executable.
|
| -
|
| - When use_sudo=True, it is a 3-phases process: start the thunk, start
|
| - sudo strace with the pid of the thunk and then have the thunk os.execve()
|
| - the process to trace.
|
| - """
|
| - logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
|
| - assert os.path.isabs(cmd[0]), cmd[0]
|
| - assert os.path.isabs(cwd), cwd
|
| - assert os.path.normpath(cwd) == cwd, cwd
|
| - with self._lock:
|
| - if not self._initialized:
|
| - raise TracingFailure(
|
| - 'Called Tracer.trace() on an unitialized object',
|
| - None, None, None, tracename)
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| - stdout = stderr = None
|
| - if output:
|
| - stdout = subprocess.PIPE
|
| - stderr = subprocess.STDOUT
|
| -
|
| - # Ensure all file related APIs are hooked.
|
| - traces = ','.join(Strace.Context.traces() + ['file'])
|
| - flags = [
|
| - # Each child process has its own trace file. It is necessary because
|
| - # strace may generate corrupted log file if multiple processes are
|
| - # heavily doing syscalls simultaneously.
|
| - '-ff',
|
| - # Reduce whitespace usage.
|
| - '-a1',
|
| - # hex encode non-ascii strings.
|
| - # TODO(maruel): '-x',
|
| - # TODO(maruel): '-ttt',
|
| - # Signals are unnecessary noise here. Note the parser can cope with them
|
| - # but reduce the unnecessary output.
|
| - '-esignal=none',
|
| - # Print as much data as wanted.
|
| - '-s', '%d' % self.MAX_LEN,
|
| - '-e', 'trace=%s' % traces,
|
| - '-o', self._logname + '.' + tracename,
|
| - ]
|
| -
|
| - if self.use_sudo:
|
| - pipe_r, pipe_w = os.pipe()
|
| - # Start the child process paused.
|
| - target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd
|
| - logging.debug(' '.join(target_cmd))
|
| - child_proc = subprocess.Popen(
|
| - target_cmd,
|
| - stdin=subprocess.PIPE,
|
| - stdout=stdout,
|
| - stderr=stderr,
|
| - cwd=cwd)
|
| -
|
| - # TODO(maruel): both processes must use the same UID for it to work
|
| - # without sudo. Look why -p is failing at the moment without sudo.
|
| - trace_cmd = [
|
| - 'sudo',
|
| - 'strace',
|
| - '-p', str(child_proc.pid),
|
| - ] + flags
|
| - logging.debug(' '.join(trace_cmd))
|
| - strace_proc = subprocess.Popen(
|
| - trace_cmd,
|
| - cwd=cwd,
|
| - stdin=subprocess.PIPE,
|
| - stdout=subprocess.PIPE,
|
| - stderr=subprocess.PIPE)
|
| -
|
| - line = strace_proc.stderr.readline()
|
| - if not re.match(r'^Process \d+ attached \- interrupt to quit$', line):
|
| - # TODO(maruel): Raise an exception.
|
| - assert False, line
|
| -
|
| - # Now fire the child process.
|
| - os.write(pipe_w, 'x')
|
| -
|
| - out = child_proc.communicate()[0]
|
| - strace_out = strace_proc.communicate()[0]
|
| -
|
| - # TODO(maruel): if strace_proc.returncode: Add an exception.
|
| - saved_out = strace_out if strace_proc.returncode else out
|
| - root_pid = child_proc.pid
|
| - else:
|
| - # Non-sudo case.
|
| - trace_cmd = [
|
| - 'strace',
|
| - ] + flags + cmd
|
| - logging.debug(' '.join(trace_cmd))
|
| - child_proc = subprocess.Popen(
|
| - trace_cmd,
|
| - cwd=cwd,
|
| - stdin=subprocess.PIPE,
|
| - stdout=stdout,
|
| - stderr=stderr)
|
| - out = child_proc.communicate()[0]
|
| - # TODO(maruel): Walk the logs and figure out the root process would
|
| - # simplify parsing the logs a *lot*.
|
| - saved_out = out
|
| - # The trace reader will have to figure out.
|
| - root_pid = None
|
| -
|
| - with self._lock:
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| - self._traces.append(
|
| - {
|
| - 'cmd': cmd,
|
| - 'cwd': cwd,
|
| - 'output': saved_out,
|
| - 'pid': root_pid,
|
| - 'trace': tracename,
|
| - })
|
| - return child_proc.returncode, out
|
| -
|
| - def __init__(self, use_sudo=None):
|
| - super(Strace, self).__init__()
|
| - self.use_sudo = use_sudo
|
| -
|
| - def get_tracer(self, logname):
|
| - return self.Tracer(logname, self.use_sudo)
|
| -
|
| - @staticmethod
|
| - def clean_trace(logname):
|
| - if os.path.isfile(logname):
|
| - os.remove(logname)
|
| - # Also delete any pid specific file from previous traces.
|
| - for i in glob.iglob(logname + '.*'):
|
| - if i.rsplit('.', 1)[1].isdigit():
|
| - os.remove(i)
|
| -
|
| - @classmethod
|
| - def parse_log(cls, logname, blacklist, trace_name):
|
| - logging.info('parse_log(%s, ..., %s)', logname, trace_name)
|
| - assert os.path.isabs(logname)
|
| - data = read_json(logname)
|
| - out = []
|
| - for item in data['traces']:
|
| - if trace_name and item['trace'] != trace_name:
|
| - continue
|
| - result = {
|
| - 'output': item['output'],
|
| - 'trace': item['trace'],
|
| - }
|
| - try:
|
| - context = cls.Context(blacklist, item['pid'], item['cwd'])
|
| - for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])):
|
| - logging.debug('Reading %s', pidfile)
|
| - pid = pidfile.rsplit('.', 1)[1]
|
| - if pid.isdigit():
|
| - pid = int(pid)
|
| - found_line = False
|
| - for line in open(pidfile, 'rb'):
|
| - context.on_line(pid, line)
|
| - found_line = True
|
| - if not found_line:
|
| - # Ensures that a completely empty trace still creates the
|
| - # corresponding Process instance by logging a dummy line.
|
| - context.on_line(pid, '')
|
| - else:
|
| - logging.warning('Found unexpected file %s', pidfile)
|
| - result['results'] = context.to_results()
|
| - except TracingFailure:
|
| - result['exception'] = sys.exc_info()
|
| - out.append(result)
|
| - return out
|
| -
|
| -
|
| -class Dtrace(ApiBase):
|
| - """Uses DTrace framework through dtrace. Requires root access.
|
| -
|
| - Implies Mac OSX.
|
| -
|
| - dtruss can't be used because it has compatibility issues with python.
|
| -
|
| - Also, the pid->cwd handling needs to be done manually since OSX has no way to
|
| - get the absolute path of the 'cwd' dtrace variable from the probe.
|
| -
|
| - Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
|
| - https://discussions.apple.com/thread/1980539. So resort to handling execve()
|
| - manually.
|
| -
|
| - errno is not printed in the log since this implementation currently only cares
|
| - about files that were successfully opened.
|
| - """
|
| - class Context(ApiBase.Context):
|
| - # Format: index pid function(args)
|
| - RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$')
|
| -
|
| - # Arguments parsing.
|
| - RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$')
|
| - RE_CHDIR = re.compile(r'^\"(.+?)\"$')
|
| - RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$')
|
| - RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$')
|
| - RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$')
|
| - RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')
|
| -
|
| - # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows.
|
| - O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None
|
| - O_RDWR = os.O_RDWR
|
| - O_WRONLY = os.O_WRONLY
|
| -
|
| - class Process(ApiBase.Context.Process):
|
| - def __init__(self, *args):
|
| - super(Dtrace.Context.Process, self).__init__(*args)
|
| - self.cwd = self.initial_cwd
|
| -
|
| - def __init__(self, blacklist, thunk_pid, initial_cwd):
|
| - logging.info(
|
| - '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd))
|
| - super(Dtrace.Context, self).__init__(blacklist)
|
| - assert isinstance(initial_cwd, unicode), initial_cwd
|
| - # Process ID of the temporary script created by create_subprocess_thunk().
|
| - self._thunk_pid = thunk_pid
|
| - self._initial_cwd = initial_cwd
|
| - self._line_number = 0
|
| -
|
| - def on_line(self, line):
|
| - assert isinstance(line, unicode), line
|
| - self._line_number += 1
|
| - match = self.RE_HEADER.match(line)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Found malformed line: %s' % line,
|
| - None,
|
| - self._line_number,
|
| - line)
|
| - fn = getattr(
|
| - self,
|
| - 'handle_%s' % match.group(2).replace('-', '_'),
|
| - self._handle_ignored)
|
| - # It is guaranteed to succeed because of the regexp. Or at least I thought
|
| - # it would.
|
| - pid = int(match.group(1))
|
| - try:
|
| - return fn(pid, match.group(3))
|
| - except TracingFailure, e:
|
| - # Hack in the values since the handler could be a static function.
|
| - e.pid = pid
|
| - e.line = line
|
| - e.line_number = self._line_number
|
| - # Re-raise the modified exception.
|
| - raise
|
| - except (KeyError, NotImplementedError, ValueError), e:
|
| - raise TracingFailure(
|
| - 'Trace generated a %s exception: %s' % (
|
| - e.__class__.__name__, str(e)),
|
| - pid,
|
| - self._line_number,
|
| - line,
|
| - e)
|
| -
|
| - def to_results(self):
|
| - process = self.root_process.to_results_process()
|
| - # Internal concistency check.
|
| - if sorted(self._process_lookup) != sorted(p.pid for p in process.all):
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in process lifetime detection '
|
| - 'while looking for len(tree) == len(list)',
|
| - None,
|
| - None,
|
| - None,
|
| - sorted(self._process_lookup),
|
| - sorted(p.pid for p in process.all))
|
| - return Results(process)
|
| -
|
| - def handle_dtrace_BEGIN(self, _pid, args):
|
| - if not self.RE_DTRACE_BEGIN.match(args):
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in dtrace_BEGIN log line',
|
| - None, None, None)
|
| -
|
| - def handle_proc_start(self, pid, args):
|
| - """Transfers cwd.
|
| -
|
| - The dtrace script already takes care of only tracing the processes that
|
| - are child of the traced processes so there is no need to verify the
|
| - process hierarchy.
|
| - """
|
| - if pid in self._process_lookup:
|
| - raise TracingFailure(
|
| - 'Found internal inconsitency in proc_start: %d started two times' %
|
| - pid,
|
| - None, None, None)
|
| - match = self.RE_PROC_START.match(args)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Failed to parse arguments: %s' % args,
|
| - None, None, None)
|
| - ppid = int(match.group(1))
|
| - if ppid == self._thunk_pid and not self.root_process:
|
| - proc = self.root_process = self.Process(
|
| - self.blacklist, pid, self._initial_cwd)
|
| - elif ppid in self._process_lookup:
|
| - proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd)
|
| - self._process_lookup[ppid].children.append(proc)
|
| - else:
|
| - # Another process tree, ignore.
|
| - return
|
| - self._process_lookup[pid] = proc
|
| - logging.debug(
|
| - 'New child: %s -> %d cwd:%s' %
|
| - (ppid, pid, render(proc.initial_cwd)))
|
| -
|
| - def handle_proc_exit(self, pid, _args):
|
| - """Removes cwd."""
|
| - if pid in self._process_lookup:
|
| - # self._thunk_pid is not traced itself and other traces run neither.
|
| - self._process_lookup[pid].cwd = None
|
| -
|
| - def handle_execve(self, pid, args):
|
| - """Sets the process' executable.
|
| -
|
| - TODO(maruel): Read command line arguments. See
|
| - https://discussions.apple.com/thread/1980539 for an example.
|
| - https://gist.github.com/1242279
|
| -
|
| - Will have to put the answer at http://stackoverflow.com/questions/7556249.
|
| - :)
|
| - """
|
| - if not pid in self._process_lookup:
|
| - # Another process tree, ignore.
|
| - return
|
| - match = self.RE_EXECVE.match(args)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Failed to parse arguments: %r' % args,
|
| - None, None, None)
|
| - proc = self._process_lookup[pid]
|
| - proc.executable = match.group(1)
|
| - self._handle_file(pid, proc.executable, Results.File.READ)
|
| - proc.command = self.process_escaped_arguments(match.group(3))
|
| - if int(match.group(2)) != len(proc.command):
|
| - raise TracingFailure(
|
| - 'Failed to parse execve() arguments: %s' % args,
|
| - None, None, None)
|
| -
|
| - def handle_chdir(self, pid, args):
|
| - """Updates cwd."""
|
| - if pid not in self._process_lookup:
|
| - # Another process tree, ignore.
|
| - return
|
| - cwd = self.RE_CHDIR.match(args).group(1)
|
| - if not cwd.startswith('/'):
|
| - cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd)
|
| - logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
|
| - else:
|
| - logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
|
| - cwd2 = cwd
|
| - self._process_lookup[pid].cwd = cwd2
|
| -
|
| - def handle_open_nocancel(self, pid, args):
|
| - """Redirects to handle_open()."""
|
| - return self.handle_open(pid, args)
|
| -
|
| - def handle_open(self, pid, args):
|
| - if pid not in self._process_lookup:
|
| - # Another process tree, ignore.
|
| - return
|
| - match = self.RE_OPEN.match(args)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Failed to parse arguments: %s' % args,
|
| - None, None, None)
|
| - flag = int(match.group(2), 16)
|
| - if self.O_DIRECTORY & flag == self.O_DIRECTORY:
|
| - # Ignore directories.
|
| - return
|
| - self._handle_file(
|
| - pid,
|
| - match.group(1),
|
| - Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag)
|
| - else Results.File.WRITE)
|
| -
|
| - def handle_rename(self, pid, args):
|
| - if pid not in self._process_lookup:
|
| - # Another process tree, ignore.
|
| - return
|
| - match = self.RE_RENAME.match(args)
|
| - if not match:
|
| - raise TracingFailure(
|
| - 'Failed to parse arguments: %s' % args,
|
| - None, None, None)
|
| - self._handle_file(pid, match.group(1), Results.File.READ)
|
| - self._handle_file(pid, match.group(2), Results.File.WRITE)
|
| -
|
| - def _handle_file(self, pid, filepath, mode):
|
| - if not filepath.startswith('/'):
|
| - filepath = os.path.join(self._process_lookup[pid].cwd, filepath)
|
| - # We can get '..' in the path.
|
| - filepath = os.path.normpath(filepath)
|
| - # Sadly, still need to filter out directories here;
|
| - # saw open_nocancel(".", 0, 0) = 0 lines.
|
| - if os.path.isdir(filepath):
|
| - return
|
| - self._process_lookup[pid].add_file(filepath, mode)
|
| -
|
| - def handle_ftruncate(self, pid, args):
|
| - """Just used as a signal to kill dtrace, ignoring."""
|
| - pass
|
| -
|
| - @staticmethod
|
| - def _handle_ignored(pid, args):
|
| - """Is called for all the event traces that are not handled."""
|
| - raise NotImplementedError('Please implement me')
|
| -
|
| - @staticmethod
|
| - def process_escaped_arguments(text):
|
| - """Extracts escaped arguments on a string and return the arguments as a
|
| - list.
|
| -
|
| - Implemented as an automaton.
|
| -
|
| - Example:
|
| - With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the
|
| - function will return ['python2.7', '-c', 'print("hi")]
|
| - """
|
| - if not text.endswith('\\0'):
|
| - raise ValueError('String is not null terminated: %r' % text, text)
|
| - text = text[:-2]
|
| -
|
| - def unescape(x):
|
| - """Replaces '\\' with '\' and '\?' (where ? is anything) with ?."""
|
| - out = []
|
| - escaped = False
|
| - for i in x:
|
| - if i == '\\' and not escaped:
|
| - escaped = True
|
| - continue
|
| - escaped = False
|
| - out.append(i)
|
| - return ''.join(out)
|
| -
|
| - return [unescape(i) for i in text.split('\\001')]
|
| -
|
| - class Tracer(ApiBase.Tracer):
|
| - # pylint: disable=C0301
|
| - #
|
| - # To understand the following code, you'll want to take a look at:
|
| - # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
|
| - # https://wikis.oracle.com/display/DTrace/Variables
|
| - # http://docs.oracle.com/cd/E19205-01/820-4221/
|
| - #
|
| - # 0. Dump all the valid probes into a text file. It is important, you
|
| - # want to redirect into a file and you don't want to constantly 'sudo'.
|
| - # $ sudo dtrace -l > probes.txt
|
| - #
|
| - # 1. Count the number of probes:
|
| - # $ wc -l probes.txt
|
| - # 81823 # On OSX 10.7, including 1 header line.
|
| - #
|
| - # 2. List providers, intentionally skipping all the 'syspolicy10925' and the
|
| - # likes and skipping the header with NR>1:
|
| - # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]'
|
| - # dtrace
|
| - # fbt
|
| - # io
|
| - # ip
|
| - # lockstat
|
| - # mach_trap
|
| - # proc
|
| - # profile
|
| - # sched
|
| - # syscall
|
| - # tcp
|
| - # vminfo
|
| - #
|
| - # 3. List of valid probes:
|
| - # $ grep syscall probes.txt | less
|
| - # or use dtrace directly:
|
| - # $ sudo dtrace -l -P syscall | less
|
| - #
|
| - # trackedpid is an associative array where its value can be 0, 1 or 2.
|
| - # 0 is for untracked processes and is the default value for items not
|
| - # in the associative array.
|
| - # 1 is for tracked processes.
|
| - # 2 is for the script created by create_subprocess_thunk() only. It is not
|
| - # tracked itself but all its decendants are.
|
| - #
|
| - # The script will kill itself only once waiting_to_die == 1 and
|
| - # current_processes == 0, so that both getlogin() was called and that
|
| - # all traced processes exited.
|
| - #
|
| - # TODO(maruel): Use cacheable predicates. See
|
| - # https://wikis.oracle.com/display/DTrace/Performance+Considerations
|
| - D_CODE = """
|
| - dtrace:::BEGIN {
|
| - waiting_to_die = 0;
|
| - current_processes = 0;
|
| - logindex = 0;
|
| - printf("%d %d %s_%s(\\"%s\\")\\n",
|
| - logindex, PID, probeprov, probename, SCRIPT);
|
| - logindex++;
|
| - }
|
| -
|
| - proc:::start /trackedpid[ppid]/ {
|
| - trackedpid[pid] = 1;
|
| - current_processes += 1;
|
| - printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
|
| - logindex, pid, probeprov, probename,
|
| - ppid,
|
| - execname,
|
| - current_processes);
|
| - logindex++;
|
| - }
|
| - /* Should use SCRIPT but there is no access to this variable at that
|
| - * point. */
|
| - proc:::start /ppid == PID && execname == "Python"/ {
|
| - trackedpid[pid] = 2;
|
| - current_processes += 1;
|
| - printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n",
|
| - logindex, pid, probeprov, probename,
|
| - ppid,
|
| - execname,
|
| - current_processes);
|
| - logindex++;
|
| - }
|
| - proc:::exit /trackedpid[pid] &&
|
| - current_processes == 1 &&
|
| - waiting_to_die == 1/ {
|
| - trackedpid[pid] = 0;
|
| - current_processes -= 1;
|
| - printf("%d %d %s_%s(%d)\\n",
|
| - logindex, pid, probeprov, probename,
|
| - current_processes);
|
| - logindex++;
|
| - exit(0);
|
| - }
|
| - proc:::exit /trackedpid[pid]/ {
|
| - trackedpid[pid] = 0;
|
| - current_processes -= 1;
|
| - printf("%d %d %s_%s(%d)\\n",
|
| - logindex, pid, probeprov, probename,
|
| - current_processes);
|
| - logindex++;
|
| - }
|
| -
|
| - /* Use an arcane function to detect when we need to die */
|
| - syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ {
|
| - waiting_to_die = 1;
|
| - printf("%d %d %s()\\n", logindex, pid, probefunc);
|
| - logindex++;
|
| - }
|
| - syscall::ftruncate:entry /
|
| - pid == PID && arg0 == FILE_ID && current_processes == 0/ {
|
| - exit(0);
|
| - }
|
| -
|
| - syscall::open*:entry /trackedpid[pid] == 1/ {
|
| - self->open_arg0 = arg0;
|
| - self->open_arg1 = arg1;
|
| - self->open_arg2 = arg2;
|
| - }
|
| - syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ {
|
| - this->open_arg0 = copyinstr(self->open_arg0);
|
| - printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n",
|
| - logindex, pid, probefunc,
|
| - this->open_arg0,
|
| - self->open_arg1,
|
| - self->open_arg2);
|
| - logindex++;
|
| - this->open_arg0 = 0;
|
| - }
|
| - syscall::open*:return /trackedpid[pid] == 1/ {
|
| - self->open_arg0 = 0;
|
| - self->open_arg1 = 0;
|
| - self->open_arg2 = 0;
|
| - }
|
| -
|
| - syscall::rename:entry /trackedpid[pid] == 1/ {
|
| - self->rename_arg0 = arg0;
|
| - self->rename_arg1 = arg1;
|
| - }
|
| - syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ {
|
| - this->rename_arg0 = copyinstr(self->rename_arg0);
|
| - this->rename_arg1 = copyinstr(self->rename_arg1);
|
| - printf("%d %d %s(\\"%s\\", \\"%s\\")\\n",
|
| - logindex, pid, probefunc,
|
| - this->rename_arg0,
|
| - this->rename_arg1);
|
| - logindex++;
|
| - this->rename_arg0 = 0;
|
| - this->rename_arg1 = 0;
|
| - }
|
| - syscall::rename:return /trackedpid[pid] == 1/ {
|
| - self->rename_arg0 = 0;
|
| - self->rename_arg1 = 0;
|
| - }
|
| -
|
| - /* Track chdir, it's painful because it is only receiving relative path.
|
| - */
|
| - syscall::chdir:entry /trackedpid[pid] == 1/ {
|
| - self->chdir_arg0 = arg0;
|
| - }
|
| - syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ {
|
| - this->chdir_arg0 = copyinstr(self->chdir_arg0);
|
| - printf("%d %d %s(\\"%s\\")\\n",
|
| - logindex, pid, probefunc,
|
| - this->chdir_arg0);
|
| - logindex++;
|
| - this->chdir_arg0 = 0;
|
| - }
|
| - syscall::chdir:return /trackedpid[pid] == 1/ {
|
| - self->chdir_arg0 = 0;
|
| - }
|
| - """
|
| -
|
| - # execve-specific code, tends to throw a lot of exceptions.
|
| - D_CODE_EXECVE = """
|
| - /* Finally what we care about! */
|
| - syscall::exec*:entry /trackedpid[pid]/ {
|
| - self->exec_arg0 = copyinstr(arg0);
|
| - /* Incrementally probe for a NULL in the argv parameter of execve() to
|
| - * figure out argc. */
|
| - /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was
|
| - * found. */
|
| - self->exec_argc = 0;
|
| - /* Probe for argc==1 */
|
| - this->exec_argv = (user_addr_t*)copyin(
|
| - arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
|
| - self->exec_argc = this->exec_argv[self->exec_argc] ?
|
| - (self->exec_argc + 1) : self->exec_argc;
|
| -
|
| - /* Probe for argc==2 */
|
| - this->exec_argv = (user_addr_t*)copyin(
|
| - arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
|
| - self->exec_argc = this->exec_argv[self->exec_argc] ?
|
| - (self->exec_argc + 1) : self->exec_argc;
|
| -
|
| - /* Probe for argc==3 */
|
| - this->exec_argv = (user_addr_t*)copyin(
|
| - arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
|
| - self->exec_argc = this->exec_argv[self->exec_argc] ?
|
| - (self->exec_argc + 1) : self->exec_argc;
|
| -
|
| - /* Probe for argc==4 */
|
| - this->exec_argv = (user_addr_t*)copyin(
|
| - arg1, sizeof(user_addr_t) * (self->exec_argc + 1));
|
| - self->exec_argc = this->exec_argv[self->exec_argc] ?
|
| - (self->exec_argc + 1) : self->exec_argc;
|
| -
|
| - /* Copy the inputs strings since there is no guarantee they'll be
|
| - * present after the call completed. */
|
| - self->exec_argv0 = (self->exec_argc > 0) ?
|
| - copyinstr(this->exec_argv[0]) : "";
|
| - self->exec_argv1 = (self->exec_argc > 1) ?
|
| - copyinstr(this->exec_argv[1]) : "";
|
| - self->exec_argv2 = (self->exec_argc > 2) ?
|
| - copyinstr(this->exec_argv[2]) : "";
|
| - self->exec_argv3 = (self->exec_argc > 3) ?
|
| - copyinstr(this->exec_argv[3]) : "";
|
| - this->exec_argv = 0;
|
| - }
|
| - syscall::exec*:return /trackedpid[pid] && errno == 0/ {
|
| - /* We need to join strings here, as using multiple printf() would
|
| - * cause tearing when multiple threads/processes are traced.
|
| - * Since it is impossible to escape a string and join it to another one,
|
| - * like sprintf("%s%S", previous, more), use hackery.
|
| - * Each of the elements are split with a \\1. \\0 cannot be used because
|
| - * it is simply ignored. This will conflict with any program putting a
|
| - * \\1 in their execve() string but this should be "rare enough" */
|
| - this->args = "";
|
| - /* Process exec_argv[0] */
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 0) ? self->exec_argv0 : "");
|
| -
|
| - /* Process exec_argv[1] */
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 1) ? "\\1" : "");
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 1) ? self->exec_argv1 : "");
|
| -
|
| - /* Process exec_argv[2] */
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 2) ? "\\1" : "");
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 2) ? self->exec_argv2 : "");
|
| -
|
| - /* Process exec_argv[3] */
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 3) ? "\\1" : "");
|
| - this->args = strjoin(
|
| - this->args, (self->exec_argc > 3) ? self->exec_argv3 : "");
|
| -
|
| - /* Prints self->exec_argc to permits verifying the internal
|
| - * consistency since this code is quite fishy. */
|
| - printf("%d %d %s(\\"%s\\", [%d, %S])\\n",
|
| - logindex, pid, probefunc,
|
| - self->exec_arg0,
|
| - self->exec_argc,
|
| - this->args);
|
| - logindex++;
|
| - this->args = 0;
|
| - }
|
| - syscall::exec*:return /trackedpid[pid]/ {
|
| - self->exec_arg0 = 0;
|
| - self->exec_argc = 0;
|
| - self->exec_argv0 = 0;
|
| - self->exec_argv1 = 0;
|
| - self->exec_argv2 = 0;
|
| - self->exec_argv3 = 0;
|
| - }
|
| - """
|
| -
|
| - # Code currently not used.
|
| - D_EXTRANEOUS = """
|
| - /* This is a good learning experience, since it traces a lot of things
|
| - * related to the process and child processes.
|
| - * Warning: it generates a gigantic log. For example, tracing
|
| - * "data/trace_inputs/child1.py --child" generates a 2mb log and takes
|
| - * several minutes to execute.
|
| - */
|
| - /*
|
| - mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
|
| - printf("%d %d %s_%s() = %d\\n",
|
| - logindex, pid, probeprov, probefunc, errno);
|
| - logindex++;
|
| - }
|
| - proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
|
| - printf("%d %d %s_%s() = %d\\n",
|
| - logindex, pid, probeprov, probefunc, errno);
|
| - logindex++;
|
| - }
|
| - sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
|
| - printf("%d %d %s_%s() = %d\\n",
|
| - logindex, pid, probeprov, probefunc, errno);
|
| - logindex++;
|
| - }
|
| - syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
|
| - printf("%d %d %s_%s() = %d\\n",
|
| - logindex, pid, probeprov, probefunc, errno);
|
| - logindex++;
|
| - }
|
| - vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ {
|
| - printf("%d %d %s_%s() = %d\\n",
|
| - logindex, pid, probeprov, probefunc, errno);
|
| - logindex++;
|
| - }
|
| - */
|
| - /* TODO(maruel): *stat* functions and friends
|
| - syscall::access:return,
|
| - syscall::chdir:return,
|
| - syscall::chflags:return,
|
| - syscall::chown:return,
|
| - syscall::chroot:return,
|
| - syscall::getattrlist:return,
|
| - syscall::getxattr:return,
|
| - syscall::lchown:return,
|
| - syscall::lstat64:return,
|
| - syscall::lstat:return,
|
| - syscall::mkdir:return,
|
| - syscall::pathconf:return,
|
| - syscall::readlink:return,
|
| - syscall::removexattr:return,
|
| - syscall::setxattr:return,
|
| - syscall::stat64:return,
|
| - syscall::stat:return,
|
| - syscall::truncate:return,
|
| - syscall::unlink:return,
|
| - syscall::utimes:return,
|
| - */
|
| - """
|
| -
|
| - def __init__(self, logname, use_sudo):
|
| - """Starts the log collection with dtrace.
|
| -
|
| - Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so
|
| - this needs to wait for dtrace to be "warmed up".
|
| - """
|
| - super(Dtrace.Tracer, self).__init__(logname)
|
| - # This script is used as a signal to figure out the root process.
|
| - self._signal_script = create_subprocess_thunk()
|
| - self._scripts_to_cleanup.append(self._signal_script)
|
| - # This unique dummy temp file is used to signal the dtrace script that it
|
| - # should stop as soon as all the child processes are done. A bit hackish
|
| - # but works fine enough.
|
| - self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp(
|
| - prefix='trace_signal_file')
|
| -
|
| - dtrace_path = '/usr/sbin/dtrace'
|
| - if not os.path.isfile(dtrace_path):
|
| - dtrace_path = 'dtrace'
|
| - elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID):
|
| - # No need to sudo. For those following at home, don't do that.
|
| - use_sudo = False
|
| -
|
| - # Note: do not use the -p flag. It's useless if the initial process quits
|
| - # too fast, resulting in missing traces from the grand-children. The D
|
| - # code manages the dtrace lifetime itself.
|
| - trace_cmd = [
|
| - dtrace_path,
|
| - # Use a larger buffer if getting 'out of scratch space' errors.
|
| - # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables
|
| - '-b', '10m',
|
| - '-x', 'dynvarsize=10m',
|
| - #'-x', 'dtrace_global_maxsize=1m',
|
| - '-x', 'evaltime=exec',
|
| - '-o', '/dev/stderr',
|
| - '-q',
|
| - '-n', self._get_dtrace_code(),
|
| - ]
|
| - if use_sudo is not False:
|
| - trace_cmd.insert(0, 'sudo')
|
| -
|
| - with open(self._logname + '.log', 'wb') as logfile:
|
| - self._dtrace = subprocess.Popen(
|
| - trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
|
| - logging.debug('Started dtrace pid: %d' % self._dtrace.pid)
|
| -
|
| - # Reads until one line is printed, which signifies dtrace is up and ready.
|
| - with open(self._logname + '.log', 'rb') as logfile:
|
| - while 'dtrace_BEGIN' not in logfile.readline():
|
| - if self._dtrace.poll() is not None:
|
| - # Do a busy wait. :/
|
| - break
|
| - logging.debug('dtrace started')
|
| -
|
| - def _get_dtrace_code(self):
|
| - """Setups the D code to implement child process tracking.
|
| -
|
| - Injects the cookie in the script so it knows when to stop.
|
| -
|
| - The script will detect any instance of the script created with
|
| - create_subprocess_thunk() and will start tracing it.
|
| - """
|
| - out = (
|
| - 'inline int PID = %d;\n'
|
| - 'inline string SCRIPT = "%s";\n'
|
| - 'inline int FILE_ID = %d;\n'
|
| - '\n'
|
| - '%s') % (
|
| - os.getpid(),
|
| - self._signal_script,
|
| - self._dummy_file_id,
|
| - self.D_CODE)
|
| - if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1':
|
| - # Do not enable by default since it tends to spew dtrace: error lines
|
| - # because the execve() parameters are not in valid memory at the time of
|
| - # logging.
|
| - # TODO(maruel): Find a way to make this reliable since it's useful but
|
| - # only works in limited/trivial uses cases for now.
|
| - out += self.D_CODE_EXECVE
|
| - return out
|
| -
|
| - def trace(self, cmd, cwd, tracename, output):
|
| - """Runs dtrace on an executable.
|
| -
|
| - This dtruss is broken when it starts the process itself or when tracing
|
| - child processes, this code starts a wrapper process
|
| - generated with create_subprocess_thunk() which starts the executable to
|
| - trace.
|
| - """
|
| - logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
|
| - assert os.path.isabs(cmd[0]), cmd[0]
|
| - assert os.path.isabs(cwd), cwd
|
| - assert os.path.normpath(cwd) == cwd, cwd
|
| - with self._lock:
|
| - if not self._initialized:
|
| - raise TracingFailure(
|
| - 'Called Tracer.trace() on an unitialized object',
|
| - None, None, None, tracename)
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| -
|
| - # Starts the script wrapper to start the child process. This signals the
|
| - # dtrace script that this process is to be traced.
|
| - stdout = stderr = None
|
| - if output:
|
| - stdout = subprocess.PIPE
|
| - stderr = subprocess.STDOUT
|
| - child_cmd = [
|
| - sys.executable,
|
| - self._signal_script,
|
| - tracename,
|
| - ]
|
| - # Call a dummy function so that dtrace knows I'm about to launch a process
|
| - # that needs to be traced.
|
| - # Yummy.
|
| - child = subprocess.Popen(
|
| - child_cmd + tools.fix_python_path(cmd),
|
| - stdin=subprocess.PIPE,
|
| - stdout=stdout,
|
| - stderr=stderr,
|
| - cwd=cwd)
|
| - logging.debug('Started child pid: %d' % child.pid)
|
| -
|
| - out = child.communicate()[0]
|
| - # This doesn't mean tracing is done, one of the grand-child process may
|
| - # still be alive. It will be tracked with the dtrace script.
|
| -
|
| - with self._lock:
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| - self._traces.append(
|
| - {
|
| - 'cmd': cmd,
|
| - 'cwd': cwd,
|
| - 'pid': child.pid,
|
| - 'output': out,
|
| - 'trace': tracename,
|
| - })
|
| - return child.returncode, out
|
| -
|
| - def close(self, timeout=None):
|
| - """Terminates dtrace."""
|
| - logging.debug('close(%s)' % timeout)
|
| - try:
|
| - try:
|
| - super(Dtrace.Tracer, self).close(timeout)
|
| - # Signal dtrace that it should stop now.
|
| - # ftruncate doesn't exist on Windows.
|
| - os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101
|
| - if timeout:
|
| - start = time.time()
|
| - # Use polling. :/
|
| - while (self._dtrace.poll() is None and
|
| - (time.time() - start) < timeout):
|
| - time.sleep(0.1)
|
| - self._dtrace.kill()
|
| - self._dtrace.wait()
|
| - finally:
|
| - # Make sure to kill it in any case.
|
| - if self._dtrace.poll() is None:
|
| - try:
|
| - self._dtrace.kill()
|
| - self._dtrace.wait()
|
| - except OSError:
|
| - pass
|
| -
|
| - if self._dtrace.returncode != 0:
|
| - # Warn about any dtrace failure but basically ignore it.
|
| - print 'dtrace failure: %s' % self._dtrace.returncode
|
| - finally:
|
| - os.close(self._dummy_file_id)
|
| - os.remove(self._dummy_file_name)
|
| -
|
| - def post_process_log(self):
|
| - """Sorts the log back in order when each call occured.
|
| -
|
| - dtrace doesn't save the buffer in strict order since it keeps one buffer
|
| - per CPU.
|
| - """
|
| - super(Dtrace.Tracer, self).post_process_log()
|
| - logname = self._logname + '.log'
|
| - with open(logname, 'rb') as logfile:
|
| - lines = [l for l in logfile if l.strip()]
|
| - errors = [l for l in lines if l.startswith('dtrace:')]
|
| - if errors:
|
| - raise TracingFailure(
|
| - 'Found errors in the trace: %s' % '\n'.join(errors),
|
| - None, None, None, logname)
|
| - try:
|
| - lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0]))
|
| - except ValueError:
|
| - raise TracingFailure(
|
| - 'Found errors in the trace: %s' % '\n'.join(
|
| - l for l in lines if l.split(' ', 1)[0].isdigit()),
|
| - None, None, None, logname)
|
| - with open(logname, 'wb') as logfile:
|
| - logfile.write(''.join(lines))
|
| -
|
| - def __init__(self, use_sudo=None):
|
| - super(Dtrace, self).__init__()
|
| - self.use_sudo = use_sudo
|
| -
|
| - def get_tracer(self, logname):
|
| - return self.Tracer(logname, self.use_sudo)
|
| -
|
| - @staticmethod
|
| - def clean_trace(logname):
|
| - for ext in ('', '.log'):
|
| - if os.path.isfile(logname + ext):
|
| - os.remove(logname + ext)
|
| -
|
| - @classmethod
|
| - def parse_log(cls, logname, blacklist, trace_name):
|
| - logging.info('parse_log(%s, ..., %s)', logname, trace_name)
|
| - assert os.path.isabs(logname)
|
| -
|
| - def blacklist_more(filepath):
|
| - # All the HFS metadata is in the form /.vol/...
|
| - return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath)
|
| -
|
| - data = read_json(logname)
|
| - out = []
|
| - for item in data['traces']:
|
| - if trace_name and item['trace'] != trace_name:
|
| - continue
|
| - result = {
|
| - 'output': item['output'],
|
| - 'trace': item['trace'],
|
| - }
|
| - try:
|
| - context = cls.Context(blacklist_more, item['pid'], item['cwd'])
|
| - # It's fine to assume the file as UTF-8: OSX enforces the file names to
|
| - # be valid UTF-8 and we control the log output.
|
| - for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'):
|
| - context.on_line(line)
|
| - result['results'] = context.to_results()
|
| - except TracingFailure:
|
| - result['exception'] = sys.exc_info()
|
| - out.append(result)
|
| - return out
|
| -
|
| -
|
| -class LogmanTrace(ApiBase):
|
| - """Uses the native Windows ETW based tracing functionality to trace a child
|
| - process.
|
| -
|
| - Caveat: this implementations doesn't track cwd or initial_cwd. It is because
|
| - the Windows Kernel doesn't have a concept of 'current working directory' at
|
| - all. A Win32 process has a map of current directories, one per drive letter
|
| - and it is managed by the user mode kernel32.dll. In kernel, a file is always
|
| - opened relative to another file_object or as an absolute path. All the current
|
| - working directory logic is done in user mode.
|
| - """
|
| - class Context(ApiBase.Context):
|
| - """Processes a ETW log line and keeps the list of existent and non
|
| - existent files accessed.
|
| -
|
| - Ignores directories.
|
| - """
|
| - # These indexes are for the stripped version in json.
|
| - EVENT_NAME = 0
|
| - TYPE = 1
|
| - PID = 2
|
| - TID = 3
|
| - PROCESSOR_ID = 4
|
| - TIMESTAMP = 5
|
| - USER_DATA = 6
|
| -
|
| - class Process(ApiBase.Context.Process):
|
| - def __init__(self, *args):
|
| - super(LogmanTrace.Context.Process, self).__init__(*args)
|
| - # Handle file objects that succeeded.
|
| - self.file_objects = {}
|
| -
|
| - def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd):
|
| - logging.info(
|
| - '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name,
|
| - thunk_cmd)
|
| - super(LogmanTrace.Context, self).__init__(blacklist)
|
| - self._drive_map = file_path.DosDriveMap()
|
| - # Threads mapping to the corresponding process id.
|
| - self._threads_active = {}
|
| - # Process ID of the tracer, e.g. the temporary script created by
|
| - # create_subprocess_thunk(). This is tricky because the process id may
|
| - # have been reused.
|
| - self._thunk_pid = thunk_pid
|
| - self._thunk_cmd = thunk_cmd
|
| - self._trace_name = trace_name
|
| - self._line_number = 0
|
| - self._thunk_process = None
|
| -
|
| - def on_line(self, line):
|
| - """Processes a json Event line."""
|
| - self._line_number += 1
|
| - try:
|
| - # By Opcode
|
| - handler = getattr(
|
| - self,
|
| - 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
|
| - None)
|
| - if not handler:
|
| - raise TracingFailure(
|
| - 'Unexpected event %s_%s' % (
|
| - line[self.EVENT_NAME], line[self.TYPE]),
|
| - None, None, None)
|
| - handler(line)
|
| - except TracingFailure, e:
|
| - # Hack in the values since the handler could be a static function.
|
| - e.pid = line[self.PID]
|
| - e.line = line
|
| - e.line_number = self._line_number
|
| - # Re-raise the modified exception.
|
| - raise
|
| - except (KeyError, NotImplementedError, ValueError), e:
|
| - raise TracingFailure(
|
| - 'Trace generated a %s exception: %s' % (
|
| - e.__class__.__name__, str(e)),
|
| - line[self.PID],
|
| - self._line_number,
|
| - line,
|
| - e)
|
| -
|
| - def to_results(self):
|
| - if not self.root_process:
|
| - raise TracingFailure(
|
| - 'Failed to detect the initial process %d' % self._thunk_pid,
|
| - None, None, None)
|
| - process = self.root_process.to_results_process()
|
| - return Results(process)
|
| -
|
| - def _thread_to_process(self, tid):
|
| - """Finds the process from the thread id."""
|
| - tid = int(tid, 16)
|
| - pid = self._threads_active.get(tid)
|
| - if not pid or not self._process_lookup.get(pid):
|
| - return
|
| - return self._process_lookup[pid]
|
| -
|
| - @classmethod
|
| - def handle_EventTrace_Header(cls, line):
|
| - """Verifies no event was dropped, e.g. no buffer overrun occured."""
|
| - BUFFER_SIZE = cls.USER_DATA
|
| - #VERSION = cls.USER_DATA + 1
|
| - #PROVIDER_VERSION = cls.USER_DATA + 2
|
| - #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3
|
| - #END_TIME = cls.USER_DATA + 4
|
| - #TIMER_RESOLUTION = cls.USER_DATA + 5
|
| - #MAX_FILE_SIZE = cls.USER_DATA + 6
|
| - #LOG_FILE_MODE = cls.USER_DATA + 7
|
| - #BUFFERS_WRITTEN = cls.USER_DATA + 8
|
| - #START_BUFFERS = cls.USER_DATA + 9
|
| - #POINTER_SIZE = cls.USER_DATA + 10
|
| - EVENTS_LOST = cls.USER_DATA + 11
|
| - #CPU_SPEED = cls.USER_DATA + 12
|
| - #LOGGER_NAME = cls.USER_DATA + 13
|
| - #LOG_FILE_NAME = cls.USER_DATA + 14
|
| - #BOOT_TIME = cls.USER_DATA + 15
|
| - #PERF_FREQ = cls.USER_DATA + 16
|
| - #START_TIME = cls.USER_DATA + 17
|
| - #RESERVED_FLAGS = cls.USER_DATA + 18
|
| - #BUFFERS_LOST = cls.USER_DATA + 19
|
| - #SESSION_NAME_STRING = cls.USER_DATA + 20
|
| - #LOG_FILE_NAME_STRING = cls.USER_DATA + 21
|
| - if line[EVENTS_LOST] != '0':
|
| - raise TracingFailure(
|
| - ( '%s events were lost during trace, please increase the buffer '
|
| - 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
|
| - None, None, None)
|
| -
|
| - def handle_FileIo_Cleanup(self, line):
|
| - """General wisdom: if a file is closed, it's because it was opened.
|
| -
|
| - Note that FileIo_Close is not used since if a file was opened properly but
|
| - not closed before the process exits, only Cleanup will be logged.
|
| - """
|
| - #IRP = self.USER_DATA
|
| - TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
|
| - FILE_OBJECT = self.USER_DATA + 2
|
| - #FILE_KEY = self.USER_DATA + 3
|
| - proc = self._thread_to_process(line[TTID])
|
| - if not proc:
|
| - # Not a process we care about.
|
| - return
|
| - file_object = line[FILE_OBJECT]
|
| - if file_object in proc.file_objects:
|
| - filepath, access_type = proc.file_objects.pop(file_object)
|
| - proc.add_file(filepath, access_type)
|
| -
|
| - def handle_FileIo_Create(self, line):
|
| - """Handles a file open.
|
| -
|
| - All FileIo events are described at
|
| - http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx
|
| - for some value of 'description'.
|
| -
|
| - " (..) process and thread id values of the IO events (..) are not valid "
|
| - http://msdn.microsoft.com/magazine/ee358703.aspx
|
| -
|
| - The FileIo.Create event doesn't return if the CreateFile() call
|
| - succeeded, so keep track of the file_object and check that it is
|
| - eventually closed with FileIo_Cleanup.
|
| - """
|
| - #IRP = self.USER_DATA
|
| - TTID = self.USER_DATA + 1 # Thread ID, that's what we want.
|
| - FILE_OBJECT = self.USER_DATA + 2
|
| - #CREATE_OPTIONS = self.USER_DATA + 3
|
| - #FILE_ATTRIBUTES = self.USER_DATA + 4
|
| - #SHARE_ACCESS = self.USER_DATA + 5
|
| - OPEN_PATH = self.USER_DATA + 6
|
| -
|
| - proc = self._thread_to_process(line[TTID])
|
| - if not proc:
|
| - # Not a process we care about.
|
| - return
|
| -
|
| - raw_path = line[OPEN_PATH]
|
| - # Ignore directories and bare drive right away.
|
| - if raw_path.endswith(os.path.sep):
|
| - return
|
| - filepath = self._drive_map.to_win32(raw_path)
|
| - # Ignore bare drive right away. Some may still fall through with format
|
| - # like '\\?\X:'
|
| - if len(filepath) == 2:
|
| - return
|
| - file_object = line[FILE_OBJECT]
|
| - if os.path.isdir(filepath):
|
| - # There is no O_DIRECTORY equivalent on Windows. The closed is
|
| - # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
|
| - # simply discard directories are they are found.
|
| - return
|
| - # Override any stale file object.
|
| - # TODO(maruel): Figure out a way to detect if the file was opened for
|
| - # reading or writting. Sadly CREATE_OPTIONS doesn't seem to be of any help
|
| - # here. For now mark as None to make it clear we have no idea what it is
|
| - # about.
|
| - proc.file_objects[file_object] = (filepath, None)
|
| -
|
| - def handle_FileIo_Rename(self, line):
|
| - # TODO(maruel): Handle?
|
| - pass
|
| -
|
| - def handle_Process_End(self, line):
|
| - pid = line[self.PID]
|
| - if self._process_lookup.get(pid):
|
| - logging.info('Terminated: %d' % pid)
|
| - self._process_lookup[pid] = None
|
| - else:
|
| - logging.debug('Terminated: %d' % pid)
|
| - if self._thunk_process and self._thunk_process.pid == pid:
|
| - self._thunk_process = None
|
| -
|
| - def handle_Process_Start(self, line):
|
| - """Handles a new child process started by PID."""
|
| - #UNIQUE_PROCESS_KEY = self.USER_DATA
|
| - PROCESS_ID = self.USER_DATA + 1
|
| - #PARENT_PID = self.USER_DATA + 2
|
| - #SESSION_ID = self.USER_DATA + 3
|
| - #EXIT_STATUS = self.USER_DATA + 4
|
| - #DIRECTORY_TABLE_BASE = self.USER_DATA + 5
|
| - #USER_SID = self.USER_DATA + 6
|
| - IMAGE_FILE_NAME = self.USER_DATA + 7
|
| - COMMAND_LINE = self.USER_DATA + 8
|
| -
|
| - ppid = line[self.PID]
|
| - pid = int(line[PROCESS_ID], 16)
|
| - command_line = CommandLineToArgvW(line[COMMAND_LINE])
|
| - logging.debug(
|
| - 'New process %d->%d (%s) %s' %
|
| - (ppid, pid, line[IMAGE_FILE_NAME], command_line))
|
| -
|
| - if pid == self._thunk_pid:
|
| - # Need to ignore processes we don't know about because the log is
|
| - # system-wide. self._thunk_pid shall start only one process.
|
| - # This is tricky though because Windows *loves* to reuse process id and
|
| - # it happens often that the process ID of the thunk script created by
|
| - # create_subprocess_thunk() is reused. So just detecting the pid here is
|
| - # not sufficient, we must confirm the command line.
|
| - if command_line[:len(self._thunk_cmd)] != self._thunk_cmd:
|
| - logging.info(
|
| - 'Ignoring duplicate pid %d for %s: %s while searching for %s',
|
| - pid, self._trace_name, command_line, self._thunk_cmd)
|
| - return
|
| -
|
| - # TODO(maruel): The check is quite weak. Add the thunk path.
|
| - if self._thunk_process:
|
| - raise TracingFailure(
|
| - ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is '
|
| - 'already set') % (self._thunk_pid, self._thunk_process.pid),
|
| - None, None, None)
|
| - proc = self.Process(self.blacklist, pid, None)
|
| - self._thunk_process = proc
|
| - return
|
| - elif ppid == self._thunk_pid and self._thunk_process:
|
| - proc = self.Process(self.blacklist, pid, None)
|
| - self.root_process = proc
|
| - ppid = None
|
| - elif self._process_lookup.get(ppid):
|
| - proc = self.Process(self.blacklist, pid, None)
|
| - self._process_lookup[ppid].children.append(proc)
|
| - else:
|
| - # Ignore
|
| - return
|
| - self._process_lookup[pid] = proc
|
| -
|
| - proc.command = command_line
|
| - proc.executable = line[IMAGE_FILE_NAME]
|
| - # proc.command[0] may be the absolute path of 'executable' but it may be
|
| - # anything else too. If it happens that command[0] ends with executable,
|
| - # use it, otherwise defaults to the base name.
|
| - cmd0 = proc.command[0].lower()
|
| - if not cmd0.endswith('.exe'):
|
| - # TODO(maruel): That's not strictly true either.
|
| - cmd0 += '.exe'
|
| - if cmd0.endswith(proc.executable) and os.path.isfile(cmd0):
|
| - # Fix the path.
|
| - cmd0 = cmd0.replace('/', os.path.sep)
|
| - cmd0 = os.path.normpath(cmd0)
|
| - proc.executable = file_path.get_native_path_case(cmd0)
|
| - logging.info(
|
| - 'New child: %s -> %d %s' % (ppid, pid, proc.executable))
|
| -
|
| - def handle_Thread_End(self, line):
|
| - """Has the same parameters as Thread_Start."""
|
| - tid = int(line[self.TID], 16)
|
| - self._threads_active.pop(tid, None)
|
| -
|
| - def handle_Thread_Start(self, line):
|
| - """Handles a new thread created.
|
| -
|
| - Do not use self.PID here since a process' initial thread is created by
|
| - the parent process.
|
| - """
|
| - PROCESS_ID = self.USER_DATA
|
| - TTHREAD_ID = self.USER_DATA + 1
|
| - #STACK_BASE = self.USER_DATA + 2
|
| - #STACK_LIMIT = self.USER_DATA + 3
|
| - #USER_STACK_BASE = self.USER_DATA + 4
|
| - #USER_STACK_LIMIT = self.USER_DATA + 5
|
| - #AFFINITY = self.USER_DATA + 6
|
| - #WIN32_START_ADDR = self.USER_DATA + 7
|
| - #TEB_BASE = self.USER_DATA + 8
|
| - #SUB_PROCESS_TAG = self.USER_DATA + 9
|
| - #BASE_PRIORITY = self.USER_DATA + 10
|
| - #PAGE_PRIORITY = self.USER_DATA + 11
|
| - #IO_PRIORITY = self.USER_DATA + 12
|
| - #THREAD_FLAGS = self.USER_DATA + 13
|
| - # Do not use self.PID here since a process' initial thread is created by
|
| - # the parent process.
|
| - pid = int(line[PROCESS_ID], 16)
|
| - tid = int(line[TTHREAD_ID], 16)
|
| - logging.debug('New thread pid:%d, tid:%d' % (pid, tid))
|
| - self._threads_active[tid] = pid
|
| -
|
| - @classmethod
|
| - def supported_events(cls):
|
| - """Returns all the procesed events."""
|
| - out = []
|
| - for member in dir(cls):
|
| - match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
|
| - if match:
|
| - out.append(match.groups())
|
| - return out
|
| -
|
| - class Tracer(ApiBase.Tracer):
|
| - # The basic headers.
|
| - EXPECTED_HEADER = [
|
| - u'Event Name',
|
| - u'Type',
|
| - u'Event ID',
|
| - u'Version',
|
| - u'Channel',
|
| - u'Level', # 5
|
| - u'Opcode',
|
| - u'Task',
|
| - u'Keyword',
|
| - u'PID',
|
| - u'TID', # 10
|
| - u'Processor Number',
|
| - u'Instance ID',
|
| - u'Parent Instance ID',
|
| - u'Activity ID',
|
| - u'Related Activity ID', # 15
|
| - u'Clock-Time',
|
| - u'Kernel(ms)', # Both have a resolution of ~15ms which makes them
|
| - u'User(ms)', # pretty much useless.
|
| - u'User Data', # Extra arguments that are event-specific.
|
| - ]
|
| - # Only the useful headers common to all entries are listed there. Any column
|
| - # at 19 or higher is dependent on the specific event.
|
| - EVENT_NAME = 0
|
| - TYPE = 1
|
| - PID = 9
|
| - TID = 10
|
| - PROCESSOR_ID = 11
|
| - TIMESTAMP = 16
|
| - NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
|
| - USER_DATA = 19
|
| -
|
    class CsvReader(object):
      """CSV reader that reads files generated by tracerpt.exe.

      csv.reader() fails to read them properly, it mangles file names quoted
      with "" with a comma in it.

      Implemented as a hand-rolled character-level state machine because the
      quoting emitted by tracerpt.exe is not valid CSV. Py2-style iterator:
      exposes next(), not __next__().
      """
      # Parser states. The numbered comments describe what was last consumed.
      # 0. Had a ',' or one of the following ' ' after a comma, next should
      # be ' ', '"' or string or ',' for an empty field.
      ( HAD_DELIMITER,
        # 1. Processing an unquoted field up to ','.
        IN_STR,
        # 2. Processing a new field starting with '"'.
        STARTING_STR_QUOTED,
        # 3. Second quote in a row at the start of a field. It could be either
        # '""foo""' or '""'. Who the hell thought it was a great idea to use
        # the same character for delimiting and escaping?
        STARTING_SECOND_QUOTE,
        # 4. A quote inside a quoted string where the previous character was
        # not a quote, so the string is not empty. Can be either: end of a
        # quoted string (a delimiter) or a quote escape. The next char must be
        # either '"' or ','.
        HAD_QUOTE_IN_QUOTED,
        # 5. Second quote inside a quoted string.
        HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED,
        # 6. Processing a field that started with '"'.
        IN_STR_QUOTED) = range(7)

      def __init__(self, f):
        # f: file-like object; one line is consumed per next() call.
        self.f = f

      def __iter__(self):
        return self

      def next(self):
        """Splits the line in fields.

        Raises StopIteration at end of file.
        """
        line = self.f.readline()
        if not line:
          raise StopIteration()
        line = line.strip()
        fields = []
        state = self.HAD_DELIMITER
        for i, c in enumerate(line):
          if state == self.HAD_DELIMITER:
            if c == ',':
              # Empty field.
              fields.append('')
            elif c == ' ':
              # Ignore initial whitespaces
              pass
            elif c == '"':
              state = self.STARTING_STR_QUOTED
              fields.append('')
            else:
              # Start of a new field.
              state = self.IN_STR
              fields.append(c)

          elif state == self.IN_STR:
            # Do not accept quote inside unquoted field.
            assert c != '"', (i, c, line, fields)
            if c == ',':
              fields[-1] = fields[-1].strip()
              state = self.HAD_DELIMITER
            else:
              fields[-1] = fields[-1] + c

          elif state == self.STARTING_STR_QUOTED:
            if c == '"':
              # Do not store the character yet.
              state = self.STARTING_SECOND_QUOTE
            else:
              state = self.IN_STR_QUOTED
              fields[-1] = fields[-1] + c

          elif state == self.STARTING_SECOND_QUOTE:
            if c == ',':
              # It was an empty field. '""' == ''.
              state = self.HAD_DELIMITER
            else:
              fields[-1] = fields[-1] + '"' + c
              state = self.IN_STR_QUOTED

          elif state == self.HAD_QUOTE_IN_QUOTED:
            if c == ',':
              # End of the string.
              state = self.HAD_DELIMITER
            elif c == '"':
              state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED
            else:
              # The previous double-quote was just an unescaped quote.
              fields[-1] = fields[-1] + '"' + c
              state = self.IN_STR_QUOTED

          elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
            if c == ',':
              # End of the string.
              state = self.HAD_DELIMITER
              fields[-1] = fields[-1] + '"'
            else:
              # That's just how the logger rolls. Revert back to appending the
              # char and "guess" it was a quote in a double-quoted string.
              state = self.IN_STR_QUOTED
              fields[-1] = fields[-1] + '"' + c

          elif state == self.IN_STR_QUOTED:
            if c == '"':
              # Could be a delimiter or an escape.
              state = self.HAD_QUOTE_IN_QUOTED
            else:
              fields[-1] = fields[-1] + c

        # Flush whatever state the line ended in.
        if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED:
          fields[-1] = fields[-1] + '"'
        else:
          assert state in (
              # Terminated with a normal field.
              self.IN_STR,
              # Terminated with an empty field.
              self.STARTING_SECOND_QUOTE,
              # Terminated with a normal quoted field.
              self.HAD_QUOTE_IN_QUOTED), (
              line, state, fields)
        return fields
|
| -
|
| - def __init__(self, logname):
|
| - """Starts the log collection.
|
| -
|
| - Requires administrative access. logman.exe is synchronous so no need for a
|
| - "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID
|
| - instead. The GUID constant name is SystemTraceControlGuid. Lovely.
|
| -
|
| - One can get the list of potentially interesting providers with:
|
| - "logman query providers | findstr /i file"
|
| - """
|
| - super(LogmanTrace.Tracer, self).__init__(logname)
|
| - self._signal_script = create_subprocess_thunk()
|
| - self._scripts_to_cleanup.append(self._signal_script)
|
| - cmd_start = [
|
| - 'logman.exe',
|
| - 'start',
|
| - 'NT Kernel Logger',
|
| - '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
|
| - # splitio,fileiocompletion,syscall,file,cswitch,img
|
| - '(process,fileio,thread)',
|
| - '-o', self._logname + '.etl',
|
| - '-ets', # Send directly to kernel
|
| - # Values extracted out of thin air.
|
| - # Event Trace Session buffer size in kb.
|
| - '-bs', '10240',
|
| - # Number of Event Trace Session buffers.
|
| - '-nb', '16', '256',
|
| - ]
|
| - logging.debug('Running: %s' % cmd_start)
|
| - try:
|
| - subprocess.check_call(
|
| - cmd_start,
|
| - stdin=subprocess.PIPE,
|
| - stdout=subprocess.PIPE,
|
| - stderr=subprocess.STDOUT)
|
| - except subprocess.CalledProcessError, e:
|
| - if e.returncode == -2147024891:
|
| - print >> sys.stderr, 'Please restart with an elevated admin prompt'
|
| - elif e.returncode == -2144337737:
|
| - print >> sys.stderr, (
|
| - 'A kernel trace was already running, stop it and try again')
|
| - raise
|
| -
|
| - def trace(self, cmd, cwd, tracename, output):
|
| - logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output))
|
| - assert os.path.isabs(cmd[0]), cmd[0]
|
| - assert os.path.isabs(cwd), cwd
|
| - assert os.path.normpath(cwd) == cwd, cwd
|
| - with self._lock:
|
| - if not self._initialized:
|
| - raise TracingFailure(
|
| - 'Called Tracer.trace() on an unitialized object',
|
| - None, None, None, tracename)
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| -
|
| - # Use "logman -?" for help.
|
| -
|
| - stdout = stderr = None
|
| - if output:
|
| - stdout = subprocess.PIPE
|
| - stderr = subprocess.STDOUT
|
| -
|
| - # Run the child process.
|
| - logging.debug('Running: %s' % cmd)
|
| - # Use the temporary script generated with create_subprocess_thunk() so we
|
| - # have a clear pid owner. Since trace_inputs.py can be used as a library
|
| - # and could trace multiple processes simultaneously, it makes it more
|
| - # complex if the executable to be traced is executed directly here. It
|
| - # also solves issues related to logman.exe that needs to be executed to
|
| - # control the kernel trace.
|
| - child_cmd = [
|
| - sys.executable,
|
| - self._signal_script,
|
| - tracename,
|
| - ]
|
| - child = subprocess.Popen(
|
| - child_cmd + tools.fix_python_path(cmd),
|
| - cwd=cwd,
|
| - stdin=subprocess.PIPE,
|
| - stdout=stdout,
|
| - stderr=stderr)
|
| - logging.debug('Started child pid: %d' % child.pid)
|
| - out = child.communicate()[0]
|
| - # This doesn't mean all the grand-children are done. Sadly, we don't have
|
| - # a good way to determine that.
|
| -
|
| - with self._lock:
|
| - assert tracename not in (i['trace'] for i in self._traces)
|
| - self._traces.append({
|
| - 'cmd': cmd,
|
| - 'cwd': cwd,
|
| - 'output': out,
|
| - 'pid': child.pid,
|
| - # Used to figure out the real process when process ids are reused.
|
| - 'thunk_cmd': child_cmd,
|
| - 'trace': tracename,
|
| - })
|
| -
|
| - return child.returncode, out
|
| -
|
| - def close(self, _timeout=None):
|
| - """Stops the kernel log collection and converts the traces to text
|
| - representation.
|
| - """
|
| - with self._lock:
|
| - try:
|
| - super(LogmanTrace.Tracer, self).close()
|
| - finally:
|
| - cmd_stop = [
|
| - 'logman.exe',
|
| - 'stop',
|
| - 'NT Kernel Logger',
|
| - '-ets', # Sends the command directly to the kernel.
|
| - ]
|
| - logging.debug('Running: %s' % cmd_stop)
|
| - subprocess.check_call(
|
| - cmd_stop,
|
| - stdin=subprocess.PIPE,
|
| - stdout=subprocess.PIPE,
|
| - stderr=subprocess.STDOUT)
|
| -
|
| - def post_process_log(self):
|
| - """Converts the .etl file into .csv then into .json."""
|
| - super(LogmanTrace.Tracer, self).post_process_log()
|
| - logformat = 'csv'
|
| - self._convert_log(logformat)
|
| - self._trim_log(logformat)
|
| -
|
| - def _gen_logdata(self):
|
| - return {
|
| - 'format': 'csv',
|
| - 'traces': self._traces,
|
| - }
|
| -
|
| - def _trim_log(self, logformat):
|
| - """Reduces the amount of data in original log by generating a 'reduced'
|
| - log.
|
| - """
|
| - if logformat == 'csv_utf16':
|
| - file_handle = codecs.open(
|
| - self._logname + '.' + logformat, 'r', encoding='utf-16')
|
| -
|
| - elif logformat == 'csv':
|
| - assert sys.getfilesystemencoding() == 'mbcs'
|
| - file_handle = codecs.open(
|
| - self._logname + '.' + logformat, 'r',
|
| - encoding=get_current_encoding())
|
| -
|
| - supported_events = LogmanTrace.Context.supported_events()
|
| -
|
| - def trim(generator):
|
| - """Loads items from the generator and returns the interesting data.
|
| -
|
| - It filters out any uninteresting line and reduce the amount of data in
|
| - the trace.
|
| - """
|
| - for index, line in enumerate(generator):
|
| - if not index:
|
| - if line != self.EXPECTED_HEADER:
|
| - raise TracingFailure(
|
| - 'Found malformed header: %s' % line,
|
| - None, None, None)
|
| - continue
|
| - # As you can see, the CSV is full of useful non-redundant information:
|
| - if (line[2] != '0' or # Event ID
|
| - line[3] not in ('2', '3') or # Version
|
| - line[4] != '0' or # Channel
|
| - line[5] != '0' or # Level
|
| - line[7] != '0' or # Task
|
| - line[8] != '0x0000000000000000' or # Keyword
|
| - line[12] != '' or # Instance ID
|
| - line[13] != '' or # Parent Instance ID
|
| - line[14] != self.NULL_GUID or # Activity ID
|
| - line[15] != ''): # Related Activity ID
|
| - raise TracingFailure(
|
| - 'Found unexpected values in line: %s' % ' '.join(line),
|
| - None, None, None)
|
| -
|
| - if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events:
|
| - continue
|
| -
|
| - yield [
|
| - line[self.EVENT_NAME],
|
| - line[self.TYPE],
|
| - line[self.PID],
|
| - line[self.TID],
|
| - line[self.PROCESSOR_ID],
|
| - line[self.TIMESTAMP],
|
| - ] + line[self.USER_DATA:]
|
| -
|
| - # must not convert the trim() call into a list, since it will use too much
|
| - # memory for large trace. use a csv file as a workaround since the json
|
| - # parser requires a complete in-memory file.
|
| - with open('%s.preprocessed' % self._logname, 'wb') as f:
|
| - # $ and * can't be used in file name on windows, reducing the likelihood
|
| - # of having to escape a string.
|
| - out = csv.writer(
|
| - f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
|
| - for line in trim(self.CsvReader(file_handle)):
|
| - out.writerow([s.encode('utf-8') for s in line])
|
| -
|
| - def _convert_log(self, logformat):
|
| - """Converts the ETL trace to text representation.
|
| -
|
| - Normally, 'csv' is sufficient. If complex scripts are used (like eastern
|
| - languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
|
| -
|
| - Arguments:
|
| - - logformat: Text format to be generated, csv, csv_utf16 or xml.
|
| -
|
| - Use "tracerpt -?" for help.
|
| - """
|
| - LOCALE_INVARIANT = 0x7F
|
| - windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
|
| - cmd_convert = [
|
| - 'tracerpt.exe',
|
| - '-l', self._logname + '.etl',
|
| - '-o', self._logname + '.' + logformat,
|
| - '-gmt', # Use UTC
|
| - '-y', # No prompt
|
| - # Use -of XML to get the header of each items after column 19, e.g. all
|
| - # the actual headers of 'User Data'.
|
| - ]
|
| -
|
| - if logformat == 'csv':
|
| - # tracerpt localizes the 'Type' column, for major brainfuck
|
| - # entertainment. I can't imagine any sane reason to do that.
|
| - cmd_convert.extend(['-of', 'CSV'])
|
| - elif logformat == 'csv_utf16':
|
| - # This causes it to use UTF-16, which doubles the log size but ensures
|
| - # the log is readable for non-ASCII characters.
|
| - cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
|
| - elif logformat == 'xml':
|
| - cmd_convert.extend(['-of', 'XML'])
|
| - else:
|
| - raise ValueError('Unexpected log format \'%s\'' % logformat)
|
| - logging.debug('Running: %s' % cmd_convert)
|
| - # This can takes tens of minutes for large logs.
|
| - # Redirects all output to stderr.
|
| - subprocess.check_call(
|
| - cmd_convert,
|
| - stdin=subprocess.PIPE,
|
| - stdout=sys.stderr,
|
| - stderr=sys.stderr)
|
| -
|
| - def __init__(self, use_sudo=False): # pylint: disable=W0613
|
| - super(LogmanTrace, self).__init__()
|
| - # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API.
|
| -
|
| - @staticmethod
|
| - def clean_trace(logname):
|
| - for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'):
|
| - if os.path.isfile(logname + ext):
|
| - os.remove(logname + ext)
|
| -
|
| - @classmethod
|
| - def parse_log(cls, logname, blacklist, trace_name):
|
| - logging.info('parse_log(%s, ..., %s)', logname, trace_name)
|
| - assert os.path.isabs(logname)
|
| -
|
| - def blacklist_more(filepath):
|
| - # All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
|
| - return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
|
| -
|
| - # Create a list of (Context, result_dict) tuples. This is necessary because
|
| - # the csv file may be larger than the amount of available memory.
|
| - contexes = [
|
| - (
|
| - cls.Context(
|
| - blacklist_more, item['pid'], item['trace'], item['thunk_cmd']),
|
| - {
|
| - 'output': item['output'],
|
| - 'trace': item['trace'],
|
| - },
|
| - )
|
| - for item in read_json(logname)['traces']
|
| - if not trace_name or item['trace'] == trace_name
|
| - ]
|
| -
|
| - # The log may be too large to fit in memory and it is not efficient to read
|
| - # it multiple times, so multiplex the contexes instead, which is slightly
|
| - # more awkward.
|
| - with open('%s.preprocessed' % logname, 'rb') as f:
|
| - lines = csv.reader(
|
| - f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL)
|
| - for encoded in lines:
|
| - line = [s.decode('utf-8') for s in encoded]
|
| - # Convert the PID in-place from hex.
|
| - line[cls.Context.PID] = int(line[cls.Context.PID], 16)
|
| - for context in contexes:
|
| - if 'exception' in context[1]:
|
| - continue
|
| - try:
|
| - context[0].on_line(line)
|
| - except TracingFailure:
|
| - context[1]['exception'] = sys.exc_info()
|
| -
|
| - for context in contexes:
|
| - if 'exception' in context[1]:
|
| - continue
|
| - context[1]['results'] = context[0].to_results()
|
| -
|
| - return [context[1] for context in contexes]
|
| -
|
| -
|
def get_api(**kwargs):
  """Returns the correct implementation for the current OS.

  Windows uses the ETW kernel logger, OSX/Solaris/FreeBSD use dtrace and
  everything else falls back to strace.
  """
  if sys.platform == 'cygwin':
    raise NotImplementedError(
        'Not implemented for cygwin, start the script from Win32 python')
  tracer_class = {
    'win32': LogmanTrace,
    'darwin': Dtrace,
    'sunos5': Dtrace,
    'freebsd7': Dtrace,
    'freebsd8': Dtrace,
  }.get(sys.platform, Strace)
  return tracer_class(**kwargs)
|
| -
|
| -
|
def extract_directories(root_dir, files, blacklist):
  """Detects if all the files in a directory are in |files| and if so, replace
  the individual files by a Results.Directory instance.

  Takes a list of Results.File instances and returns a shorter list of
  Results.File and Results.Directory instances.

  Arguments:
  - root_dir: Optional base directory that shouldn't be search further.
  - files: list of Results.File instances.
  - blacklist: lambda to reject unneeded files, for example r'.+\.pyc'.
  """
  logging.info(
      'extract_directories(%s, %d files, ...)' % (root_dir, len(files)))
  assert not (root_dir or '').endswith(os.path.sep), root_dir
  # It is important for root_dir to not be a symlinked path, make sure to call
  # os.path.realpath() as needed.
  assert not root_dir or (
      os.path.realpath(file_path.get_native_path_case(root_dir)) == root_dir)
  assert not any(isinstance(f, Results.Directory) for f in files)
  # Remove non existent files.
  files = [f for f in files if f.existent]
  if not files:
    return files
  # All files must share the same root, which can be None.
  assert len(set(f.root for f in files)) == 1, set(f.root for f in files)

  # Creates a {directory: {filename: File}} mapping, up to root.
  buckets = {}
  if root_dir:
    buckets[root_dir] = {}
  for fileobj in files:
    path = fileobj.full_path
    directory = os.path.dirname(path)
    assert directory
    # Do not use os.path.basename() so trailing os.path.sep is kept.
    basename = path[len(directory)+1:]
    files_in_directory = buckets.setdefault(directory, {})
    files_in_directory[basename] = fileobj
    # Add all the directories recursively up to root.
    while True:
      old_d = directory
      directory = os.path.dirname(directory)
      if directory + os.path.sep == root_dir or directory == old_d:
        break
      buckets.setdefault(directory, {})

  root_prefix = len(root_dir) + 1 if root_dir else 0
  # Walk the directories deepest-first: reverse lexicographic order sorts a
  # child path after its parent, so a fully-covered child directory is folded
  # into its parent's bucket before the parent itself is examined.
  for directory in sorted(buckets, reverse=True):
    if not os.path.isdir(directory):
      logging.debug(
          '%s was a directory but doesn\'t exist anymore; ignoring', directory)
      continue
    actual = set(f for f in os.listdir(directory) if not blacklist(f))
    expected = set(buckets[directory])
    if not (actual - expected):
      # Every non-blacklisted entry of this directory was traced: collapse the
      # whole directory into a single Results.Directory in the parent bucket.
      parent = os.path.dirname(directory)
      buckets[parent][os.path.basename(directory)] = Results.Directory(
        root_dir,
        directory[root_prefix:],
        False,
        sum(f.size for f in buckets[directory].itervalues()),
        sum(f.nb_files for f in buckets[directory].itervalues()))
      # Remove the whole bucket.
      del buckets[directory]

  # Reverse the mapping with what remains. The original instances are returned,
  # so the cached meta data is kept.
  files = sum((x.values() for x in buckets.itervalues()), [])
  return sorted(files, key=lambda x: x.path)
|
| -
|
| -
|
def trace(logfile, cmd, cwd, api, output):
  """Traces an executable. Returns (returncode, output) from api.

  Arguments:
  - logfile: file to write to.
  - cmd: command to run.
  - cwd: current directory to start the process in.
  - api: a tracing api instance.
  - output: if True, returns output, otherwise prints it at the console.
  """
  fixed_cmd = tools.fix_python_path(cmd)
  # Wipe any stale artifacts from a previous run before tracing.
  api.clean_trace(logfile)
  with api.get_tracer(logfile) as tracer:
    return tracer.trace(fixed_cmd, cwd, 'default', output)
|
| -
|
| -
|
def CMDclean(parser, args):
  """Cleans up traces."""
  options, _ = parser.parse_args(args)
  get_api().clean_trace(options.log)
  return 0
|
| -
|
| -
|
def CMDtrace(parser, args):
  """Traces an executable."""
  parser.allow_interspersed_args = False
  parser.add_option(
      '-q', '--quiet', action='store_true',
      help='Redirects traced executable output to /dev/null')
  parser.add_option(
      '-s', '--sudo', action='store_true',
      help='Use sudo when shelling out the tracer tool (ignored on Windows)')
  parser.add_option(
      # dest='sudo' is required here: optparse otherwise derives the dest
      # from the long option name and stores into options.no_sudo, which
      # nothing reads, making --no-sudo a silent no-op.
      '-n', '--no-sudo', action='store_false', dest='sudo',
      help='Don\'t use sudo')
  options, args = parser.parse_args(args)

  if not args:
    parser.error('Please provide a command to run')

  # Resolve a relative executable path now, so it still works after the
  # tracer chdir's into cwd.
  if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
    args[0] = os.path.abspath(args[0])

  # options.sudo default value is None, which is to do whatever tracer defaults
  # do.
  api = get_api(use_sudo=options.sudo)
  return trace(options.log, args, os.getcwd(), api, options.quiet)[0]
|
| -
|
| -
|
def CMDread(parser, args):
  """Reads the logs and prints the result.

  Per-trace output is either appended to a json document or pretty-printed;
  a failing trace is reported on stderr without aborting the other traces.
  """
  parser.add_option(
      '-V', '--variable',
      nargs=2,
      action='append',
      dest='variables',
      metavar='VAR_NAME directory',
      default=[],
      # Fixed help text: the example previously said '-v' (wrong flag) and
      # misspelled 'occurrences'.
      help=('Variables to replace relative directories against. Example: '
            '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of your '
            'home dir with $HOME') % getpass.getuser())
  parser.add_option(
      '--root-dir',
      # Fixed doubled word 'this this' in the help text.
      help='Root directory to base everything off it. Anything outside of this '
           'directory will not be reported')
  parser.add_option(
      '--trace-name',
      help='Only reads one of the trace. Defaults to reading all traces')
  parser.add_option(
      '-j', '--json', action='store_true',
      help='Outputs raw result data as json')
  parser.add_option(
      '--trace-blacklist', action='append', default=[],
      help='List of regexp to use as blacklist filter')
  options, args = parser.parse_args(args)

  if options.root_dir:
    options.root_dir = file_path.get_native_path_case(
        unicode(os.path.abspath(options.root_dir)))

  variables = dict(options.variables)
  api = get_api()
  blacklist = gen_blacklist(options.trace_blacklist)
  data = api.parse_log(options.log, blacklist, options.trace_name)
  # Process each trace.
  output_as_json = []
  try:
    for item in data:
      if 'exception' in item:
        # Do not abort the other traces.
        print >> sys.stderr, (
            'Trace %s: Got an exception: %s' % (
              item['trace'], item['exception'][1]))
        continue
      results = item['results']
      if options.root_dir:
        results = results.strip_root(options.root_dir)

      if options.json:
        output_as_json.append(results.flatten())
      else:
        simplified = extract_directories(
            options.root_dir, results.files, blacklist)
        simplified = [f.replace_variables(variables) for f in simplified]
        if len(data) > 1:
          print('Trace: %s' % item['trace'])
        print('Total: %d' % len(results.files))
        print('Non existent: %d' % len(results.non_existent))
        for f in results.non_existent:
          print('  %s' % f.path)
        print(
            'Interesting: %d reduced to %d' % (
              len(results.existent), len(simplified)))
        for f in simplified:
          print('  %s' % f.path)

    if options.json:
      write_json(sys.stdout, output_as_json, False)
  except KeyboardInterrupt:
    return 1
  except IOError as e:
    if e.errno == errno.EPIPE:
      # Do not print a stack trace when the output is piped to less and the user
      # quits before the whole output was written.
      return 1
    raise
  return 0
|
| -
|
| -
|
class OptionParserTraceInputs(tools.OptionParserWithLogging):
  """Option parser that adds automatic --log handling."""

  # Having both --log and --log-file available would be confusing, so the
  # inherited --log-file option is disabled.
  # TODO(vadimsh): Rename --log-file or --log to something else.
  enable_log_file = False

  def __init__(self, **kwargs):
    tools.OptionParserWithLogging.__init__(self, **kwargs)
    self.add_option(
        '-l', '--log', help='Log file to generate or read, required')

  def parse_args(self, *args, **kwargs):
    """Parses arguments and normalizes the mandatory log path.

    On Windows, / and \\ are often mixed together in a path.
    """
    options, args = tools.OptionParserWithLogging.parse_args(
        self, *args, **kwargs)
    if not options.log:
      self.error('Must supply a log file with -l')
    options.log = os.path.abspath(options.log)
    return options, args
|
| -
|
| -
|
| -def main(argv):
|
| - dispatcher = subcommand.CommandDispatcher(__name__)
|
| - try:
|
| - return dispatcher.execute(
|
| - OptionParserTraceInputs(version=__version__), argv)
|
| - except TracingFailure, e:
|
| - sys.stderr.write('\nError: ')
|
| - sys.stderr.write(str(e))
|
| - sys.stderr.write('\n')
|
| - return 1
|
| -
|
| -
|
if __name__ == '__main__':
  # Normalize stream encoding/buffering and enable console colors before
  # dispatching to main().
  fix_encoding.fix_encoding()
  tools.disable_buffering()
  colorama.init()
  sys.exit(main(sys.argv[1:]))
|
|
|