| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # coding=utf-8 | |
| 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 4 # Use of this source code is governed by a BSD-style license that can be | |
| 5 # found in the LICENSE file. | |
| 6 | |
| 7 """Traces an executable and its child processes and extract the files accessed | |
| 8 by them. | |
| 9 | |
| 10 The implementation uses OS-specific APIs. The native Kernel logger and the ETL | |
| 11 interface are used on Windows. DTrace is used on OSX. Strace is used otherwise. | |
| 12 The OS-specific implementation is hidden in an 'API' interface. | |
| 13 | |
| 14 The results are embedded in a Results instance. The tracing is done in two | |
| 15 phases, the first is to do the actual trace and generate an | |
| 16 implementation-specific log file. Then the log file is parsed to extract the | |
| 17 information, including the individual child processes and the files accessed | |
| 18 from the log. | |
| 19 """ | |
| 20 | |
| 21 import codecs | |
| 22 import csv | |
| 23 import errno | |
| 24 import getpass | |
| 25 import glob | |
| 26 import json | |
| 27 import logging | |
| 28 import os | |
| 29 import re | |
| 30 import stat | |
| 31 import subprocess | |
| 32 import sys | |
| 33 import tempfile | |
| 34 import threading | |
| 35 import time | |
| 36 import weakref | |
| 37 | |
| 38 from third_party import colorama | |
| 39 from third_party.depot_tools import fix_encoding | |
| 40 from third_party.depot_tools import subcommand | |
| 41 | |
| 42 from utils import file_path | |
| 43 from utils import tools | |
| 44 | |
| 45 ## OS-specific imports | |
| 46 | |
| 47 if sys.platform == 'win32': | |
| 48 from ctypes.wintypes import byref, c_int, c_wchar_p | |
| 49 from ctypes.wintypes import windll # pylint: disable=E0611 | |
| 50 | |
| 51 | |
| 52 __version__ = '0.1' | |
| 53 | |
| 54 | |
| 55 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
| 56 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) | |
| 57 | |
| 58 | |
| 59 class TracingFailure(Exception): | |
| 60   """An exception occurred during tracing.""" | |
| 61 def __init__(self, description, pid, line_number, line, *args): | |
| 62 super(TracingFailure, self).__init__( | |
| 63 description, pid, line_number, line, *args) | |
| 64 self.description = description | |
| 65 self.pid = pid | |
| 66 self.line_number = line_number | |
| 67 self.line = line | |
| 68 self.extra = args | |
| 69 | |
| 70 def __str__(self): | |
| 71 out = self.description | |
| 72 if self.pid: | |
| 73 out += '\npid: %d' % self.pid | |
| 74 if self.line_number: | |
| 75 out += '\nline: %d' % self.line_number | |
| 76 if self.line: | |
| 77 out += '\n%s' % self.line | |
| 78 if self.extra: | |
| 79 out += '\n' + ', '.join(map(str, filter(None, self.extra))) | |
| 80 return out | |
| 81 | |
| 82 | |
| 83 ## OS-specific functions | |
| 84 | |
| 85 if sys.platform == 'win32': | |
| 86 def get_current_encoding(): | |
| 87     """Returns the 'ANSI' code page associated with the process.""" | |
| 88 return 'cp%d' % int(windll.kernel32.GetACP()) | |
| 89 | |
| 90 | |
| 91 def CommandLineToArgvW(command_line): | |
| 92 """Splits a commandline into argv using CommandLineToArgvW().""" | |
| 93 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx | |
| 94 size = c_int() | |
| 95 assert isinstance(command_line, unicode) | |
| 96 ptr = windll.shell32.CommandLineToArgvW(command_line, byref(size)) | |
| 97 try: | |
| 98 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)] | |
| 99 finally: | |
| 100 windll.kernel32.LocalFree(ptr) | |
| 101 | |
| 102 | |
| 103 | |
| 104 def gen_blacklist(regexes): | |
| 105 """Returns a lambda to be used as a blacklist.""" | |
| 106 compiled = [re.compile(i) for i in regexes] | |
| 107 def match(f): | |
| 108 return any(j.match(f) for j in compiled) | |
| 109 return match | |
| 110 | |
| 111 | |
| 112 def create_subprocess_thunk(): | |
| 113 """Creates a small temporary script to start the child process. | |
| 114 | |
| 115   This thunk doesn't block; its unique name is used to identify it as the | |
| 116 parent. | |
| 117 """ | |
| 118 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py') | |
| 119 try: | |
| 120 os.write( | |
| 121 handle, | |
| 122 ( | |
| 123 'import subprocess, sys\n' | |
| 124 'sys.exit(subprocess.call(sys.argv[2:]))\n' | |
| 125 )) | |
| 126 finally: | |
| 127 os.close(handle) | |
| 128 return name | |
| 129 | |
| 130 | |
| 131 def create_exec_thunk(): | |
| 132 """Creates a small temporary script to start the child executable. | |
| 133 | |
| 134   Reads from the file handle provided as the first argument to block, then | |
| 135 execv() the command to be traced. | |
| 136 """ | |
| 137 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py') | |
| 138 try: | |
| 139 os.write( | |
| 140 handle, | |
| 141 ( | |
| 142 'import os, sys\n' | |
| 143 'fd = int(sys.argv[1])\n' | |
| 144 # This will block until the controlling process writes a byte on the | |
| 145 # pipe. It will do so once the tracing tool, e.g. strace, is ready to | |
| 146 # trace. | |
| 147 'os.read(fd, 1)\n' | |
| 148 'os.close(fd)\n' | |
| 149 'os.execv(sys.argv[2], sys.argv[2:])\n' | |
| 150 )) | |
| 151 finally: | |
| 152 os.close(handle) | |
| 153 return name | |
| 154 | |
| 155 | |
| 156 def strace_process_quoted_arguments(text): | |
| 157   """Extracts quoted arguments from a string and returns them as a list. | |
| 158 | |
| 159 Implemented as an automaton. Supports incomplete strings in the form | |
| 160 '"foo"...'. | |
| 161 | |
| 162 Example: | |
| 163 With text = '"foo", "bar"', the function will return ['foo', 'bar'] | |
| 164 | |
| 165 TODO(maruel): Implement escaping. | |
| 166 """ | |
| 167 # All the possible states of the DFA. | |
| 168   ( NEED_QUOTE, # Beginning of a new argument. | |
| 169 INSIDE_STRING, # Inside an argument. | |
| 170 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is. | |
| 171 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be | |
| 172     # a series of 3 dots or a comma. | |
| 173 NEED_SPACE, # Right after a comma | |
| 174 NEED_DOT_2, # Found a dot, need a second one. | |
| 175 NEED_DOT_3, # Found second dot, need a third one. | |
| 176 NEED_COMMA, # Found third dot, need a comma. | |
| 177 ) = range(8) | |
| 178 | |
| 179 state = NEED_QUOTE | |
| 180 out = [] | |
| 181 for index, char in enumerate(text): | |
| 182 if char == '"': | |
| 183 if state == NEED_QUOTE: | |
| 184 state = INSIDE_STRING | |
| 185 # A new argument was found. | |
| 186 out.append('') | |
| 187 elif state == INSIDE_STRING: | |
| 188 # The argument is now closed. | |
| 189 state = NEED_COMMA_OR_DOT | |
| 190 elif state == ESCAPED: | |
| 191 out[-1] += char | |
| 192 state = INSIDE_STRING | |
| 193 else: | |
| 194 raise ValueError( | |
| 195 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 196 char, index, text), | |
| 197 index, | |
| 198 text) | |
| 199 elif char == ',': | |
| 200 if state in (NEED_COMMA_OR_DOT, NEED_COMMA): | |
| 201 state = NEED_SPACE | |
| 202 elif state == INSIDE_STRING: | |
| 203 out[-1] += char | |
| 204 elif state == ESCAPED: | |
| 205 out[-1] += char | |
| 206 state = INSIDE_STRING | |
| 207 else: | |
| 208 raise ValueError( | |
| 209 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 210 char, index, text), | |
| 211 index, | |
| 212 text) | |
| 213 elif char == ' ': | |
| 214 if state == NEED_SPACE: | |
| 215 state = NEED_QUOTE | |
| 216 elif state == INSIDE_STRING: | |
| 217 out[-1] += char | |
| 218 elif state == ESCAPED: | |
| 219 out[-1] += char | |
| 220 state = INSIDE_STRING | |
| 221 else: | |
| 222 raise ValueError( | |
| 223 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 224 char, index, text), | |
| 225 index, | |
| 226 text) | |
| 227 elif char == '.': | |
| 228 if state in (NEED_QUOTE, NEED_COMMA_OR_DOT): | |
| 229         # The string is incomplete; this means the strace -s flag should be | |
| 230 # increased. | |
| 231 # For NEED_QUOTE, the input string would look like '"foo", ...'. | |
| 232 # For NEED_COMMA_OR_DOT, the input string would look like '"foo"...' | |
| 233 state = NEED_DOT_2 | |
| 234 elif state == NEED_DOT_2: | |
| 235 state = NEED_DOT_3 | |
| 236 elif state == NEED_DOT_3: | |
| 237 state = NEED_COMMA | |
| 238 elif state == INSIDE_STRING: | |
| 239 out[-1] += char | |
| 240 elif state == ESCAPED: | |
| 241 out[-1] += char | |
| 242 state = INSIDE_STRING | |
| 243 else: | |
| 244 raise ValueError( | |
| 245 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 246 char, index, text), | |
| 247 index, | |
| 248 text) | |
| 249 elif char == '\\': | |
| 250 if state == ESCAPED: | |
| 251 out[-1] += char | |
| 252 state = INSIDE_STRING | |
| 253 elif state == INSIDE_STRING: | |
| 254 state = ESCAPED | |
| 255 else: | |
| 256 raise ValueError( | |
| 257 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 258 char, index, text), | |
| 259 index, | |
| 260 text) | |
| 261 else: | |
| 262 if state == INSIDE_STRING: | |
| 263 out[-1] += char | |
| 264 else: | |
| 265 raise ValueError( | |
| 266 'Can\'t process char \'%s\' at column %d for: %r' % ( | |
| 267 char, index, text), | |
| 268 index, | |
| 269 text) | |
| 270 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT): | |
| 271 raise ValueError( | |
| 272 'String is incorrectly terminated: %r' % text, | |
| 273 text) | |
| 274 return out | |
| 275 | |
| 276 | |
| 277 def read_json(filepath): | |
| 278 with open(filepath, 'r') as f: | |
| 279 return json.load(f) | |
| 280 | |
| 281 | |
| 282 def write_json(filepath_or_handle, data, dense): | |
| 283 """Writes data into filepath or file handle encoded as json. | |
| 284 | |
| 285 If dense is True, the json is packed. Otherwise, it is human readable. | |
| 286 """ | |
| 287 if hasattr(filepath_or_handle, 'write'): | |
| 288 if dense: | |
| 289 filepath_or_handle.write( | |
| 290 json.dumps(data, sort_keys=True, separators=(',',':'))) | |
| 291 else: | |
| 292 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2)) | |
| 293 else: | |
| 294 with open(filepath_or_handle, 'wb') as f: | |
| 295 if dense: | |
| 296 json.dump(data, f, sort_keys=True, separators=(',',':')) | |
| 297 else: | |
| 298 json.dump(data, f, sort_keys=True, indent=2) | |
| 299 | |
| 300 | |
| 301 def assert_is_renderable(pseudo_string): | |
| 302 """Asserts the input is a valid object to be processed by render().""" | |
| 303 assert ( | |
| 304 pseudo_string is None or | |
| 305 isinstance(pseudo_string, unicode) or | |
| 306 hasattr(pseudo_string, 'render')), repr(pseudo_string) | |
| 307 | |
| 308 | |
| 309 def render(pseudo_string): | |
| 310   """Converts the pseudo-string to a unicode string.""" | |
| 311 if pseudo_string is None or isinstance(pseudo_string, unicode): | |
| 312 return pseudo_string | |
| 313 return pseudo_string.render() | |
| 314 | |
| 315 | |
| 316 class Results(object): | |
| 317 """Results of a trace session.""" | |
| 318 | |
| 319 class _TouchedObject(object): | |
| 320 """Something, a file or a directory, that was accessed.""" | |
| 321 def __init__(self, root, path, tainted, size, nb_files): | |
| 322 logging.debug( | |
| 323 '%s(%s, %s, %s, %s, %s)' % | |
| 324 (self.__class__.__name__, root, path, tainted, size, nb_files)) | |
| 325 assert_is_renderable(root) | |
| 326 assert_is_renderable(path) | |
| 327 self.root = root | |
| 328 self.path = path | |
| 329 self.tainted = tainted | |
| 330 self.nb_files = nb_files | |
| 331 # Can be used as a cache or a default value, depending on context. In | |
| 332 # particular, once self.tainted is True, because the path was replaced | |
| 333 # with a variable, it is not possible to look up the file size. | |
| 334 self._size = size | |
| 335 # These are cache only. | |
| 336 self._real_path = None | |
| 337 | |
| 338 # Check internal consistency. | |
| 339 assert path, path | |
| 340 assert tainted or bool(root) != bool(file_path.isabs(path)), (root, path) | |
| 341 assert tainted or ( | |
| 342 not os.path.exists(self.full_path) or | |
| 343 (self.full_path == file_path.get_native_path_case(self.full_path))), ( | |
| 344 tainted, | |
| 345 self.full_path, | |
| 346 file_path.get_native_path_case(self.full_path)) | |
| 347 | |
| 348 @property | |
| 349 def existent(self): | |
| 350 return self.size != -1 | |
| 351 | |
| 352 @property | |
| 353 def full_path(self): | |
| 354 if self.root: | |
| 355 return os.path.join(self.root, self.path) | |
| 356 return self.path | |
| 357 | |
| 358 @property | |
| 359 def real_path(self): | |
| 360 """Returns the path with symlinks resolved.""" | |
| 361 if not self._real_path: | |
| 362 self._real_path = os.path.realpath(self.full_path) | |
| 363 return self._real_path | |
| 364 | |
| 365 @property | |
| 366 def size(self): | |
| 367       """File's size; -1 means the file does not exist. | |
| 368 | |
| 369       Once tainted, it is not possible to retrieve the file size anymore since | |
| 370 the path is composed of variables. | |
| 371 """ | |
| 372 if self._size is None and not self.tainted: | |
| 373 try: | |
| 374 self._size = os.stat(self.full_path).st_size | |
| 375 except OSError: | |
| 376 self._size = -1 | |
| 377 return self._size | |
| 378 | |
| 379 def flatten(self): | |
| 380 """Returns a dict representing this object. | |
| 381 | |
| 382 A 'size' of 0 means the file was only touched and not read. | |
| 383 """ | |
| 384 return { | |
| 385 'path': self.path, | |
| 386 'size': self.size, | |
| 387 } | |
| 388 | |
| 389 def replace_variables(self, variables): | |
| 390 """Replaces the root of this File with one of the variables if it matches. | |
| 391 | |
| 392 If a variable replacement occurs, the cloned object becomes tainted. | |
| 393 """ | |
| 394 for variable, root_path in variables.iteritems(): | |
| 395 if self.path.startswith(root_path): | |
| 396 return self._clone( | |
| 397 self.root, variable + self.path[len(root_path):], True) | |
| 398 # No need to clone, returns ourself. | |
| 399 return self | |
| 400 | |
| 401 def strip_root(self, root): | |
| 402 """Returns a clone of itself with 'root' stripped off. | |
| 403 | |
| 404 Note that the file is kept if it is either accessible from a symlinked | |
| 405 path that was used to access the file or through the real path. | |
| 406 """ | |
| 407 # Check internal consistency. | |
| 408 assert ( | |
| 409 self.tainted or | |
| 410 (file_path.isabs(root) and root.endswith(os.path.sep))), root | |
| 411 if not self.full_path.startswith(root): | |
| 412 # Now try to resolve the symlinks to see if it can be reached this way. | |
| 413 # Only try *after* trying without resolving symlink. | |
| 414 if not self.real_path.startswith(root): | |
| 415 return None | |
| 416 path = self.real_path | |
| 417 else: | |
| 418 path = self.full_path | |
| 419 return self._clone(root, path[len(root):], self.tainted) | |
| 420 | |
| 421 def _clone(self, new_root, new_path, tainted): | |
| 422 raise NotImplementedError(self.__class__.__name__) | |
| 423 | |
| 424 class File(_TouchedObject): | |
| 425 """A file that was accessed. May not be present anymore. | |
| 426 | |
| 427 If tainted is true, it means it is not a real path anymore as a variable | |
| 428     replacement occurred. | |
| 429 | |
| 430 |mode| can be one of None, TOUCHED, READ or WRITE. | |
| 431 """ | |
| 432 # Was probed for existence, and it is existent, but was never _opened_. | |
| 433 TOUCHED = 't' | |
| 434 # Opened for read only and guaranteed to not have been written to. | |
| 435 READ = 'r' | |
| 436 # Opened for write. | |
| 437 WRITE = 'w' | |
| 438 | |
| 439 # They are listed in order of priority. E.g. if a file is traced as TOUCHED | |
| 440 # then as WRITE, only keep WRITE. None means no idea, which is a problem on | |
| 441 # Windows. | |
| 442 ACCEPTABLE_MODES = (None, TOUCHED, READ, WRITE) | |
| 443 | |
| 444 def __init__(self, root, path, tainted, size, mode): | |
| 445 assert mode in self.ACCEPTABLE_MODES | |
| 446 super(Results.File, self).__init__(root, path, tainted, size, 1) | |
| 447 self.mode = mode | |
| 448 | |
| 449 def _clone(self, new_root, new_path, tainted): | |
| 450 """Clones itself keeping meta-data.""" | |
| 451       # Keep the self.size and self._real_path caches for performance reasons. It | |
| 452 # is also important when the file becomes tainted (with a variable instead | |
| 453 # of the real path) since self.path is not an on-disk path anymore so | |
| 454 # out._size cannot be updated. | |
| 455 out = self.__class__(new_root, new_path, tainted, self.size, self.mode) | |
| 456 out._real_path = self._real_path | |
| 457 return out | |
| 458 | |
| 459 def flatten(self): | |
| 460 out = super(Results.File, self).flatten() | |
| 461 out['mode'] = self.mode | |
| 462 return out | |
| 463 | |
| 464 class Directory(_TouchedObject): | |
| 465 """A directory of files. Must exist. | |
| 466 | |
| 467 For a Directory instance, self.size is not a cache, it's an actual value | |
| 468 that is never modified and represents the total size of the files contained | |
| 469 in this directory. It is possible that the directory is empty so that | |
| 470 size==0; this happens if there's only an invalid symlink in it. | |
| 471 """ | |
| 472 def __init__(self, root, path, tainted, size, nb_files): | |
| 473 """path='.' is a valid value and must be handled appropriately.""" | |
| 474 assert not path.endswith(os.path.sep), path | |
| 475 super(Results.Directory, self).__init__( | |
| 476 root, path + os.path.sep, tainted, size, nb_files) | |
| 477 | |
| 478 def flatten(self): | |
| 479 out = super(Results.Directory, self).flatten() | |
| 480 out['nb_files'] = self.nb_files | |
| 481 return out | |
| 482 | |
| 483 def _clone(self, new_root, new_path, tainted): | |
| 484 """Clones itself keeping meta-data.""" | |
| 485 out = self.__class__( | |
| 486 new_root, | |
| 487 new_path.rstrip(os.path.sep), | |
| 488 tainted, | |
| 489 self.size, | |
| 490 self.nb_files) | |
| 491 out._real_path = self._real_path | |
| 492 return out | |
| 493 | |
| 494 class Process(object): | |
| 495 """A process that was traced. | |
| 496 | |
| 497 Contains references to the files accessed by this process and its children. | |
| 498 """ | |
| 499 def __init__(self, pid, files, executable, command, initial_cwd, children): | |
| 500 logging.debug('Process(%s, %d, ...)' % (pid, len(files))) | |
| 501 self.pid = pid | |
| 502 self.files = sorted(files, key=lambda x: x.path) | |
| 503 self.children = children | |
| 504 self.executable = executable | |
| 505 self.command = command | |
| 506 self.initial_cwd = initial_cwd | |
| 507 | |
| 508 # Check internal consistency. | |
| 509 assert len(set(f.path for f in self.files)) == len(self.files), sorted( | |
| 510 f.path for f in self.files) | |
| 511 assert isinstance(self.children, list) | |
| 512 assert isinstance(self.files, list) | |
| 513 | |
| 514 @property | |
| 515 def all(self): | |
| 516 for child in self.children: | |
| 517 for i in child.all: | |
| 518 yield i | |
| 519 yield self | |
| 520 | |
| 521 def flatten(self): | |
| 522 return { | |
| 523 'children': [c.flatten() for c in self.children], | |
| 524 'command': self.command, | |
| 525 'executable': self.executable, | |
| 526 'files': [f.flatten() for f in self.files], | |
| 527 'initial_cwd': self.initial_cwd, | |
| 528 'pid': self.pid, | |
| 529 } | |
| 530 | |
| 531 def strip_root(self, root): | |
| 532 assert file_path.isabs(root) and root.endswith(os.path.sep), root | |
| 533 # Loads the files after since they are constructed as objects. | |
| 534 out = self.__class__( | |
| 535 self.pid, | |
| 536 filter(None, (f.strip_root(root) for f in self.files)), | |
| 537 self.executable, | |
| 538 self.command, | |
| 539 self.initial_cwd, | |
| 540 [c.strip_root(root) for c in self.children]) | |
| 541 logging.debug( | |
| 542 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files))) | |
| 543 return out | |
| 544 | |
| 545 def __init__(self, process): | |
| 546 self.process = process | |
| 547 # Cache. | |
| 548 self._files = None | |
| 549 | |
| 550 def flatten(self): | |
| 551 return { | |
| 552 'root': self.process.flatten(), | |
| 553 } | |
| 554 | |
| 555 @property | |
| 556 def files(self): | |
| 557 if self._files is None: | |
| 558 self._files = sorted( | |
| 559 sum((p.files for p in self.process.all), []), | |
| 560 key=lambda x: x.path) | |
| 561 return self._files | |
| 562 | |
| 563 @property | |
| 564 def existent(self): | |
| 565 return [f for f in self.files if f.existent] | |
| 566 | |
| 567 @property | |
| 568 def non_existent(self): | |
| 569 return [f for f in self.files if not f.existent] | |
| 570 | |
| 571 def strip_root(self, root): | |
| 572 """Returns a clone with all the files outside the directory |root| removed | |
| 573     and converts all the paths to relative paths. | |
| 574 | |
| 575 It keeps files accessible through the |root| directory or that have been | |
| 576 accessed through any symlink which points to the same directory. | |
| 577 """ | |
| 578 # Resolve any symlink | |
| 579 root = os.path.realpath(root) | |
| 580 root = ( | |
| 581 file_path.get_native_path_case(root).rstrip(os.path.sep) + os.path.sep) | |
| 582 logging.debug('strip_root(%s)' % root) | |
| 583 return Results(self.process.strip_root(root)) | |
| 584 | |
| 585 | |
| 586 class ApiBase(object): | |
| 587 """OS-agnostic API to trace a process and its children.""" | |
| 588 class Context(object): | |
| 589 """Processes one log line at a time and keeps the list of traced processes. | |
| 590 | |
| 591 The parsing is complicated by the fact that logs are traced out of order for | |
| 592 strace but in-order for dtrace and logman. In addition, on Windows it is | |
| 593     very frequent that process IDs are reused, so a flat list cannot be used. But | |
| 594 at the same time, it is impossible to faithfully construct a graph when the | |
| 595 logs are processed out of order. So both a tree and a flat mapping are used, | |
| 596 the tree is the real process tree, while the flat mapping stores the last | |
| 597     valid process for the corresponding process ID. For the strace case, the | |
| 598 tree's head is guessed at the last moment. | |
| 599 """ | |
| 600 class Process(object): | |
| 601 """Keeps context for one traced child process. | |
| 602 | |
| 603 Logs all the files this process touched. Ignores directories. | |
| 604 """ | |
| 605 def __init__(self, blacklist, pid, initial_cwd): | |
| 606 # Check internal consistency. | |
| 607 assert isinstance(pid, int), repr(pid) | |
| 608 assert_is_renderable(initial_cwd) | |
| 609 self.pid = pid | |
| 610 # children are Process instances. | |
| 611 self.children = [] | |
| 612 self.initial_cwd = initial_cwd | |
| 613 self.cwd = None | |
| 614 self.files = {} | |
| 615 self.executable = None | |
| 616 self.command = None | |
| 617 self._blacklist = blacklist | |
| 618 | |
| 619 def to_results_process(self): | |
| 620         """Resolves file case sensitivity and/or late-bound strings.""" | |
| 621         # When resolving files, it's normal to get duplicates because a file could | |
| 622         # be opened multiple times with different casing. Perform the deduplication | |
| 623 # here. | |
| 624 def fix_path(x): | |
| 625 """Returns the native file path case. | |
| 626 | |
| 627 Converts late-bound strings. | |
| 628 """ | |
| 629 if not x: | |
| 630 # Do not convert None instance to 'None'. | |
| 631 return x | |
| 632 x = render(x) | |
| 633 if os.path.isabs(x): | |
| 634             # The native path case can only be determined for absolute paths. If | |
| 635             # the path is not absolute, which happens occasionally on Windows | |
| 636             # (mostly for the 'executable' value), the value is kept as-is. | |
| 637 x = file_path.get_native_path_case(x) | |
| 638 return x | |
| 639 | |
| 640 def fix_and_blacklist_path(x, m): | |
| 641 """Receives a tuple (filepath, mode) and processes filepath.""" | |
| 642 x = fix_path(x) | |
| 643 if not x: | |
| 644 return | |
| 645 # The blacklist needs to be reapplied, since path casing could | |
| 646 # influence blacklisting. | |
| 647 if self._blacklist(x): | |
| 648 return | |
| 649 # Filters out directories. Some may have passed through. | |
| 650 if os.path.isdir(x): | |
| 651 return | |
| 652 return x, m | |
| 653 | |
| 654 # Renders all the files as strings, as some could be RelativePath | |
| 655 # instances. It is important to do it first since there could still be | |
| 656 # multiple entries with the same path but different modes. | |
| 657 rendered = ( | |
| 658 fix_and_blacklist_path(f, m) for f, m in self.files.iteritems()) | |
| 659 files = sorted( | |
| 660 (f for f in rendered if f), | |
| 661 key=lambda x: (x[0], Results.File.ACCEPTABLE_MODES.index(x[1]))) | |
| 662         # Then converting into a dict will automatically clean up the less | |
| 663 # important values. | |
| 664 files = [ | |
| 665 Results.File(None, f, False, None, m) | |
| 666 for f, m in dict(files).iteritems() | |
| 667 ] | |
| 668 return Results.Process( | |
| 669 self.pid, | |
| 670 files, | |
| 671 fix_path(self.executable), | |
| 672 self.command, | |
| 673 fix_path(self.initial_cwd), | |
| 674 [c.to_results_process() for c in self.children]) | |
| 675 | |
| 676 def add_file(self, filepath, mode): | |
| 677 """Adds a file if it passes the blacklist.""" | |
| 678 if self._blacklist(render(filepath)): | |
| 679 return | |
| 680 logging.debug('add_file(%d, %s, %s)', self.pid, filepath, mode) | |
| 681         # Note that filepath and not render(filepath) is added. This is because | |
| 682         # filepath could be something other than a string, like a RelativePath | |
| 683 # instance for dtrace logs. | |
| 684 modes = Results.File.ACCEPTABLE_MODES | |
| 685 old_mode = self.files.setdefault(filepath, mode) | |
| 686 if old_mode != mode and modes.index(old_mode) < modes.index(mode): | |
| 687 # Take the highest value. | |
| 688 self.files[filepath] = mode | |
| 689 | |
| 690 def __init__(self, blacklist): | |
| 691 self.blacklist = blacklist | |
| 692 # Initial process. | |
| 693 self.root_process = None | |
| 694       # dict to accelerate process lookup, to not have to look up the whole graph | |
| 695 # each time. | |
| 696 self._process_lookup = {} | |
| 697 | |
| 698 class Tracer(object): | |
| 699     """During its lifetime, the tracing subsystem is enabled.""" | |
| 700 def __init__(self, logname): | |
| 701 self._logname = logname | |
| 702 self._lock = threading.RLock() | |
| 703 self._traces = [] | |
| 704 self._initialized = True | |
| 705 self._scripts_to_cleanup = [] | |
| 706 | |
| 707 def trace(self, cmd, cwd, tracename, output): | |
| 708 """Runs the OS-specific trace program on an executable. | |
| 709 | |
| 710 Arguments: | |
| 711 - cmd: The command (a list) to run. | |
| 712 - cwd: Current directory to start the child process in. | |
| 713 - tracename: Name of the trace in the logname file. | |
| 714 - output: If False, redirects output to PIPEs. | |
| 715 | |
| 716 Returns a tuple (resultcode, output) and updates the internal trace | |
| 717 entries. | |
| 718 """ | |
| 719 # The implementation adds an item to self._traces. | |
| 720 raise NotImplementedError(self.__class__.__name__) | |
| 721 | |
| 722 def close(self, _timeout=None): | |
| 723 """Saves the meta-data in the logname file. | |
| 724 | |
| 725 For kernel-based tracing, stops the tracing subsystem. | |
| 726 | |
| 727 Must not be used manually when using 'with' construct. | |
| 728 """ | |
| 729 with self._lock: | |
| 730 if not self._initialized: | |
| 731 raise TracingFailure( | |
| 732               'Called %s.close() on an uninitialized object' % | |
| 733 self.__class__.__name__, | |
| 734 None, None, None) | |
| 735 try: | |
| 736 while self._scripts_to_cleanup: | |
| 737 try: | |
| 738 os.remove(self._scripts_to_cleanup.pop()) | |
| 739 except OSError as e: | |
| 740 logging.error('Failed to delete a temporary script: %s', e) | |
| 741 write_json(self._logname, self._gen_logdata(), False) | |
| 742 finally: | |
| 743 self._initialized = False | |
| 744 | |
| 745 def post_process_log(self): | |
| 746 """Post-processes the log so it becomes faster to load afterward. | |
| 747 | |
| 748 Must not be used manually when using 'with' construct. | |
| 749 """ | |
| 750 assert not self._initialized, 'Must stop tracing first.' | |
| 751 | |
| 752 def _gen_logdata(self): | |
| 753 """Returns the data to be saved in the trace file.""" | |
| 754 return { | |
| 755 'traces': self._traces, | |
| 756 } | |
| 757 | |
| 758 def __enter__(self): | |
| 759 """Enables 'with' statement.""" | |
| 760 return self | |
| 761 | |
| 762 def __exit__(self, exc_type, exc_value, traceback): | |
| 763 """Enables 'with' statement.""" | |
| 764 self.close() | |
| 765 # If an exception was thrown, do not process logs. | |
| 766 if not exc_type: | |
| 767 self.post_process_log() | |
| 768 | |
| 769 def get_tracer(self, logname): | |
| 770 """Returns an ApiBase.Tracer instance. | |
| 771 | |
| 772 Initializes the tracing subsystem, which is a requirement for kernel-based | |
| 773 tracers. Only one tracer instance should be live at a time! | |
| 774 | |
| 775 logname is the filepath to the json file that will contain the meta-data | |
| 776 about the logs. | |
| 777 """ | |
| 778 return self.Tracer(logname) | |
| 779 | |
| 780 @staticmethod | |
| 781 def clean_trace(logname): | |
| 782 """Deletes an old log.""" | |
| 783 raise NotImplementedError() | |
| 784 | |
| 785 @classmethod | |
| 786 def parse_log(cls, logname, blacklist, trace_name): | |
| 787 """Processes trace logs and returns the files opened and the files that do | |
| 788 not exist. | |
| 789 | |
| 790 It does not track directories. | |
| 791 | |
| 792 Arguments: | |
| 793 - logname: must be an absolute path. | |
| 794 - blacklist: must be a lambda. | |
| 795 - trace_name: optional trace to read, defaults to reading all traces. | |
| 796 | |
| 797 Most of the time, files that do not exist are temporary test files that | |
| 798 should be put in /tmp instead. See http://crbug.com/116251. | |
| 799 | |
| 800 Returns a list of dict with keys: | |
| 801 - results: A Results instance. | |
| 802 - trace: The corresponding tracename parameter provided to | |
| 803 get_tracer().trace(). | |
| 804 - output: Output gathered during execution, if get_tracer().trace(..., | |
| 805 output=False) was used. | |
| 806 """ | |
| 807     raise NotImplementedError(cls.__name__) | |
| 808 | |
| 809 | |
| 810 class Strace(ApiBase): | |
| 811 """strace implies linux.""" | |
| 812 @staticmethod | |
| 813 def load_filename(filename): | |
| 814 """Parses a filename in a log.""" | |
| 815 # TODO(maruel): Be compatible with strace -x. | |
| 816 assert isinstance(filename, str) | |
| 817 out = '' | |
| 818 i = 0 | |
| 819 while i < len(filename): | |
| 820 c = filename[i] | |
| 821 if c == '\\': | |
| 822 out += chr(int(filename[i+1:i+4], 8)) | |
| 823 i += 4 | |
| 824 else: | |
| 825 out += c | |
| 826 i += 1 | |
| 827     # TODO(maruel): It's not necessarily true that the current code page is | |
| 828 # utf-8. | |
| 829 return out.decode('utf-8') | |
| 830 | |
| 831 class Context(ApiBase.Context): | |
| 832     """Processes a strace log line and keeps the list of existent and | |
| 833     non-existent files accessed. | |
| 834 | |
| 835 Ignores directories. | |
| 836 | |
| 837     Uses late-binding to process the cwd of each process. The problem is that | |
| 838 strace generates one log file per process it traced but doesn't give any | |
| 839     information about which process was started when and by whom. So process the | |
| 840 logs out of order and use late binding with RelativePath to be able to | |
| 841 deduce the initial directory of each process once all the logs are parsed. | |
| 842 | |
| 843 TODO(maruel): Use the script even in the non-sudo case, so log parsing can | |
| 844     be done in two phases: first find the root process, then process the child | |
| 845 processes in order. With that, it should be possible to not use RelativePath | |
| 846 anymore. This would significantly simplify the code! | |
| 847 """ | |
| 848 class Process(ApiBase.Context.Process): | |
| 849 """Represents the state of a process. | |
| 850 | |
| 851 Contains all the information retrieved from the pid-specific log. | |
| 852 """ | |
| 853 # Function names are using ([a-z_0-9]+) | |
| 854 # This is the most common format. function(args) = result | |
| 855 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.*?)\)\s+= (.+)$') | |
| 856 # An interrupted function call, only grab the minimal header. | |
| 857 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$') | |
| 858 # A resumed function call. | |
| 859 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$') | |
| 860 # A process received a signal. | |
| 861 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---') | |
| 862 # A process didn't handle a signal. Ignore any junk appearing before, | |
| 863 # because the process was forcibly killed so it won't open any new file. | |
| 864 RE_KILLED = re.compile( | |
| 865 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$') | |
| 866 # The process has exited. | |
| 867 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+') | |
| 868 # A call was canceled. Ignore any prefix. | |
| 869 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$') | |
| 870 # Happens when strace fails to even get the function name. | |
| 871 UNNAMED_FUNCTION = '????' | |
| 872 | |
| 873       # Corner case in Python: a class member function decorator must not be | |
| 874 # @staticmethod. | |
| 875 def parse_args(regexp, expect_zero): # pylint: disable=E0213 | |
| 876         """Automatically converts the str 'args' into a list of processed | |
| 877 arguments. | |
| 878 | |
| 879 Arguments: | |
| 880 - regexp is used to parse args. | |
| 881 - expect_zero: one of True, False or None. | |
| 882 - True: will check for result.startswith('0') first and will ignore | |
| 883 the trace line completely otherwise. This is important because for | |
| 884 many functions, the regexp will not process if the call failed. | |
| 885 - False: will check for not result.startswith(('?', '-1')) for the | |
| 886             same reason as with True. | |
| 887 - None: ignore result. | |
| 888 """ | |
| 889 def meta_hook(function): | |
| 890 assert function.__name__.startswith('handle_') | |
| 891 def hook(self, args, result): | |
| 892 if expect_zero is True and not result.startswith('0'): | |
| 893 return | |
| 894 if expect_zero is False and result.startswith(('?', '-1')): | |
| 895 return | |
| 896 match = re.match(regexp, args) | |
| 897 if not match: | |
| 898 raise TracingFailure( | |
| 899 'Failed to parse %s(%s) = %s' % | |
| 900 (function.__name__[len('handle_'):], args, result), | |
| 901 None, None, None) | |
| 902 return function(self, match.groups(), result) | |
| 903 return hook | |
| 904 return meta_hook | |
| 905 | |
| 906 class RelativePath(object): | |
| 907 """A late-bound relative path.""" | |
| 908 def __init__(self, parent, value): | |
| 909 assert_is_renderable(parent) | |
| 910 self.parent = parent | |
| 911 assert ( | |
| 912 value is None or | |
| 913 (isinstance(value, unicode) and not os.path.isabs(value))) | |
| 914 self.value = value | |
| 915 if self.value: | |
| 916 # TODO(maruel): On POSIX, '\\' is a valid character so remove this | |
| 917 # assert. | |
| 918 assert '\\' not in self.value, value | |
| 919 assert '\\' not in self.value, (repr(value), repr(self.value)) | |
| 920 | |
| 921 def render(self): | |
| 922 """Returns the current directory this instance is representing. | |
| 923 | |
| 924 This function is used to return the late-bound value. | |
| 925 """ | |
| 926 assert self.parent is not None | |
| 927 parent = render(self.parent) | |
| 928 if self.value: | |
| 929 return os.path.normpath(os.path.join(parent, self.value)) | |
| 930 return parent | |
| 931 | |
| 932 def __init__(self, root, pid): | |
| 933 """Keeps enough information to be able to guess the original process | |
| 934 root. | |
| 935 | |
| 936 strace doesn't store which process was the initial process. So more | |
| 937 information needs to be kept so the graph can be reconstructed from the | |
| 938 flat map. | |
| 939 """ | |
| 940 logging.info('%s(%d)' % (self.__class__.__name__, pid)) | |
| 941 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None) | |
| 942 assert isinstance(root, ApiBase.Context) | |
| 943 self._root = weakref.ref(root) | |
| 944 # The dict key is the function name of the pending call, like 'open' | |
| 945 # or 'execve'. | |
| 946 self._pending_calls = {} | |
| 947 self._line_number = 0 | |
| 948 # Current directory when the process started. | |
| 949 if isinstance(self._root(), unicode): | |
| 950 self.initial_cwd = self._root() | |
| 951 else: | |
| 952 self.initial_cwd = self.RelativePath(self._root(), None) | |
| 953 self.parentid = None | |
| 954 self._done = False | |
| 955 | |
| 956 def get_cwd(self): | |
| 957 """Returns the best known value of cwd.""" | |
| 958 return self.cwd or self.initial_cwd | |
| 959 | |
| 960 def render(self): | |
| 961 """Returns the string value of the RelativePath() object. | |
| 962 | |
| 963 Used by RelativePath. Returns the initial directory and not the | |
| 964 current one since the current directory 'cwd' validity is time-limited. | |
| 965 | |
| 966 The validity is only guaranteed once all the logs are processed. | |
| 967 """ | |
| 968 return self.initial_cwd.render() | |
| 969 | |
| 970 def on_line(self, line): | |
| 971 assert isinstance(line, str) | |
| 972 self._line_number += 1 | |
| 973 try: | |
| 974 if self._done: | |
| 975 raise TracingFailure( | |
| 976 'Found a trace for a terminated process or corrupted log', | |
| 977 None, None, None) | |
| 978 | |
| 979 if self.RE_SIGNAL.match(line): | |
| 980 # Ignore signals. | |
| 981 return | |
| 982 | |
| 983 match = self.RE_KILLED.match(line) | |
| 984 if match: | |
| 985 # Converts a '+++ killed by Foo +++' trace into an exit_group(). | |
| 986 self.handle_exit_group(match.group(1), None) | |
| 987 return | |
| 988 | |
| 989 match = self.RE_PROCESS_EXITED.match(line) | |
| 990 if match: | |
| 991 # Converts a '+++ exited with 1 +++' trace into an exit_group() | |
| 992 self.handle_exit_group(match.group(1), None) | |
| 993 return | |
| 994 | |
| 995 match = self.RE_UNFINISHED.match(line) | |
| 996 if match: | |
| 997 if match.group(1) in self._pending_calls: | |
| 998 raise TracingFailure( | |
| 999 'Found two unfinished calls for the same function', | |
| 1000 None, None, None, | |
| 1001 self._pending_calls) | |
| 1002 self._pending_calls[match.group(1)] = ( | |
| 1003 match.group(1) + match.group(2)) | |
| 1004 return | |
| 1005 | |
| 1006 match = self.RE_UNAVAILABLE.match(line) | |
| 1007 if match: | |
| 1008 # This usually means a process was killed and a pending call was | |
| 1009 # canceled. | |
| 1010 # TODO(maruel): Look up the last exit_group() trace just above and | |
| 1011 # make sure any self._pending_calls[anything] is properly flushed. | |
| 1012 return | |
| 1013 | |
| 1014 match = self.RE_RESUMED.match(line) | |
| 1015 if match: | |
| 1016 if match.group(1) not in self._pending_calls: | |
| 1017 raise TracingFailure( | |
| 1018 'Found a resumed call that was not logged as unfinished', | |
| 1019 None, None, None, | |
| 1020 self._pending_calls) | |
| 1021 pending = self._pending_calls.pop(match.group(1)) | |
| 1022 # Reconstruct the line. | |
| 1023 line = pending + match.group(2) | |
| 1024 | |
| 1025 match = self.RE_HEADER.match(line) | |
| 1026 if not match: | |
| 1027 # The line is corrupted. It happens occasionally when a process is | |
| 1028 # killed forcibly with activity going on. Assume the process died. | |
| 1029 # No other line can be processed afterward. | |
| 1030 logging.debug('%d is done: %s', self.pid, line) | |
| 1031 self._done = True | |
| 1032 return | |
| 1033 | |
| 1034 if match.group(1) == self.UNNAMED_FUNCTION: | |
| 1035 return | |
| 1036 | |
| 1037 # It's a valid line, handle it. | |
| 1038 handler = getattr(self, 'handle_%s' % match.group(1), None) | |
| 1039 if not handler: | |
| 1040 self._handle_unknown(match.group(1), match.group(2), match.group(3)) | |
| 1041 return handler(match.group(2), match.group(3)) | |
| 1042 except TracingFailure, e: | |
| 1043 # Hack in the values since the handler could be a static function. | |
| 1044 e.pid = self.pid | |
| 1045 e.line = line | |
| 1046 e.line_number = self._line_number | |
| 1047 # Re-raise the modified exception. | |
| 1048 raise | |
| 1049 except (KeyError, NotImplementedError, ValueError), e: | |
| 1050 raise TracingFailure( | |
| 1051 'Trace generated a %s exception: %s' % ( | |
| 1052 e.__class__.__name__, str(e)), | |
| 1053 self.pid, | |
| 1054 self._line_number, | |
| 1055 line, | |
| 1056 e) | |
| 1057 | |
| 1058 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True) | |
| 1059 def handle_access(self, args, _result): | |
| 1060 self._handle_file(args[0], Results.File.TOUCHED) | |
| 1061 | |
| 1062 @parse_args(r'^\"(.+?)\"$', True) | |
| 1063 def handle_chdir(self, args, _result): | |
| 1064 """Updates cwd.""" | |
| 1065 self.cwd = self._mangle(args[0]) | |
| 1066 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd)) | |
| 1067 | |
| 1068 @parse_args(r'^\"(.+?)\", (\d+), (\d+)$', False) | |
| 1069 def handle_chown(self, args, _result): | |
| 1070 # TODO(maruel): Look at result? | |
| 1071 self._handle_file(args[0], Results.File.WRITE) | |
| 1072 | |
| 1073 def handle_clone(self, _args, result): | |
| 1074 self._handling_forking('clone', result) | |
| 1075 | |
| 1076 def handle_close(self, _args, _result): | |
| 1077 pass | |
| 1078 | |
| 1079 @parse_args(r'^\"(.+?)\", (\d+)$', False) | |
| 1080 def handle_chmod(self, args, _result): | |
| 1081 self._handle_file(args[0], Results.File.WRITE) | |
| 1082 | |
| 1083 @parse_args(r'^\"(.+?)\", (\d+)$', False) | |
| 1084 def handle_creat(self, args, _result): | |
| 1085 self._handle_file(args[0], Results.File.WRITE) | |
| 1086 | |
| 1087 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True) | |
| 1088 def handle_execve(self, args, _result): | |
| 1089         # Even if in practice execve() doesn't return when it succeeds, strace | |
| 1090 # still prints '0' as the result. | |
| 1091 filepath = args[0] | |
| 1092 self._handle_file(filepath, Results.File.READ) | |
| 1093 self.executable = self._mangle(filepath) | |
| 1094 try: | |
| 1095 self.command = strace_process_quoted_arguments(args[1]) | |
| 1096 except ValueError as e: | |
| 1097 raise TracingFailure( | |
| 1098 'Failed to process command line argument:\n%s' % e.args[0], | |
| 1099 None, None, None) | |
| 1100 | |
| 1101 def handle_exit_group(self, _args, _result): | |
| 1102 """Removes cwd.""" | |
| 1103 self.cwd = None | |
| 1104 | |
| 1105 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', True) | |
| 1106 def handle_faccessat(self, args, _results): | |
| 1107 if args[0] == 'AT_FDCWD': | |
| 1108 self._handle_file(args[1], Results.File.TOUCHED) | |
| 1109 else: | |
| 1110 raise Exception('Relative faccess not implemented.') | |
| 1111 | |
| 1112 def handle_fallocate(self, _args, result): | |
| 1113 pass | |
| 1114 | |
| 1115 def handle_fork(self, args, result): | |
| 1116 self._handle_unknown('fork', args, result) | |
| 1117 | |
| 1118 def handle_futex(self, _args, _result): | |
| 1119 pass | |
| 1120 | |
| 1121 @parse_args(r'^\"(.+?)\", (\d+)$', False) | |
| 1122 def handle_getcwd(self, args, _result): | |
| 1123 if os.path.isabs(args[0]): | |
| 1124           logging.debug('handle_getcwd(%d, %s)' % (self.pid, self.cwd)) | |
| 1125 if not isinstance(self.cwd, unicode): | |
| 1126 # Take the occasion to reset the path. | |
| 1127 self.cwd = self._mangle(args[0]) | |
| 1128 else: | |
| 1129 # It should always match. | |
| 1130 assert self.cwd == Strace.load_filename(args[0]), ( | |
| 1131 self.cwd, args[0]) | |
| 1132 | |
| 1133 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
| 1134 def handle_link(self, args, _result): | |
| 1135 self._handle_file(args[0], Results.File.READ) | |
| 1136 self._handle_file(args[1], Results.File.WRITE) | |
| 1137 | |
| 1138 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
| 1139 def handle_lstat(self, args, _result): | |
| 1140 self._handle_file(args[0], Results.File.TOUCHED) | |
| 1141 | |
| 1142 def handle_mkdir(self, _args, _result): | |
| 1143 # We track content, not directories. | |
| 1144 pass | |
| 1145 | |
| 1146 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
| 1147 def handle_open(self, args, _result): | |
| 1148 if 'O_DIRECTORY' in args[1]: | |
| 1149 return | |
| 1150 self._handle_file( | |
| 1151 args[0], | |
| 1152 Results.File.READ if 'O_RDONLY' in args[1] else Results.File.WRITE) | |
| 1153 | |
| 1154 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
| 1155 def handle_openat(self, args, _result): | |
| 1156 if 'O_DIRECTORY' in args[2]: | |
| 1157 return | |
| 1158 if args[0] == 'AT_FDCWD': | |
| 1159 self._handle_file( | |
| 1160 args[1], | |
| 1161 Results.File.READ if 'O_RDONLY' in args[2] | |
| 1162 else Results.File.WRITE) | |
| 1163 else: | |
| 1164 # TODO(maruel): Implement relative open if necessary instead of the | |
| 1165           # AT_FDCWD flag; hopefully not, since that would mean tracking all active | |
| 1166 # directory handles. | |
| 1167 raise NotImplementedError('Relative open via openat not implemented.') | |
| 1168 | |
| 1169 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False) | |
| 1170 def handle_readlink(self, args, _result): | |
| 1171 self._handle_file(args[0], Results.File.READ) | |
| 1172 | |
| 1173 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
| 1174 def handle_rename(self, args, _result): | |
| 1175 self._handle_file(args[0], Results.File.READ) | |
| 1176 self._handle_file(args[1], Results.File.WRITE) | |
| 1177 | |
| 1178 def handle_rmdir(self, _args, _result): | |
| 1179 # We track content, not directories. | |
| 1180 pass | |
| 1181 | |
| 1182 def handle_setxattr(self, _args, _result): | |
| 1183 # TODO(maruel): self._handle_file(args[0], Results.File.WRITE) | |
| 1184 pass | |
| 1185 | |
| 1186 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
| 1187 def handle_stat(self, args, _result): | |
| 1188 self._handle_file(args[0], Results.File.TOUCHED) | |
| 1189 | |
| 1190 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
| 1191 def handle_symlink(self, args, _result): | |
| 1192 self._handle_file(args[0], Results.File.TOUCHED) | |
| 1193 self._handle_file(args[1], Results.File.WRITE) | |
| 1194 | |
| 1195 @parse_args(r'^\"(.+?)\", \d+', True) | |
| 1196 def handle_truncate(self, args, _result): | |
| 1197 self._handle_file(args[0], Results.File.WRITE) | |
| 1198 | |
| 1199 def handle_unlink(self, _args, _result): | |
| 1200 # In theory, the file had to be created anyway. | |
| 1201 pass | |
| 1202 | |
| 1203 def handle_unlinkat(self, _args, _result): | |
| 1204 # In theory, the file had to be created anyway. | |
| 1205 pass | |
| 1206 | |
| 1207 def handle_statfs(self, _args, _result): | |
| 1208 pass | |
| 1209 | |
| 1210 def handle_utimensat(self, _args, _result): | |
| 1211 pass | |
| 1212 | |
| 1213 def handle_vfork(self, _args, result): | |
| 1214 self._handling_forking('vfork', result) | |
| 1215 | |
| 1216 @staticmethod | |
| 1217 def _handle_unknown(function, args, result): | |
| 1218 raise TracingFailure( | |
| 1219 'Unexpected/unimplemented trace %s(%s)= %s' % | |
| 1220 (function, args, result), | |
| 1221 None, None, None) | |
| 1222 | |
| 1223 def _handling_forking(self, name, result): | |
| 1224 """Transfers cwd.""" | |
| 1225 if result.startswith(('?', '-1')): | |
| 1226 # The call failed. | |
| 1227 return | |
| 1228 # Update the other process right away. | |
| 1229 childpid = int(result) | |
| 1230 child = self._root().get_or_set_proc(childpid) | |
| 1231 if child.parentid is not None or childpid in self.children: | |
| 1232 raise TracingFailure( | |
| 1233               'Found internal inconsistency in process lifetime detection ' | |
| 1234 'during a %s() call' % name, | |
| 1235 None, None, None) | |
| 1236 | |
| 1237 # Copy the cwd object. | |
| 1238 child.initial_cwd = self.get_cwd() | |
| 1239 child.parentid = self.pid | |
| 1240 # It is necessary because the logs are processed out of order. | |
| 1241 self.children.append(child) | |
| 1242 | |
| 1243 def _handle_file(self, filepath, mode): | |
| 1244 filepath = self._mangle(filepath) | |
| 1245 self.add_file(filepath, mode) | |
| 1246 | |
| 1247 def _mangle(self, filepath): | |
| 1248         """Decodes a filepath found in the log and converts it to a late-bound | |
| 1249 path if necessary. | |
| 1250 | |
| 1251         |filepath| is a strace 'encoded' string and the returned value is | |
| 1252         either a unicode string if the path was absolute or a late-bound path | |
| 1253 otherwise. | |
| 1254 """ | |
| 1255 filepath = Strace.load_filename(filepath) | |
| 1256 if os.path.isabs(filepath): | |
| 1257 return filepath | |
| 1258 else: | |
| 1259 if isinstance(self.get_cwd(), unicode): | |
| 1260 return os.path.normpath(os.path.join(self.get_cwd(), filepath)) | |
| 1261 return self.RelativePath(self.get_cwd(), filepath) | |
| 1262 | |
| 1263 def __init__(self, blacklist, root_pid, initial_cwd): | |
| 1264 """|root_pid| may be None when the root process is not known. | |
| 1265 | |
| 1266 In that case, a search is done after reading all the logs to figure out | |
| 1267 the root process. | |
| 1268 """ | |
| 1269 super(Strace.Context, self).__init__(blacklist) | |
| 1270 assert_is_renderable(initial_cwd) | |
| 1271 self.root_pid = root_pid | |
| 1272 self.initial_cwd = initial_cwd | |
| 1273 | |
| 1274 def render(self): | |
| 1275 """Returns the string value of the initial cwd of the root process. | |
| 1276 | |
| 1277 Used by RelativePath. | |
| 1278 """ | |
| 1279 return self.initial_cwd | |
| 1280 | |
| 1281 def on_line(self, pid, line): | |
| 1282 """Transfers control into the Process.on_line() function.""" | |
| 1283 self.get_or_set_proc(pid).on_line(line.strip()) | |
| 1284 | |
| 1285 def to_results(self): | |
| 1286       """If necessary, finds the root process and verifies consistency.""" | |
| 1287 if not self.root_pid: | |
| 1288 # The non-sudo case. The traced process was started by strace itself, | |
| 1289 # so the pid of the traced process is not known. | |
| 1290 root = [p for p in self._process_lookup.itervalues() if not p.parentid] | |
| 1291 if len(root) == 1: | |
| 1292 self.root_process = root[0] | |
| 1293 # Save it for later. | |
| 1294 self.root_pid = self.root_process.pid | |
| 1295 else: | |
| 1296 # The sudo case. The traced process was started manually so its pid is | |
| 1297 # known. | |
| 1298 self.root_process = self._process_lookup.get(self.root_pid) | |
| 1299 if not self.root_process: | |
| 1300 raise TracingFailure( | |
| 1301             'Found internal inconsistency in process lifetime detection ' | |
| 1302 'while finding the root process', | |
| 1303 None, | |
| 1304 None, | |
| 1305 None, | |
| 1306 self.root_pid, | |
| 1307 sorted(self._process_lookup)) | |
| 1308 process = self.root_process.to_results_process() | |
| 1309 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
| 1310 raise TracingFailure( | |
| 1311             'Found internal inconsistency in process lifetime detection ' | |
| 1312 'while looking for len(tree) == len(list)', | |
| 1313 None, | |
| 1314 None, | |
| 1315 None, | |
| 1316 sorted(self._process_lookup), | |
| 1317 sorted(p.pid for p in process.all)) | |
| 1318 return Results(process) | |
| 1319 | |
| 1320 def get_or_set_proc(self, pid): | |
| 1321 """Returns the Context.Process instance for this pid or creates a new one. | |
| 1322 """ | |
| 1323 if not pid or not isinstance(pid, int): | |
| 1324 raise TracingFailure( | |
| 1325             'Unexpected value for pid: %r' % pid, | |
| 1326 pid, | |
| 1327 None, | |
| 1328 None, | |
| 1329 pid) | |
| 1330 if pid not in self._process_lookup: | |
| 1331 self._process_lookup[pid] = self.Process(self, pid) | |
| 1332 return self._process_lookup[pid] | |
| 1333 | |
| 1334 @classmethod | |
| 1335 def traces(cls): | |
| 1336       """Returns the list of all handled traces, to be passed as an argument to | |
| 1337 strace. | |
| 1338 """ | |
| 1339 prefix = 'handle_' | |
| 1340 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)] | |
| 1341 | |
| 1342 class Tracer(ApiBase.Tracer): | |
| 1343 MAX_LEN = 256 | |
| 1344 | |
| 1345 def __init__(self, logname, use_sudo): | |
| 1346 super(Strace.Tracer, self).__init__(logname) | |
| 1347 self.use_sudo = use_sudo | |
| 1348 if use_sudo: | |
| 1349 # TODO(maruel): Use the jump script systematically to make it easy to | |
| 1350 # figure out the root process, so RelativePath is not necessary anymore. | |
| 1351 self._child_script = create_exec_thunk() | |
| 1352 self._scripts_to_cleanup.append(self._child_script) | |
| 1353 | |
| 1354 def trace(self, cmd, cwd, tracename, output): | |
| 1355 """Runs strace on an executable. | |
| 1356 | |
| 1357       When use_sudo=True, it is a 3-phase process: start the thunk, start | |
| 1358 sudo strace with the pid of the thunk and then have the thunk os.execve() | |
| 1359 the process to trace. | |
| 1360 """ | |
| 1361 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
| 1362 assert os.path.isabs(cmd[0]), cmd[0] | |
| 1363 assert os.path.isabs(cwd), cwd | |
| 1364 assert os.path.normpath(cwd) == cwd, cwd | |
| 1365 with self._lock: | |
| 1366 if not self._initialized: | |
| 1367 raise TracingFailure( | |
| 1368             'Called Tracer.trace() on an uninitialized object', | |
| 1369 None, None, None, tracename) | |
| 1370 assert tracename not in (i['trace'] for i in self._traces) | |
| 1371 stdout = stderr = None | |
| 1372 if output: | |
| 1373 stdout = subprocess.PIPE | |
| 1374 stderr = subprocess.STDOUT | |
| 1375 | |
| 1376 # Ensure all file related APIs are hooked. | |
| 1377 traces = ','.join(Strace.Context.traces() + ['file']) | |
| 1378 flags = [ | |
| 1379 # Each child process has its own trace file. It is necessary because | |
| 1380 # strace may generate corrupted log file if multiple processes are | |
| 1381 # heavily doing syscalls simultaneously. | |
| 1382 '-ff', | |
| 1383 # Reduce whitespace usage. | |
| 1384 '-a1', | |
| 1385 # hex encode non-ascii strings. | |
| 1386 # TODO(maruel): '-x', | |
| 1387 # TODO(maruel): '-ttt', | |
| 1388         # Signals are unnecessary noise here. The parser can cope with them, but | |
| 1389         # filtering them out reduces the output. | |
| 1390 '-esignal=none', | |
| 1391 # Print as much data as wanted. | |
| 1392 '-s', '%d' % self.MAX_LEN, | |
| 1393 '-e', 'trace=%s' % traces, | |
| 1394 '-o', self._logname + '.' + tracename, | |
| 1395 ] | |
| 1396 | |
| 1397 if self.use_sudo: | |
| 1398 pipe_r, pipe_w = os.pipe() | |
| 1399 # Start the child process paused. | |
| 1400 target_cmd = [sys.executable, self._child_script, str(pipe_r)] + cmd | |
| 1401 logging.debug(' '.join(target_cmd)) | |
| 1402 child_proc = subprocess.Popen( | |
| 1403 target_cmd, | |
| 1404 stdin=subprocess.PIPE, | |
| 1405 stdout=stdout, | |
| 1406 stderr=stderr, | |
| 1407 cwd=cwd) | |
| 1408 | |
| 1409 # TODO(maruel): both processes must use the same UID for it to work | |
| 1410         # without sudo. Investigate why -p fails at the moment without sudo. | |
| 1411 trace_cmd = [ | |
| 1412 'sudo', | |
| 1413 'strace', | |
| 1414 '-p', str(child_proc.pid), | |
| 1415 ] + flags | |
| 1416 logging.debug(' '.join(trace_cmd)) | |
| 1417 strace_proc = subprocess.Popen( | |
| 1418 trace_cmd, | |
| 1419 cwd=cwd, | |
| 1420 stdin=subprocess.PIPE, | |
| 1421 stdout=subprocess.PIPE, | |
| 1422 stderr=subprocess.PIPE) | |
| 1423 | |
| 1424 line = strace_proc.stderr.readline() | |
| 1425 if not re.match(r'^Process \d+ attached \- interrupt to quit$', line): | |
| 1426 # TODO(maruel): Raise an exception. | |
| 1427 assert False, line | |
| 1428 | |
| 1429 # Now fire the child process. | |
| 1430 os.write(pipe_w, 'x') | |
| 1431 | |
| 1432 out = child_proc.communicate()[0] | |
| 1433 strace_out = strace_proc.communicate()[0] | |
| 1434 | |
| 1435 # TODO(maruel): if strace_proc.returncode: Add an exception. | |
| 1436 saved_out = strace_out if strace_proc.returncode else out | |
| 1437 root_pid = child_proc.pid | |
| 1438 else: | |
| 1439 # Non-sudo case. | |
| 1440 trace_cmd = [ | |
| 1441 'strace', | |
| 1442 ] + flags + cmd | |
| 1443 logging.debug(' '.join(trace_cmd)) | |
| 1444 child_proc = subprocess.Popen( | |
| 1445 trace_cmd, | |
| 1446 cwd=cwd, | |
| 1447 stdin=subprocess.PIPE, | |
| 1448 stdout=stdout, | |
| 1449 stderr=stderr) | |
| 1450 out = child_proc.communicate()[0] | |
| 1451         # TODO(maruel): Walking the logs to figure out the root process would | |
| 1452 # simplify parsing the logs a *lot*. | |
| 1453 saved_out = out | |
| 1454         # The trace reader will have to figure it out. | |
| 1455 root_pid = None | |
| 1456 | |
| 1457 with self._lock: | |
| 1458 assert tracename not in (i['trace'] for i in self._traces) | |
| 1459 self._traces.append( | |
| 1460 { | |
| 1461 'cmd': cmd, | |
| 1462 'cwd': cwd, | |
| 1463 'output': saved_out, | |
| 1464 'pid': root_pid, | |
| 1465 'trace': tracename, | |
| 1466 }) | |
| 1467 return child_proc.returncode, out | |
| 1468 | |
| 1469 def __init__(self, use_sudo=None): | |
| 1470 super(Strace, self).__init__() | |
| 1471 self.use_sudo = use_sudo | |
| 1472 | |
| 1473 def get_tracer(self, logname): | |
| 1474 return self.Tracer(logname, self.use_sudo) | |
| 1475 | |
| 1476 @staticmethod | |
| 1477 def clean_trace(logname): | |
| 1478 if os.path.isfile(logname): | |
| 1479 os.remove(logname) | |
| 1480 # Also delete any pid specific file from previous traces. | |
| 1481 for i in glob.iglob(logname + '.*'): | |
| 1482 if i.rsplit('.', 1)[1].isdigit(): | |
| 1483 os.remove(i) | |
| 1484 | |
| 1485 @classmethod | |
| 1486 def parse_log(cls, logname, blacklist, trace_name): | |
| 1487 logging.info('parse_log(%s, ..., %s)', logname, trace_name) | |
| 1488 assert os.path.isabs(logname) | |
| 1489 data = read_json(logname) | |
| 1490 out = [] | |
| 1491 for item in data['traces']: | |
| 1492 if trace_name and item['trace'] != trace_name: | |
| 1493 continue | |
| 1494 result = { | |
| 1495 'output': item['output'], | |
| 1496 'trace': item['trace'], | |
| 1497 } | |
| 1498 try: | |
| 1499 context = cls.Context(blacklist, item['pid'], item['cwd']) | |
| 1500 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])): | |
| 1501 logging.debug('Reading %s', pidfile) | |
| 1502 pid = pidfile.rsplit('.', 1)[1] | |
| 1503 if pid.isdigit(): | |
| 1504 pid = int(pid) | |
| 1505 found_line = False | |
| 1506 for line in open(pidfile, 'rb'): | |
| 1507 context.on_line(pid, line) | |
| 1508 found_line = True | |
| 1509 if not found_line: | |
| 1510 # Ensures that a completely empty trace still creates the | |
| 1511 # corresponding Process instance by logging a dummy line. | |
| 1512 context.on_line(pid, '') | |
| 1513 else: | |
| 1514 logging.warning('Found unexpected file %s', pidfile) | |
| 1515 result['results'] = context.to_results() | |
| 1516 except TracingFailure: | |
| 1517 result['exception'] = sys.exc_info() | |
| 1518 out.append(result) | |
| 1519 return out | |
| 1520 | |
| 1521 | |
| 1522 class Dtrace(ApiBase): | |
| 1523 """Uses DTrace framework through dtrace. Requires root access. | |
| 1524 | |
| 1525 Implies Mac OSX. | |
| 1526 | |
| 1527 dtruss can't be used because it has compatibility issues with Python. | |
| 1528 | |
| 1529 Also, the pid->cwd handling needs to be done manually since OSX has no way to | |
| 1530 get the absolute path of the 'cwd' dtrace variable from the probe. | |
| 1531 | |
| 1532 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see | |
| 1533 https://discussions.apple.com/thread/1980539. So resort to handling execve() | |
| 1534 manually. | |
| 1535 | |
| 1536 errno is not printed in the log since this implementation currently only cares | |
| 1537 about files that were successfully opened. | |
| 1538 """ | |
| 1539 class Context(ApiBase.Context): | |
| 1540 # Format: index pid function(args) | |
| 1541 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$') | |
| 1542 | |
| 1543 # Arguments parsing. | |
| 1544 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$') | |
| 1545 RE_CHDIR = re.compile(r'^\"(.+?)\"$') | |
| 1546 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$') | |
| 1547 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$') | |
| 1548 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$') | |
| 1549 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$') | |
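| # Illustrative log lines matched by these patterns (format: index pid fn(args)): | |
| #   12 345 open("/usr/lib/libSystem.B.dylib", 0x0, 0x1b6) | |
| #   13 345 chdir("/tmp") | |
| # RE_HEADER extracts pid and the function name; the RE_* above parse the args. | |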
| 1550 | |
| 1551 # O_DIRECTORY is not defined on Windows and dtrace doesn't exist on Windows. | |
| 1552 O_DIRECTORY = os.O_DIRECTORY if hasattr(os, 'O_DIRECTORY') else None | |
| 1553 O_RDWR = os.O_RDWR | |
| 1554 O_WRONLY = os.O_WRONLY | |
| 1555 | |
| 1556 class Process(ApiBase.Context.Process): | |
| 1557 def __init__(self, *args): | |
| 1558 super(Dtrace.Context.Process, self).__init__(*args) | |
| 1559 self.cwd = self.initial_cwd | |
| 1560 | |
| 1561 def __init__(self, blacklist, thunk_pid, initial_cwd): | |
| 1562 logging.info( | |
| 1563 '%s(%d, %s)' % (self.__class__.__name__, thunk_pid, initial_cwd)) | |
| 1564 super(Dtrace.Context, self).__init__(blacklist) | |
| 1565 assert isinstance(initial_cwd, unicode), initial_cwd | |
| 1566 # Process ID of the temporary script created by create_subprocess_thunk(). | |
| 1567 self._thunk_pid = thunk_pid | |
| 1568 self._initial_cwd = initial_cwd | |
| 1569 self._line_number = 0 | |
| 1570 | |
| 1571 def on_line(self, line): | |
| 1572 assert isinstance(line, unicode), line | |
| 1573 self._line_number += 1 | |
| 1574 match = self.RE_HEADER.match(line) | |
| 1575 if not match: | |
| 1576 raise TracingFailure( | |
| 1577 'Found malformed line: %s' % line, | |
| 1578 None, | |
| 1579 self._line_number, | |
| 1580 line) | |
| 1581 fn = getattr( | |
| 1582 self, | |
| 1583 'handle_%s' % match.group(2).replace('-', '_'), | |
| 1584 self._handle_ignored) | |
| 1585 # It is guaranteed to succeed because of the regexp. Or at least I thought | |
| 1586 # it would. | |
| 1587 pid = int(match.group(1)) | |
| 1588 try: | |
| 1589 return fn(pid, match.group(3)) | |
| 1590 except TracingFailure, e: | |
| 1591 # Hack in the values since the handler could be a static function. | |
| 1592 e.pid = pid | |
| 1593 e.line = line | |
| 1594 e.line_number = self._line_number | |
| 1595 # Re-raise the modified exception. | |
| 1596 raise | |
| 1597 except (KeyError, NotImplementedError, ValueError), e: | |
| 1598 raise TracingFailure( | |
| 1599 'Trace generated a %s exception: %s' % ( | |
| 1600 e.__class__.__name__, str(e)), | |
| 1601 pid, | |
| 1602 self._line_number, | |
| 1603 line, | |
| 1604 e) | |
| 1605 | |
| 1606 def to_results(self): | |
| 1607 process = self.root_process.to_results_process() | |
| 1608 # Internal consistency check. | |
| 1609 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
| 1610 raise TracingFailure( | |
| 1611 'Found internal inconsistency in process lifetime detection ' | |
| 1612 'while looking for len(tree) == len(list)', | |
| 1613 None, | |
| 1614 None, | |
| 1615 None, | |
| 1616 sorted(self._process_lookup), | |
| 1617 sorted(p.pid for p in process.all)) | |
| 1618 return Results(process) | |
| 1619 | |
| 1620 def handle_dtrace_BEGIN(self, _pid, args): | |
| 1621 if not self.RE_DTRACE_BEGIN.match(args): | |
| 1622 raise TracingFailure( | |
| 1623 'Found internal inconsistency in dtrace_BEGIN log line', | |
| 1624 None, None, None) | |
| 1625 | |
| 1626 def handle_proc_start(self, pid, args): | |
| 1627 """Transfers cwd. | |
| 1628 | |
| 1629 The dtrace script already takes care of only tracing the processes that | |
| 1630 are child of the traced processes so there is no need to verify the | |
| 1631 process hierarchy. | |
| 1632 """ | |
| 1633 if pid in self._process_lookup: | |
| 1634 raise TracingFailure( | |
| 1635 'Found internal inconsistency in proc_start: %d started twice' % | |
| 1636 pid, | |
| 1637 None, None, None) | |
| 1638 match = self.RE_PROC_START.match(args) | |
| 1639 if not match: | |
| 1640 raise TracingFailure( | |
| 1641 'Failed to parse arguments: %s' % args, | |
| 1642 None, None, None) | |
| 1643 ppid = int(match.group(1)) | |
| 1644 if ppid == self._thunk_pid and not self.root_process: | |
| 1645 proc = self.root_process = self.Process( | |
| 1646 self.blacklist, pid, self._initial_cwd) | |
| 1647 elif ppid in self._process_lookup: | |
| 1648 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd) | |
| 1649 self._process_lookup[ppid].children.append(proc) | |
| 1650 else: | |
| 1651 # Another process tree, ignore. | |
| 1652 return | |
| 1653 self._process_lookup[pid] = proc | |
| 1654 logging.debug( | |
| 1655 'New child: %s -> %d cwd:%s' % | |
| 1656 (ppid, pid, render(proc.initial_cwd))) | |
| 1657 | |
| 1658 def handle_proc_exit(self, pid, _args): | |
| 1659 """Removes cwd.""" | |
| 1660 if pid in self._process_lookup: | |
| 1661 # self._thunk_pid is not traced itself, and neither are other trace runs. | |
| 1662 self._process_lookup[pid].cwd = None | |
| 1663 | |
| 1664 def handle_execve(self, pid, args): | |
| 1665 """Sets the process' executable. | |
| 1666 | |
| 1667 TODO(maruel): Read command line arguments. See | |
| 1668 https://discussions.apple.com/thread/1980539 for an example. | |
| 1669 https://gist.github.com/1242279 | |
| 1670 | |
| 1671 Will have to put the answer at http://stackoverflow.com/questions/7556249. | |
| 1672 :) | |
| 1673 """ | |
| 1674 if pid not in self._process_lookup: | |
| 1675 # Another process tree, ignore. | |
| 1676 return | |
| 1677 match = self.RE_EXECVE.match(args) | |
| 1678 if not match: | |
| 1679 raise TracingFailure( | |
| 1680 'Failed to parse arguments: %r' % args, | |
| 1681 None, None, None) | |
| 1682 proc = self._process_lookup[pid] | |
| 1683 proc.executable = match.group(1) | |
| 1684 self._handle_file(pid, proc.executable, Results.File.READ) | |
| 1685 proc.command = self.process_escaped_arguments(match.group(3)) | |
| 1686 if int(match.group(2)) != len(proc.command): | |
| 1687 raise TracingFailure( | |
| 1688 'Failed to parse execve() arguments: %s' % args, | |
| 1689 None, None, None) | |
| 1690 | |
| 1691 def handle_chdir(self, pid, args): | |
| 1692 """Updates cwd.""" | |
| 1693 if pid not in self._process_lookup: | |
| 1694 # Another process tree, ignore. | |
| 1695 return | |
| 1696 cwd = self.RE_CHDIR.match(args).group(1) | |
| 1697 if not cwd.startswith('/'): | |
| 1698 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd) | |
| 1699 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2)) | |
| 1700 else: | |
| 1701 logging.debug('handle_chdir(%d, %s)' % (pid, cwd)) | |
| 1702 cwd2 = cwd | |
| 1703 self._process_lookup[pid].cwd = cwd2 | |
| 1704 | |
| 1705 def handle_open_nocancel(self, pid, args): | |
| 1706 """Redirects to handle_open().""" | |
| 1707 return self.handle_open(pid, args) | |
| 1708 | |
| 1709 def handle_open(self, pid, args): | |
| 1710 if pid not in self._process_lookup: | |
| 1711 # Another process tree, ignore. | |
| 1712 return | |
| 1713 match = self.RE_OPEN.match(args) | |
| 1714 if not match: | |
| 1715 raise TracingFailure( | |
| 1716 'Failed to parse arguments: %s' % args, | |
| 1717 None, None, None) | |
| 1718 flag = int(match.group(2), 16) | |
| 1719 if self.O_DIRECTORY & flag == self.O_DIRECTORY: | |
| 1720 # Ignore directories. | |
| 1721 return | |
| 1722 self._handle_file( | |
| 1723 pid, | |
| 1724 match.group(1), | |
| 1725 Results.File.READ if not ((self.O_RDWR | self.O_WRONLY) & flag) | |
| 1726 else Results.File.WRITE) | |
| 1727 | |
| 1728 def handle_rename(self, pid, args): | |
| 1729 if pid not in self._process_lookup: | |
| 1730 # Another process tree, ignore. | |
| 1731 return | |
| 1732 match = self.RE_RENAME.match(args) | |
| 1733 if not match: | |
| 1734 raise TracingFailure( | |
| 1735 'Failed to parse arguments: %s' % args, | |
| 1736 None, None, None) | |
| 1737 self._handle_file(pid, match.group(1), Results.File.READ) | |
| 1738 self._handle_file(pid, match.group(2), Results.File.WRITE) | |
| 1739 | |
| 1740 def _handle_file(self, pid, filepath, mode): | |
| 1741 if not filepath.startswith('/'): | |
| 1742 filepath = os.path.join(self._process_lookup[pid].cwd, filepath) | |
| 1743 # We can get '..' in the path. | |
| 1744 filepath = os.path.normpath(filepath) | |
| 1745 # Sadly, still need to filter out directories here; | |
| 1746 # saw open_nocancel(".", 0, 0) = 0 lines. | |
| 1747 if os.path.isdir(filepath): | |
| 1748 return | |
| 1749 self._process_lookup[pid].add_file(filepath, mode) | |
| 1750 | |
| 1751 def handle_ftruncate(self, pid, args): | |
| 1752 """Just used as a signal to kill dtrace, ignoring.""" | |
| 1753 pass | |
| 1754 | |
| 1755 @staticmethod | |
| 1756 def _handle_ignored(pid, args): | |
| 1757 """Is called for all the event traces that are not handled.""" | |
| 1758 raise NotImplementedError('Please implement me') | |
| 1759 | |
| 1760 @staticmethod | |
| 1761 def process_escaped_arguments(text): | |
| 1762 """Extracts escaped arguments on a string and return the arguments as a | |
| 1763 list. | |
| 1764 | |
| 1765 Implemented as an automaton. | |
| 1766 | |
| 1767 Example: | |
| 1768 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the | |
| 1769 function will return ['python2.7', '-c', 'print("hi")'] | |
| 1770 """ | |
| 1771 if not text.endswith('\\0'): | |
| 1772 raise ValueError('String is not null terminated: %r' % text, text) | |
| 1773 text = text[:-2] | |
| 1774 | |
| 1775 def unescape(x): | |
| 1776 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?.""" | |
| 1777 out = [] | |
| 1778 escaped = False | |
| 1779 for i in x: | |
| 1780 if i == '\\' and not escaped: | |
| 1781 escaped = True | |
| 1782 continue | |
| 1783 escaped = False | |
| 1784 out.append(i) | |
| 1785 return ''.join(out) | |
| 1786 | |
| 1787 return [unescape(i) for i in text.split('\\001')] | |
| 1788 | |
| 1789 class Tracer(ApiBase.Tracer): | |
| 1790 # pylint: disable=C0301 | |
| 1791 # | |
| 1792 # To understand the following code, you'll want to take a look at: | |
| 1793 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html | |
| 1794 # https://wikis.oracle.com/display/DTrace/Variables | |
| 1795 # http://docs.oracle.com/cd/E19205-01/820-4221/ | |
| 1796 # | |
| 1797 # 0. Dump all the valid probes into a text file. It is important to | |
| 1798 # redirect into a file so that you don't have to constantly 'sudo'. | |
| 1799 # $ sudo dtrace -l > probes.txt | |
| 1800 # | |
| 1801 # 1. Count the number of probes: | |
| 1802 # $ wc -l probes.txt | |
| 1803 # 81823 # On OSX 10.7, including 1 header line. | |
| 1804 # | |
| 1805 # 2. List providers, intentionally skipping all the 'syspolicy10925' and | |
| 1806 # the like, and skipping the header with NR>1: | |
| 1807 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]' | |
| 1808 # dtrace | |
| 1809 # fbt | |
| 1810 # io | |
| 1811 # ip | |
| 1812 # lockstat | |
| 1813 # mach_trap | |
| 1814 # proc | |
| 1815 # profile | |
| 1816 # sched | |
| 1817 # syscall | |
| 1818 # tcp | |
| 1819 # vminfo | |
| 1820 # | |
| 1821 # 3. List of valid probes: | |
| 1822 # $ grep syscall probes.txt | less | |
| 1823 # or use dtrace directly: | |
| 1824 # $ sudo dtrace -l -P syscall | less | |
| 1825 # | |
| 1826 # trackedpid is an associative array where its value can be 0, 1 or 2. | |
| 1827 # 0 is for untracked processes and is the default value for items not | |
| 1828 # in the associative array. | |
| 1829 # 1 is for tracked processes. | |
| 1830 # 2 is for the script created by create_subprocess_thunk() only. It is not | |
| 1831 # tracked itself but all its descendants are. | |
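| # For example (illustrative), when tracing a thunk with pid 100 that spawned | |
| # children 101 and 102: trackedpid[100] == 2, trackedpid[101] == 1, | |
| # trackedpid[102] == 1, and any other pid evaluates to 0. | |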
| 1832 # | |
| 1833 # The script will kill itself only once waiting_to_die == 1 and | |
| 1834 # current_processes == 0, so that both getlogin() was called and that | |
| 1835 # all traced processes exited. | |
| 1836 # | |
| 1837 # TODO(maruel): Use cacheable predicates. See | |
| 1838 # https://wikis.oracle.com/display/DTrace/Performance+Considerations | |
| 1839 D_CODE = """ | |
| 1840 dtrace:::BEGIN { | |
| 1841 waiting_to_die = 0; | |
| 1842 current_processes = 0; | |
| 1843 logindex = 0; | |
| 1844 printf("%d %d %s_%s(\\"%s\\")\\n", | |
| 1845 logindex, PID, probeprov, probename, SCRIPT); | |
| 1846 logindex++; | |
| 1847 } | |
| 1848 | |
| 1849 proc:::start /trackedpid[ppid]/ { | |
| 1850 trackedpid[pid] = 1; | |
| 1851 current_processes += 1; | |
| 1852 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
| 1853 logindex, pid, probeprov, probename, | |
| 1854 ppid, | |
| 1855 execname, | |
| 1856 current_processes); | |
| 1857 logindex++; | |
| 1858 } | |
| 1859 /* Should use SCRIPT but there is no access to this variable at that | |
| 1860 * point. */ | |
| 1861 proc:::start /ppid == PID && execname == "Python"/ { | |
| 1862 trackedpid[pid] = 2; | |
| 1863 current_processes += 1; | |
| 1864 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
| 1865 logindex, pid, probeprov, probename, | |
| 1866 ppid, | |
| 1867 execname, | |
| 1868 current_processes); | |
| 1869 logindex++; | |
| 1870 } | |
| 1871 proc:::exit /trackedpid[pid] && | |
| 1872 current_processes == 1 && | |
| 1873 waiting_to_die == 1/ { | |
| 1874 trackedpid[pid] = 0; | |
| 1875 current_processes -= 1; | |
| 1876 printf("%d %d %s_%s(%d)\\n", | |
| 1877 logindex, pid, probeprov, probename, | |
| 1878 current_processes); | |
| 1879 logindex++; | |
| 1880 exit(0); | |
| 1881 } | |
| 1882 proc:::exit /trackedpid[pid]/ { | |
| 1883 trackedpid[pid] = 0; | |
| 1884 current_processes -= 1; | |
| 1885 printf("%d %d %s_%s(%d)\\n", | |
| 1886 logindex, pid, probeprov, probename, | |
| 1887 current_processes); | |
| 1888 logindex++; | |
| 1889 } | |
| 1890 | |
| 1891 /* Use an arcane function to detect when we need to die */ | |
| 1892 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ { | |
| 1893 waiting_to_die = 1; | |
| 1894 printf("%d %d %s()\\n", logindex, pid, probefunc); | |
| 1895 logindex++; | |
| 1896 } | |
| 1897 syscall::ftruncate:entry / | |
| 1898 pid == PID && arg0 == FILE_ID && current_processes == 0/ { | |
| 1899 exit(0); | |
| 1900 } | |
| 1901 | |
| 1902 syscall::open*:entry /trackedpid[pid] == 1/ { | |
| 1903 self->open_arg0 = arg0; | |
| 1904 self->open_arg1 = arg1; | |
| 1905 self->open_arg2 = arg2; | |
| 1906 } | |
| 1907 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ { | |
| 1908 this->open_arg0 = copyinstr(self->open_arg0); | |
| 1909 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n", | |
| 1910 logindex, pid, probefunc, | |
| 1911 this->open_arg0, | |
| 1912 self->open_arg1, | |
| 1913 self->open_arg2); | |
| 1914 logindex++; | |
| 1915 this->open_arg0 = 0; | |
| 1916 } | |
| 1917 syscall::open*:return /trackedpid[pid] == 1/ { | |
| 1918 self->open_arg0 = 0; | |
| 1919 self->open_arg1 = 0; | |
| 1920 self->open_arg2 = 0; | |
| 1921 } | |
| 1922 | |
| 1923 syscall::rename:entry /trackedpid[pid] == 1/ { | |
| 1924 self->rename_arg0 = arg0; | |
| 1925 self->rename_arg1 = arg1; | |
| 1926 } | |
| 1927 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ { | |
| 1928 this->rename_arg0 = copyinstr(self->rename_arg0); | |
| 1929 this->rename_arg1 = copyinstr(self->rename_arg1); | |
| 1930 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n", | |
| 1931 logindex, pid, probefunc, | |
| 1932 this->rename_arg0, | |
| 1933 this->rename_arg1); | |
| 1934 logindex++; | |
| 1935 this->rename_arg0 = 0; | |
| 1936 this->rename_arg1 = 0; | |
| 1937 } | |
| 1938 syscall::rename:return /trackedpid[pid] == 1/ { | |
| 1939 self->rename_arg0 = 0; | |
| 1940 self->rename_arg1 = 0; | |
| 1941 } | |
| 1942 | |
| 1943 /* Track chdir, it's painful because it only receives a relative path. | |
| 1944 */ | |
| 1945 syscall::chdir:entry /trackedpid[pid] == 1/ { | |
| 1946 self->chdir_arg0 = arg0; | |
| 1947 } | |
| 1948 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ { | |
| 1949 this->chdir_arg0 = copyinstr(self->chdir_arg0); | |
| 1950 printf("%d %d %s(\\"%s\\")\\n", | |
| 1951 logindex, pid, probefunc, | |
| 1952 this->chdir_arg0); | |
| 1953 logindex++; | |
| 1954 this->chdir_arg0 = 0; | |
| 1955 } | |
| 1956 syscall::chdir:return /trackedpid[pid] == 1/ { | |
| 1957 self->chdir_arg0 = 0; | |
| 1958 } | |
| 1959 """ | |
| 1960 | |
| 1961 # execve-specific code, tends to throw a lot of exceptions. | |
| 1962 D_CODE_EXECVE = """ | |
| 1963 /* Finally what we care about! */ | |
| 1964 syscall::exec*:entry /trackedpid[pid]/ { | |
| 1965 self->exec_arg0 = copyinstr(arg0); | |
| 1966 /* Incrementally probe for a NULL in the argv parameter of execve() to | |
| 1967 * figure out argc. */ | |
| 1968 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was | |
| 1969 * found. */ | |
| 1970 self->exec_argc = 0; | |
| 1971 /* Probe for argc==1 */ | |
| 1972 this->exec_argv = (user_addr_t*)copyin( | |
| 1973 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
| 1974 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
| 1975 (self->exec_argc + 1) : self->exec_argc; | |
| 1976 | |
| 1977 /* Probe for argc==2 */ | |
| 1978 this->exec_argv = (user_addr_t*)copyin( | |
| 1979 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
| 1980 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
| 1981 (self->exec_argc + 1) : self->exec_argc; | |
| 1982 | |
| 1983 /* Probe for argc==3 */ | |
| 1984 this->exec_argv = (user_addr_t*)copyin( | |
| 1985 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
| 1986 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
| 1987 (self->exec_argc + 1) : self->exec_argc; | |
| 1988 | |
| 1989 /* Probe for argc==4 */ | |
| 1990 this->exec_argv = (user_addr_t*)copyin( | |
| 1991 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
| 1992 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
| 1993 (self->exec_argc + 1) : self->exec_argc; | |
| 1994 | |
| 1995 /* Copy the inputs strings since there is no guarantee they'll be | |
| 1996 * present after the call completed. */ | |
| 1997 self->exec_argv0 = (self->exec_argc > 0) ? | |
| 1998 copyinstr(this->exec_argv[0]) : ""; | |
| 1999 self->exec_argv1 = (self->exec_argc > 1) ? | |
| 2000 copyinstr(this->exec_argv[1]) : ""; | |
| 2001 self->exec_argv2 = (self->exec_argc > 2) ? | |
| 2002 copyinstr(this->exec_argv[2]) : ""; | |
| 2003 self->exec_argv3 = (self->exec_argc > 3) ? | |
| 2004 copyinstr(this->exec_argv[3]) : ""; | |
| 2005 this->exec_argv = 0; | |
| 2006 } | |
| 2007 syscall::exec*:return /trackedpid[pid] && errno == 0/ { | |
| 2008 /* We need to join strings here, as using multiple printf() would | |
| 2009 * cause tearing when multiple threads/processes are traced. | |
| 2010 * Since it is impossible to escape a string and join it to another one, | |
| 2011 * like sprintf("%s%S", previous, more), use hackery. | |
| 2012 * Each of the elements are split with a \\1. \\0 cannot be used because | |
| 2013 * it is simply ignored. This will conflict with any program putting a | |
| 2014 * \\1 in their execve() string but this should be "rare enough" */ | |
| 2015 this->args = ""; | |
| 2016 /* Process exec_argv[0] */ | |
| 2017 this->args = strjoin( | |
| 2018 this->args, (self->exec_argc > 0) ? self->exec_argv0 : ""); | |
| 2019 | |
| 2020 /* Process exec_argv[1] */ | |
| 2021 this->args = strjoin( | |
| 2022 this->args, (self->exec_argc > 1) ? "\\1" : ""); | |
| 2023 this->args = strjoin( | |
| 2024 this->args, (self->exec_argc > 1) ? self->exec_argv1 : ""); | |
| 2025 | |
| 2026 /* Process exec_argv[2] */ | |
| 2027 this->args = strjoin( | |
| 2028 this->args, (self->exec_argc > 2) ? "\\1" : ""); | |
| 2029 this->args = strjoin( | |
| 2030 this->args, (self->exec_argc > 2) ? self->exec_argv2 : ""); | |
| 2031 | |
| 2032 /* Process exec_argv[3] */ | |
| 2033 this->args = strjoin( | |
| 2034 this->args, (self->exec_argc > 3) ? "\\1" : ""); | |
| 2035 this->args = strjoin( | |
| 2036 this->args, (self->exec_argc > 3) ? self->exec_argv3 : ""); | |
| 2037 | |
| 2038 /* Prints self->exec_argc to permit verifying the internal | |
| 2039 * consistency since this code is quite fishy. */ | |
| 2040 printf("%d %d %s(\\"%s\\", [%d, %S])\\n", | |
| 2041 logindex, pid, probefunc, | |
| 2042 self->exec_arg0, | |
| 2043 self->exec_argc, | |
| 2044 this->args); | |
| 2045 logindex++; | |
| 2046 this->args = 0; | |
| 2047 } | |
| 2048 syscall::exec*:return /trackedpid[pid]/ { | |
| 2049 self->exec_arg0 = 0; | |
| 2050 self->exec_argc = 0; | |
| 2051 self->exec_argv0 = 0; | |
| 2052 self->exec_argv1 = 0; | |
| 2053 self->exec_argv2 = 0; | |
| 2054 self->exec_argv3 = 0; | |
| 2055 } | |
| 2056 """ | |
| 2057 | |
| 2058 # Code currently not used. | |
| 2059 D_EXTRANEOUS = """ | |
| 2060 /* This is a good learning experience, since it traces a lot of things | |
| 2061 * related to the process and child processes. | |
| 2062 * Warning: it generates a gigantic log. For example, tracing | |
| 2063 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes | |
| 2064 * several minutes to execute. | |
| 2065 */ | |
| 2066 /* | |
| 2067 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
| 2068 printf("%d %d %s_%s() = %d\\n", | |
| 2069 logindex, pid, probeprov, probefunc, errno); | |
| 2070 logindex++; | |
| 2071 } | |
| 2072 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
| 2073 printf("%d %d %s_%s() = %d\\n", | |
| 2074 logindex, pid, probeprov, probefunc, errno); | |
| 2075 logindex++; | |
| 2076 } | |
| 2077 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
| 2078 printf("%d %d %s_%s() = %d\\n", | |
| 2079 logindex, pid, probeprov, probefunc, errno); | |
| 2080 logindex++; | |
| 2081 } | |
| 2082 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
| 2083 printf("%d %d %s_%s() = %d\\n", | |
| 2084 logindex, pid, probeprov, probefunc, errno); | |
| 2085 logindex++; | |
| 2086 } | |
| 2087 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
| 2088 printf("%d %d %s_%s() = %d\\n", | |
| 2089 logindex, pid, probeprov, probefunc, errno); | |
| 2090 logindex++; | |
| 2091 } | |
| 2092 */ | |
| 2093 /* TODO(maruel): *stat* functions and friends | |
| 2094 syscall::access:return, | |
| 2095 syscall::chdir:return, | |
| 2096 syscall::chflags:return, | |
| 2097 syscall::chown:return, | |
| 2098 syscall::chroot:return, | |
| 2099 syscall::getattrlist:return, | |
| 2100 syscall::getxattr:return, | |
| 2101 syscall::lchown:return, | |
| 2102 syscall::lstat64:return, | |
| 2103 syscall::lstat:return, | |
| 2104 syscall::mkdir:return, | |
| 2105 syscall::pathconf:return, | |
| 2106 syscall::readlink:return, | |
| 2107 syscall::removexattr:return, | |
| 2108 syscall::setxattr:return, | |
| 2109 syscall::stat64:return, | |
| 2110 syscall::stat:return, | |
| 2111 syscall::truncate:return, | |
| 2112 syscall::unlink:return, | |
| 2113 syscall::utimes:return, | |
| 2114 */ | |
| 2115 """ | |
| 2116 | |
| 2117 def __init__(self, logname, use_sudo): | |
| 2118 """Starts the log collection with dtrace. | |
| 2119 | |
| 2120 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so | |
| 2121 this needs to wait for dtrace to be "warmed up". | |
| 2122 """ | |
| 2123 super(Dtrace.Tracer, self).__init__(logname) | |
| 2124 # This script is used as a signal to figure out the root process. | |
| 2125 self._signal_script = create_subprocess_thunk() | |
| 2126 self._scripts_to_cleanup.append(self._signal_script) | |
| 2127 # This unique dummy temp file is used to signal the dtrace script that it | |
| 2128 # should stop as soon as all the child processes are done. A bit hackish | |
| 2129 # but it works well enough. | |
| 2130 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp( | |
| 2131 prefix='trace_signal_file') | |
| 2132 | |
| 2133 dtrace_path = '/usr/sbin/dtrace' | |
| 2134 if not os.path.isfile(dtrace_path): | |
| 2135 dtrace_path = 'dtrace' | |
| 2136 elif use_sudo is None and (os.stat(dtrace_path).st_mode & stat.S_ISUID): | |
| 2137 # No need to sudo. For those following at home, don't do that. | |
| 2138 use_sudo = False | |
| 2139 | |
| 2140 # Note: do not use the -p flag. It's useless if the initial process quits | |
| 2141 # too fast, resulting in missing traces from the grand-children. The D | |
| 2142 # code manages the dtrace lifetime itself. | |
| 2143 trace_cmd = [ | |
| 2144 dtrace_path, | |
| 2145 # Use a larger buffer if getting 'out of scratch space' errors. | |
| 2146 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables | |
| 2147 '-b', '10m', | |
| 2148 '-x', 'dynvarsize=10m', | |
| 2149 #'-x', 'dtrace_global_maxsize=1m', | |
| 2150 '-x', 'evaltime=exec', | |
| 2151 '-o', '/dev/stderr', | |
| 2152 '-q', | |
| 2153 '-n', self._get_dtrace_code(), | |
| 2154 ] | |
| 2155 if use_sudo is not False: | |
| 2156 trace_cmd.insert(0, 'sudo') | |
| 2157 | |
| 2158 with open(self._logname + '.log', 'wb') as logfile: | |
| 2159 self._dtrace = subprocess.Popen( | |
| 2160 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) | |
| 2161 logging.debug('Started dtrace pid: %d' % self._dtrace.pid) | |
| 2162 | |
| 2163 # Reads until one line is printed, which signifies dtrace is up and ready. | |
| 2164 with open(self._logname + '.log', 'rb') as logfile: | |
| 2165 while 'dtrace_BEGIN' not in logfile.readline(): | |
| 2166 if self._dtrace.poll() is not None: | |
| 2167 # Do a busy wait. :/ | |
| 2168 break | |
| 2169 logging.debug('dtrace started') | |
| 2170 | |
| 2171 def _get_dtrace_code(self): | |
| 2172 """Setups the D code to implement child process tracking. | |
| 2173 | |
| 2174 Injects the cookie in the script so it knows when to stop. | |
| 2175 | |
| 2176 The script will detect any instance of the script created with | |
| 2177 create_subprocess_thunk() and will start tracing it. | |
| 2178 """ | |
| 2179 out = ( | |
| 2180 'inline int PID = %d;\n' | |
| 2181 'inline string SCRIPT = "%s";\n' | |
| 2182 'inline int FILE_ID = %d;\n' | |
| 2183 '\n' | |
| 2184 '%s') % ( | |
| 2185 os.getpid(), | |
| 2186 self._signal_script, | |
| 2187 self._dummy_file_id, | |
| 2188 self.D_CODE) | |
| 2189 if os.environ.get('TRACE_INPUTS_DTRACE_ENABLE_EXECVE') == '1': | |
| 2190 # Do not enable by default since it tends to spew dtrace: error lines | |
| 2191 # because the execve() parameters are not in valid memory at the time of | |
| 2192 # logging. | |
| 2193 # TODO(maruel): Find a way to make this reliable since it's useful but | |
| 2194 # only works in limited/trivial use cases for now. | |
| 2195 out += self.D_CODE_EXECVE | |
| 2196 return out | |
| 2197 | |
| 2198 def trace(self, cmd, cwd, tracename, output): | |
| 2199 """Runs dtrace on an executable. | |
| 2200 | |
| 2201 dtruss is broken when it starts the process itself or when tracing | |
| 2202 child processes, so this code starts a wrapper process | |
| 2203 generated with create_subprocess_thunk() which starts the executable to | |
| 2204 trace. | |
| 2205 """ | |
| 2206 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
| 2207 assert os.path.isabs(cmd[0]), cmd[0] | |
| 2208 assert os.path.isabs(cwd), cwd | |
| 2209 assert os.path.normpath(cwd) == cwd, cwd | |
| 2210 with self._lock: | |
| 2211 if not self._initialized: | |
| 2212 raise TracingFailure( | |
| 2213 'Called Tracer.trace() on an uninitialized object', | |
| 2214 None, None, None, tracename) | |
| 2215 assert tracename not in (i['trace'] for i in self._traces) | |
| 2216 | |
| 2217 # Starts the script wrapper to start the child process. This signals the | |
| 2218 # dtrace script that this process is to be traced. | |
| 2219 stdout = stderr = None | |
| 2220 if output: | |
| 2221 stdout = subprocess.PIPE | |
| 2222 stderr = subprocess.STDOUT | |
| 2223 child_cmd = [ | |
| 2224 sys.executable, | |
| 2225 self._signal_script, | |
| 2226 tracename, | |
| 2227 ] | |
| 2228 # Call a dummy function so that dtrace knows I'm about to launch a process | |
| 2229 # that needs to be traced. | |
| 2230 # Yummy. | |
| 2231 child = subprocess.Popen( | |
| 2232 child_cmd + tools.fix_python_path(cmd), | |
| 2233 stdin=subprocess.PIPE, | |
| 2234 stdout=stdout, | |
| 2235 stderr=stderr, | |
| 2236 cwd=cwd) | |
| 2237 logging.debug('Started child pid: %d' % child.pid) | |
| 2238 | |
| 2239 out = child.communicate()[0] | |
| 2240 # This doesn't mean tracing is done; one of the grand-child processes may | |
| 2241 # still be alive. It will be tracked by the dtrace script. | |
| 2242 | |
| 2243 with self._lock: | |
| 2244 assert tracename not in (i['trace'] for i in self._traces) | |
| 2245 self._traces.append( | |
| 2246 { | |
| 2247 'cmd': cmd, | |
| 2248 'cwd': cwd, | |
| 2249 'pid': child.pid, | |
| 2250 'output': out, | |
| 2251 'trace': tracename, | |
| 2252 }) | |
| 2253 return child.returncode, out | |
| 2254 | |
| 2255 def close(self, timeout=None): | |
| 2256 """Terminates dtrace.""" | |
| 2257 logging.debug('close(%s)' % timeout) | |
| 2258 try: | |
| 2259 try: | |
| 2260 super(Dtrace.Tracer, self).close(timeout) | |
| 2261 # Signal dtrace that it should stop now. | |
| 2262 # ftruncate doesn't exist on Windows. | |
| 2263 os.ftruncate(self._dummy_file_id, 0) # pylint: disable=E1101 | |
| 2264 if timeout: | |
| 2265 start = time.time() | |
| 2266 # Use polling. :/ | |
| 2267 while (self._dtrace.poll() is None and | |
| 2268 (time.time() - start) < timeout): | |
| 2269 time.sleep(0.1) | |
| 2270 self._dtrace.kill() | |
| 2271 self._dtrace.wait() | |
| 2272 finally: | |
| 2273 # Make sure to kill it in any case. | |
| 2274 if self._dtrace.poll() is None: | |
| 2275 try: | |
| 2276 self._dtrace.kill() | |
| 2277 self._dtrace.wait() | |
| 2278 except OSError: | |
| 2279 pass | |
| 2280 | |
| 2281 if self._dtrace.returncode != 0: | |
| 2282 # Warn about any dtrace failure but basically ignore it. | |
| 2283 print 'dtrace failure: %s' % self._dtrace.returncode | |
| 2284 finally: | |
| 2285 os.close(self._dummy_file_id) | |
| 2286 os.remove(self._dummy_file_name) | |
| 2287 | |
| 2288 def post_process_log(self): | |
| 2289 """Sorts the log back in order when each call occured. | |
| 2290 | |
| 2291 dtrace doesn't save the buffer in strict order since it keeps one buffer | |
| 2292 per CPU. | |
| 2293 """ | |
| 2294 super(Dtrace.Tracer, self).post_process_log() | |
| 2295 logname = self._logname + '.log' | |
| 2296 with open(logname, 'rb') as logfile: | |
| 2297 lines = [l for l in logfile if l.strip()] | |
| 2298 errors = [l for l in lines if l.startswith('dtrace:')] | |
| 2299 if errors: | |
| 2300 raise TracingFailure( | |
| 2301 'Found errors in the trace: %s' % '\n'.join(errors), | |
| 2302 None, None, None, logname) | |
| 2303 try: | |
| 2304 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) | |
| 2305 except ValueError: | |
| 2306 raise TracingFailure( | |
| 2307 'Found errors in the trace: %s' % '\n'.join( | |
| 2308 l for l in lines if l.split(' ', 1)[0].isdigit()), | |
| 2309 None, None, None, logname) | |
| 2310 with open(logname, 'wb') as logfile: | |
| 2311 logfile.write(''.join(lines)) | |
| 2312 | |
| 2313 def __init__(self, use_sudo=None): | |
| 2314 super(Dtrace, self).__init__() | |
| 2315 self.use_sudo = use_sudo | |
| 2316 | |
| 2317 def get_tracer(self, logname): | |
| 2318 return self.Tracer(logname, self.use_sudo) | |
| 2319 | |
| 2320 @staticmethod | |
| 2321 def clean_trace(logname): | |
| 2322 for ext in ('', '.log'): | |
| 2323 if os.path.isfile(logname + ext): | |
| 2324 os.remove(logname + ext) | |
| 2325 | |
| 2326 @classmethod | |
| 2327 def parse_log(cls, logname, blacklist, trace_name): | |
| 2328 logging.info('parse_log(%s, ..., %s)', logname, trace_name) | |
| 2329 assert os.path.isabs(logname) | |
| 2330 | |
| 2331 def blacklist_more(filepath): | |
| 2332 # All the HFS metadata is in the form /.vol/... | |
| 2333 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath) | |
| 2334 | |
| 2335 data = read_json(logname) | |
| 2336 out = [] | |
| 2337 for item in data['traces']: | |
| 2338 if trace_name and item['trace'] != trace_name: | |
| 2339 continue | |
| 2340 result = { | |
| 2341 'output': item['output'], | |
| 2342 'trace': item['trace'], | |
| 2343 } | |
| 2344 try: | |
| 2345 context = cls.Context(blacklist_more, item['pid'], item['cwd']) | |
| 2346 # It's fine to assume the file is UTF-8: OSX enforces file names to | |
| 2347 # be valid UTF-8 and we control the log output. | |
| 2348 for line in codecs.open(logname + '.log', 'rb', encoding='utf-8'): | |
| 2349 context.on_line(line) | |
| 2350 result['results'] = context.to_results() | |
| 2351 except TracingFailure: | |
| 2352 result['exception'] = sys.exc_info() | |
| 2353 out.append(result) | |
| 2354 return out | |
| 2355 | |
| 2356 | |
| 2357 class LogmanTrace(ApiBase): | |
| 2358 """Uses the native Windows ETW based tracing functionality to trace a child | |
| 2359 process. | |
| 2360 | |
| 2361 Caveat: this implementation doesn't track cwd or initial_cwd. This is because | |
| 2362 the Windows Kernel doesn't have a concept of 'current working directory' at | |
| 2363 all. A Win32 process has a map of current directories, one per drive letter | |
| 2364 and it is managed by the user mode kernel32.dll. In kernel, a file is always | |
| 2365 opened relative to another file_object or as an absolute path. All the current | |
| 2366 working directory logic is done in user mode. | |
| 2367 """ | |
| 2368 class Context(ApiBase.Context): | |
| 2369 """Processes a ETW log line and keeps the list of existent and non | |
| 2370 existent files accessed. | |
| 2371 | |
| 2372 Ignores directories. | |
| 2373 """ | |
| 2374 # These indexes are for the stripped version in json. | |
| 2375 EVENT_NAME = 0 | |
| 2376 TYPE = 1 | |
| 2377 PID = 2 | |
| 2378 TID = 3 | |
| 2379 PROCESSOR_ID = 4 | |
| 2380 TIMESTAMP = 5 | |
| 2381 USER_DATA = 6 | |
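| # i.e. each stripped json event is a list like | |
| # [event_name, type, pid, tid, processor_id, timestamp, user_data, ...], | |
| # indexed with the constants above. | |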
| 2382 | |
| 2383 class Process(ApiBase.Context.Process): | |
| 2384 def __init__(self, *args): | |
| 2385 super(LogmanTrace.Context.Process, self).__init__(*args) | |
| 2386 # Handle file objects that succeeded. | |
| 2387 self.file_objects = {} | |
| 2388 | |
| 2389 def __init__(self, blacklist, thunk_pid, trace_name, thunk_cmd): | |
| 2390 logging.info( | |
| 2391 '%s(%d, %s, %s)', self.__class__.__name__, thunk_pid, trace_name, | |
| 2392 thunk_cmd) | |
| 2393 super(LogmanTrace.Context, self).__init__(blacklist) | |
| 2394 self._drive_map = file_path.DosDriveMap() | |
| 2395 # Threads mapping to the corresponding process id. | |
| 2396 self._threads_active = {} | |
| 2397 # Process ID of the tracer, e.g. the temporary script created by | |
| 2398 # create_subprocess_thunk(). This is tricky because the process id may | |
| 2399 # have been reused. | |
| 2400 self._thunk_pid = thunk_pid | |
| 2401 self._thunk_cmd = thunk_cmd | |
| 2402 self._trace_name = trace_name | |
| 2403 self._line_number = 0 | |
| 2404 self._thunk_process = None | |
| 2405 | |
| 2406 def on_line(self, line): | |
| 2407 """Processes a json Event line.""" | |
| 2408 self._line_number += 1 | |
| 2409 try: | |
| 2410 # By Opcode | |
| 2411 handler = getattr( | |
| 2412 self, | |
| 2413 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]), | |
| 2414 None) | |
| 2415 if not handler: | |
| 2416 raise TracingFailure( | |
| 2417 'Unexpected event %s_%s' % ( | |
| 2418 line[self.EVENT_NAME], line[self.TYPE]), | |
| 2419 None, None, None) | |
| 2420 handler(line) | |
| 2421 except TracingFailure, e: | |
| 2422 # Hack in the values since the handler could be a static function. | |
| 2423 e.pid = line[self.PID] | |
| 2424 e.line = line | |
| 2425 e.line_number = self._line_number | |
| 2426 # Re-raise the modified exception. | |
| 2427 raise | |
| 2428 except (KeyError, NotImplementedError, ValueError), e: | |
| 2429 raise TracingFailure( | |
| 2430 'Trace generated a %s exception: %s' % ( | |
| 2431 e.__class__.__name__, str(e)), | |
| 2432 line[self.PID], | |
| 2433 self._line_number, | |
| 2434 line, | |
| 2435 e) | |
| 2436 | |
| 2437 def to_results(self): | |
| 2438 if not self.root_process: | |
| 2439 raise TracingFailure( | |
| 2440 'Failed to detect the initial process %d' % self._thunk_pid, | |
| 2441 None, None, None) | |
| 2442 process = self.root_process.to_results_process() | |
| 2443 return Results(process) | |
| 2444 | |
| 2445 def _thread_to_process(self, tid): | |
| 2446 """Finds the process from the thread id.""" | |
| 2447 tid = int(tid, 16) | |
| 2448 pid = self._threads_active.get(tid) | |
| 2449 if not pid or not self._process_lookup.get(pid): | |
| 2450 return | |
| 2451 return self._process_lookup[pid] | |
| 2452 | |
| 2453 @classmethod | |
| 2454 def handle_EventTrace_Header(cls, line): | |
| 2455 """Verifies no event was dropped, e.g. no buffer overrun occured.""" | |
| 2456 BUFFER_SIZE = cls.USER_DATA | |
| 2457 #VERSION = cls.USER_DATA + 1 | |
| 2458 #PROVIDER_VERSION = cls.USER_DATA + 2 | |
| 2459 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3 | |
| 2460 #END_TIME = cls.USER_DATA + 4 | |
| 2461 #TIMER_RESOLUTION = cls.USER_DATA + 5 | |
| 2462 #MAX_FILE_SIZE = cls.USER_DATA + 6 | |
| 2463 #LOG_FILE_MODE = cls.USER_DATA + 7 | |
| 2464 #BUFFERS_WRITTEN = cls.USER_DATA + 8 | |
| 2465 #START_BUFFERS = cls.USER_DATA + 9 | |
| 2466 #POINTER_SIZE = cls.USER_DATA + 10 | |
| 2467 EVENTS_LOST = cls.USER_DATA + 11 | |
| 2468 #CPU_SPEED = cls.USER_DATA + 12 | |
| 2469 #LOGGER_NAME = cls.USER_DATA + 13 | |
| 2470 #LOG_FILE_NAME = cls.USER_DATA + 14 | |
| 2471 #BOOT_TIME = cls.USER_DATA + 15 | |
| 2472 #PERF_FREQ = cls.USER_DATA + 16 | |
| 2473 #START_TIME = cls.USER_DATA + 17 | |
| 2474 #RESERVED_FLAGS = cls.USER_DATA + 18 | |
| 2475 #BUFFERS_LOST = cls.USER_DATA + 19 | |
| 2476 #SESSION_NAME_STRING = cls.USER_DATA + 20 | |
| 2477 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21 | |
| 2478 if line[EVENTS_LOST] != '0': | |
| 2479 raise TracingFailure( | |
| 2480 ( '%s events were lost during trace, please increase the buffer ' | |
| 2481 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]), | |
| 2482 None, None, None) | |
| 2483 | |
| 2484 def handle_FileIo_Cleanup(self, line): | |
| 2485 """General wisdom: if a file is closed, it's because it was opened. | |
| 2486 | |
| 2487 Note that FileIo_Close is not used since if a file was opened properly but | |
| 2488 not closed before the process exits, only Cleanup will be logged. | |
| 2489 """ | |
| 2490 #IRP = self.USER_DATA | |
| 2491 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
| 2492 FILE_OBJECT = self.USER_DATA + 2 | |
| 2493 #FILE_KEY = self.USER_DATA + 3 | |
| 2494 proc = self._thread_to_process(line[TTID]) | |
| 2495 if not proc: | |
| 2496 # Not a process we care about. | |
| 2497 return | |
| 2498 file_object = line[FILE_OBJECT] | |
| 2499 if file_object in proc.file_objects: | |
| 2500 filepath, access_type = proc.file_objects.pop(file_object) | |
| 2501 proc.add_file(filepath, access_type) | |
| 2502 | |
| 2503 def handle_FileIo_Create(self, line): | |
| 2504 """Handles a file open. | |
| 2505 | |
| 2506 All FileIo events are described at | |
| 2507 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx | |
| 2508 for some value of 'description'. | |
| 2509 | |
| 2510 " (..) process and thread id values of the IO events (..) are not valid " | |
| 2511 http://msdn.microsoft.com/magazine/ee358703.aspx | |
| 2512 | |
| 2513 The FileIo.Create event doesn't return if the CreateFile() call | |
| 2514 succeeded, so keep track of the file_object and check that it is | |
| 2515 eventually closed with FileIo_Cleanup. | |
| 2516 """ | |
| 2517 #IRP = self.USER_DATA | |
| 2518 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
| 2519 FILE_OBJECT = self.USER_DATA + 2 | |
| 2520 #CREATE_OPTIONS = self.USER_DATA + 3 | |
| 2521 #FILE_ATTRIBUTES = self.USER_DATA + 4 | |
| 2522 #SHARE_ACCESS = self.USER_DATA + 5 | |
| 2523 OPEN_PATH = self.USER_DATA + 6 | |
| 2524 | |
| 2525 proc = self._thread_to_process(line[TTID]) | |
| 2526 if not proc: | |
| 2527 # Not a process we care about. | |
| 2528 return | |
| 2529 | |
| 2530 raw_path = line[OPEN_PATH] | |
| 2531 # Ignore directories and bare drive right away. | |
| 2532 if raw_path.endswith(os.path.sep): | |
| 2533 return | |
| 2534 filepath = self._drive_map.to_win32(raw_path) | |
| 2535 # Ignore bare drive right away. Some may still fall through with format | |
| 2536 # like '\\?\X:' | |
| 2537 if len(filepath) == 2: | |
| 2538 return | |
| 2539 file_object = line[FILE_OBJECT] | |
| 2540 if os.path.isdir(filepath): | |
| 2541 # There is no O_DIRECTORY equivalent on Windows. The closest is | |
| 2542 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So | |
| 2543 # simply discard directories as they are found. | |
| 2544 return | |
| 2545 # Override any stale file object. | |
| 2546 # TODO(maruel): Figure out a way to detect if the file was opened for | |
| 2547 # reading or writing. Sadly CREATE_OPTIONS doesn't seem to be of any help | |
| 2548 # here. For now mark as None to make it clear we have no idea what it is | |
| 2549 # about. | |
| 2550 proc.file_objects[file_object] = (filepath, None) | |
| 2551 | |
| 2552 def handle_FileIo_Rename(self, line): | |
| 2553 # TODO(maruel): Handle? | |
| 2554 pass | |
| 2555 | |
| 2556 def handle_Process_End(self, line): | |
| 2557 pid = line[self.PID] | |
| 2558 if self._process_lookup.get(pid): | |
| 2559 logging.info('Terminated: %d' % pid) | |
| 2560 self._process_lookup[pid] = None | |
| 2561 else: | |
| 2562 logging.debug('Terminated: %d' % pid) | |
| 2563 if self._thunk_process and self._thunk_process.pid == pid: | |
| 2564 self._thunk_process = None | |
| 2565 | |
| 2566 def handle_Process_Start(self, line): | |
| 2567 """Handles a new child process started by PID.""" | |
| 2568 #UNIQUE_PROCESS_KEY = self.USER_DATA | |
| 2569 PROCESS_ID = self.USER_DATA + 1 | |
| 2570 #PARENT_PID = self.USER_DATA + 2 | |
| 2571 #SESSION_ID = self.USER_DATA + 3 | |
| 2572 #EXIT_STATUS = self.USER_DATA + 4 | |
| 2573 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5 | |
| 2574 #USER_SID = self.USER_DATA + 6 | |
| 2575 IMAGE_FILE_NAME = self.USER_DATA + 7 | |
| 2576 COMMAND_LINE = self.USER_DATA + 8 | |
| 2577 | |
| 2578 ppid = line[self.PID] | |
| 2579 pid = int(line[PROCESS_ID], 16) | |
| 2580 command_line = CommandLineToArgvW(line[COMMAND_LINE]) | |
| 2581 logging.debug( | |
| 2582 'New process %d->%d (%s) %s' % | |
| 2583 (ppid, pid, line[IMAGE_FILE_NAME], command_line)) | |
| 2584 | |
| 2585 if pid == self._thunk_pid: | |
| 2586 # Need to ignore processes we don't know about because the log is | |
| 2587 # system-wide. self._thunk_pid shall start only one process. | |
| 2588 # This is tricky though because Windows *loves* to reuse process ids and | |
| 2589 # it happens often that the process ID of the thunk script created by | |
| 2590 # create_subprocess_thunk() is reused. So just detecting the pid here is | |
| 2591 # not sufficient, we must confirm the command line. | |
| 2592 if command_line[:len(self._thunk_cmd)] != self._thunk_cmd: | |
| 2593 logging.info( | |
| 2594 'Ignoring duplicate pid %d for %s: %s while searching for %s', | |
| 2595 pid, self._trace_name, command_line, self._thunk_cmd) | |
| 2596 return | |
| 2597 | |
| 2598 # TODO(maruel): The check is quite weak. Add the thunk path. | |
| 2599 if self._thunk_process: | |
| 2600 raise TracingFailure( | |
| 2601 ( 'Parent process is _thunk_pid(%d) but thunk_process(%d) is ' | |
| 2602 'already set') % (self._thunk_pid, self._thunk_process.pid), | |
| 2603 None, None, None) | |
| 2604 proc = self.Process(self.blacklist, pid, None) | |
| 2605 self._thunk_process = proc | |
| 2606 return | |
| 2607 elif ppid == self._thunk_pid and self._thunk_process: | |
| 2608 proc = self.Process(self.blacklist, pid, None) | |
| 2609 self.root_process = proc | |
| 2610 ppid = None | |
| 2611 elif self._process_lookup.get(ppid): | |
| 2612 proc = self.Process(self.blacklist, pid, None) | |
| 2613 self._process_lookup[ppid].children.append(proc) | |
| 2614 else: | |
| 2615 # Ignore | |
| 2616 return | |
| 2617 self._process_lookup[pid] = proc | |
| 2618 | |
| 2619 proc.command = command_line | |
| 2620 proc.executable = line[IMAGE_FILE_NAME] | |
| 2621 # proc.command[0] may be the absolute path of 'executable' but it may be | |
| 2622 # anything else too. If it happens that command[0] ends with executable, | |
| 2623 # use it, otherwise defaults to the base name. | |
| 2624 cmd0 = proc.command[0].lower() | |
| 2625 if not cmd0.endswith('.exe'): | |
| 2626 # TODO(maruel): That's not strictly true either. | |
| 2627 cmd0 += '.exe' | |
| 2628 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0): | |
| 2629 # Fix the path. | |
| 2630 cmd0 = cmd0.replace('/', os.path.sep) | |
| 2631 cmd0 = os.path.normpath(cmd0) | |
| 2632 proc.executable = file_path.get_native_path_case(cmd0) | |
| 2633 logging.info( | |
| 2634 'New child: %s -> %d %s' % (ppid, pid, proc.executable)) | |
| 2635 | |
| 2636 def handle_Thread_End(self, line): | |
| 2637 """Has the same parameters as Thread_Start.""" | |
| 2638 tid = int(line[self.TID], 16) | |
| 2639 self._threads_active.pop(tid, None) | |
| 2640 | |
| 2641 def handle_Thread_Start(self, line): | |
| 2642 """Handles a new thread created. | |
| 2643 | |
| 2644 Do not use self.PID here since a process' initial thread is created by | |
| 2645 the parent process. | |
| 2646 """ | |
| 2647 PROCESS_ID = self.USER_DATA | |
| 2648 TTHREAD_ID = self.USER_DATA + 1 | |
| 2649 #STACK_BASE = self.USER_DATA + 2 | |
| 2650 #STACK_LIMIT = self.USER_DATA + 3 | |
| 2651 #USER_STACK_BASE = self.USER_DATA + 4 | |
| 2652 #USER_STACK_LIMIT = self.USER_DATA + 5 | |
| 2653 #AFFINITY = self.USER_DATA + 6 | |
| 2654 #WIN32_START_ADDR = self.USER_DATA + 7 | |
| 2655 #TEB_BASE = self.USER_DATA + 8 | |
| 2656 #SUB_PROCESS_TAG = self.USER_DATA + 9 | |
| 2657 #BASE_PRIORITY = self.USER_DATA + 10 | |
| 2658 #PAGE_PRIORITY = self.USER_DATA + 11 | |
| 2659 #IO_PRIORITY = self.USER_DATA + 12 | |
| 2660 #THREAD_FLAGS = self.USER_DATA + 13 | |
| 2661 # Do not use self.PID here since a process' initial thread is created by | |
| 2662 # the parent process. | |
| 2663 pid = int(line[PROCESS_ID], 16) | |
| 2664 tid = int(line[TTHREAD_ID], 16) | |
| 2665 logging.debug('New thread pid:%d, tid:%d' % (pid, tid)) | |
| 2666 self._threads_active[tid] = pid | |
| 2667 | |
| 2668 @classmethod | |
| 2669 def supported_events(cls): | |
| 2670 """Returns all the procesed events.""" | |
| 2671 out = [] | |
| 2672 for member in dir(cls): | |
| 2673 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) | |
| 2674 if match: | |
| 2675 out.append(match.groups()) | |
| 2676 return out | |
| 2677 | |
| 2678 class Tracer(ApiBase.Tracer): | |
| 2679 # The basic headers. | |
| 2680 EXPECTED_HEADER = [ | |
| 2681 u'Event Name', | |
| 2682 u'Type', | |
| 2683 u'Event ID', | |
| 2684 u'Version', | |
| 2685 u'Channel', | |
| 2686 u'Level', # 5 | |
| 2687 u'Opcode', | |
| 2688 u'Task', | |
| 2689 u'Keyword', | |
| 2690 u'PID', | |
| 2691 u'TID', # 10 | |
| 2692 u'Processor Number', | |
| 2693 u'Instance ID', | |
| 2694 u'Parent Instance ID', | |
| 2695 u'Activity ID', | |
| 2696 u'Related Activity ID', # 15 | |
| 2697 u'Clock-Time', | |
| 2698 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them | |
| 2699 u'User(ms)', # pretty much useless. | |
| 2700 u'User Data', # Extra arguments that are event-specific. | |
| 2701 ] | |
| 2702 # Only the useful headers common to all entries are listed there. Any column | |
| 2703 # at 19 or higher is dependent on the specific event. | |
| 2704 EVENT_NAME = 0 | |
| 2705 TYPE = 1 | |
| 2706 PID = 9 | |
| 2707 TID = 10 | |
| 2708 PROCESSOR_ID = 11 | |
| 2709 TIMESTAMP = 16 | |
| 2710 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' | |
| 2711 USER_DATA = 19 | |
| 2712 | |
| 2713 class CsvReader(object): | |
| 2714 """CSV reader that reads files generated by tracerpt.exe. | |
| 2715 | |
| 2716 csv.reader() fails to read them properly; it mangles file names quoted | |
| 2717 with "" that have a comma in them. | |
| 2718 """ | |
| 2719 # 0. Had a ',' or one of the following ' ' after a comma, next should | |
| 2720 # be ' ', '"' or string or ',' for an empty field. | |
| 2721 ( HAD_DELIMITER, | |
| 2722 # 1. Processing an unquoted field up to ','. | |
| 2723 IN_STR, | |
| 2724 # 2. Processing a new field starting with '"'. | |
| 2725 STARTING_STR_QUOTED, | |
| 2726 # 3. Second quote in a row at the start of a field. It could be either | |
| 2727 # '""foo""' or '""'. Who the hell thought it was a great idea to use | |
| 2728 # the same character for delimiting and escaping? | |
| 2729 STARTING_SECOND_QUOTE, | |
| 2730 # 4. A quote inside a quoted string where the previous character was | |
| 2731 # not a quote, so the string is not empty. Can be either: end of a | |
| 2732 # quoted string (a delimiter) or a quote escape. The next char must be | |
| 2733 # either '"' or ','. | |
| 2734 HAD_QUOTE_IN_QUOTED, | |
| 2735 # 5. Second quote inside a quoted string. | |
| 2736 HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED, | |
| 2737 # 6. Processing a field that started with '"'. | |
| 2738 IN_STR_QUOTED) = range(7) | |
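| # Illustrative: a tracerpt line such as | |
| #   "FileIo", "Create", 0x0123, "C:\dir, with a comma\file.txt" | |
| # must keep the quoted path as a single field despite the embedded comma. | |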
| 2739 | |
| 2740 def __init__(self, f): | |
| 2741 self.f = f | |
| 2742 | |
| 2743 def __iter__(self): | |
| 2744 return self | |
| 2745 | |
| 2746 def next(self): | |
| 2747 """Splits the line in fields.""" | |
| 2748 line = self.f.readline() | |
| 2749 if not line: | |
| 2750 raise StopIteration() | |
| 2751 line = line.strip() | |
| 2752 fields = [] | |
| 2753 state = self.HAD_DELIMITER | |
| 2754 for i, c in enumerate(line): | |
| 2755 if state == self.HAD_DELIMITER: | |
| 2756 if c == ',': | |
| 2757 # Empty field. | |
| 2758 fields.append('') | |
| 2759 elif c == ' ': | |
| 2760 # Ignore initial whitespaces | |
| 2761 pass | |
| 2762 elif c == '"': | |
| 2763 state = self.STARTING_STR_QUOTED | |
| 2764 fields.append('') | |
| 2765 else: | |
| 2766 # Start of a new field. | |
| 2767 state = self.IN_STR | |
| 2768 fields.append(c) | |
| 2769 | |
| 2770 elif state == self.IN_STR: | |
| 2771 # Do not accept quote inside unquoted field. | |
| 2772 assert c != '"', (i, c, line, fields) | |
| 2773 if c == ',': | |
| 2774 fields[-1] = fields[-1].strip() | |
| 2775 state = self.HAD_DELIMITER | |
| 2776 else: | |
| 2777 fields[-1] = fields[-1] + c | |
| 2778 | |
| 2779 elif state == self.STARTING_STR_QUOTED: | |
| 2780 if c == '"': | |
| 2781 # Do not store the character yet. | |
| 2782 state = self.STARTING_SECOND_QUOTE | |
| 2783 else: | |
| 2784 state = self.IN_STR_QUOTED | |
| 2785 fields[-1] = fields[-1] + c | |
| 2786 | |
| 2787 elif state == self.STARTING_SECOND_QUOTE: | |
| 2788 if c == ',': | |
| 2789 # It was an empty field. '""' == ''. | |
| 2790 state = self.HAD_DELIMITER | |
| 2791 else: | |
| 2792 fields[-1] = fields[-1] + '"' + c | |
| 2793 state = self.IN_STR_QUOTED | |
| 2794 | |
| 2795 elif state == self.HAD_QUOTE_IN_QUOTED: | |
| 2796 if c == ',': | |
| 2797 # End of the string. | |
| 2798 state = self.HAD_DELIMITER | |
| 2799 elif c == '"': | |
| 2800 state = self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED | |
| 2801 else: | |
| 2802 # The previous double-quote was just an unescaped quote. | |
| 2803 fields[-1] = fields[-1] + '"' + c | |
| 2804 state = self.IN_STR_QUOTED | |
| 2805 | |
| 2806 elif state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED: | |
| 2807 if c == ',': | |
| 2808 # End of the string. | |
| 2809 state = self.HAD_DELIMITER | |
| 2810 fields[-1] = fields[-1] + '"' | |
| 2811 else: | |
| 2812 # That's just how the logger rolls. Revert back to appending the | |
| 2813 # char and "guess" it was a quote in a double-quoted string. | |
| 2814 state = self.IN_STR_QUOTED | |
| 2815 fields[-1] = fields[-1] + '"' + c | |
| 2816 | |
| 2817 elif state == self.IN_STR_QUOTED: | |
| 2818 if c == '"': | |
| 2819 # Could be a delimiter or an escape. | |
| 2820 state = self.HAD_QUOTE_IN_QUOTED | |
| 2821 else: | |
| 2822 fields[-1] = fields[-1] + c | |
| 2823 | |
| 2824 if state == self.HAD_SECOND_QUOTE_IN_A_ROW_IN_QUOTED: | |
| 2825 fields[-1] = fields[-1] + '"' | |
| 2826 else: | |
| 2827 assert state in ( | |
| 2828 # Terminated with a normal field. | |
| 2829 self.IN_STR, | |
| 2830 # Terminated with an empty field. | |
| 2831 self.STARTING_SECOND_QUOTE, | |
| 2832 # Terminated with a normal quoted field. | |
| 2833 self.HAD_QUOTE_IN_QUOTED), ( | |
| 2834 line, state, fields) | |
| 2835 return fields | |
| 2836 | |
| 2837 def __init__(self, logname): | |
| 2838 """Starts the log collection. | |
| 2839 | |
| 2840 Requires administrative access. logman.exe is synchronous so no need for a | |
| 2841 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID | |
| 2842 instead. The GUID constant name is SystemTraceControlGuid. Lovely. | |
| 2843 | |
| 2844 One can get the list of potentially interesting providers with: | |
| 2845 "logman query providers | findstr /i file" | |
| 2846 """ | |
| 2847 super(LogmanTrace.Tracer, self).__init__(logname) | |
| 2848 self._signal_script = create_subprocess_thunk() | |
| 2849 self._scripts_to_cleanup.append(self._signal_script) | |
| 2850 cmd_start = [ | |
| 2851 'logman.exe', | |
| 2852 'start', | |
| 2853 'NT Kernel Logger', | |
| 2854 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}', | |
| 2855 # splitio,fileiocompletion,syscall,file,cswitch,img | |
| 2856 '(process,fileio,thread)', | |
| 2857 '-o', self._logname + '.etl', | |
| 2858 '-ets', # Send directly to kernel | |
| 2859 # Values extracted out of thin air. | |
| 2860 # Event Trace Session buffer size in kb. | |
| 2861 '-bs', '10240', | |
| 2862 # Number of Event Trace Session buffers. | |
| 2863 '-nb', '16', '256', | |
| 2864 ] | |
| 2865 logging.debug('Running: %s' % cmd_start) | |
| 2866 try: | |
| 2867 subprocess.check_call( | |
| 2868 cmd_start, | |
| 2869 stdin=subprocess.PIPE, | |
| 2870 stdout=subprocess.PIPE, | |
| 2871 stderr=subprocess.STDOUT) | |
| 2872 except subprocess.CalledProcessError, e: | |
| 2873 if e.returncode == -2147024891: | |
| 2874 print >> sys.stderr, 'Please restart with an elevated admin prompt' | |
| 2875 elif e.returncode == -2144337737: | |
| 2876 print >> sys.stderr, ( | |
| 2877 'A kernel trace was already running, stop it and try again') | |
| 2878 raise | |
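| # If a previous session was left dangling (the -2144337737 case above), it | |
| # can be stopped manually with the same command close() issues below: | |
| #   logman.exe stop "NT Kernel Logger" -ets | |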
| 2879 | |
| 2880 def trace(self, cmd, cwd, tracename, output): | |
| 2881 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
| 2882 assert os.path.isabs(cmd[0]), cmd[0] | |
| 2883 assert os.path.isabs(cwd), cwd | |
| 2884 assert os.path.normpath(cwd) == cwd, cwd | |
| 2885 with self._lock: | |
| 2886 if not self._initialized: | |
| 2887 raise TracingFailure( | |
| 2888 'Called Tracer.trace() on an uninitialized object', | |
| 2889 None, None, None, tracename) | |
| 2890 assert tracename not in (i['trace'] for i in self._traces) | |
| 2891 | |
| 2892 # Use "logman -?" for help. | |
| 2893 | |
| 2894 stdout = stderr = None | |
| 2895 if output: | |
| 2896 stdout = subprocess.PIPE | |
| 2897 stderr = subprocess.STDOUT | |
| 2898 | |
| 2899 # Run the child process. | |
| 2900 logging.debug('Running: %s' % cmd) | |
| 2901 # Use the temporary script generated with create_subprocess_thunk() so we | |
| 2902 # have a clear pid owner. Since trace_inputs.py can be used as a library | |
| 2903 # and could trace multiple processes simultaneously, executing the traced | |
| 2904 # executable directly here would add complexity. It also avoids issues | |
| 2905 # with logman.exe, which must be run separately to control the kernel | |
| 2906 # trace. | |
| 2907 child_cmd = [ | |
| 2908 sys.executable, | |
| 2909 self._signal_script, | |
| 2910 tracename, | |
| 2911 ] | |
| 2912 child = subprocess.Popen( | |
| 2913 child_cmd + tools.fix_python_path(cmd), | |
| 2914 cwd=cwd, | |
| 2915 stdin=subprocess.PIPE, | |
| 2916 stdout=stdout, | |
| 2917 stderr=stderr) | |
| 2918 logging.debug('Started child pid: %d' % child.pid) | |
| 2919 out = child.communicate()[0] | |
| 2920 # This doesn't mean all the grand-children are done. Sadly, we don't have | |
| 2921 # a good way to determine that. | |
| 2922 | |
| 2923 with self._lock: | |
| 2924 assert tracename not in (i['trace'] for i in self._traces) | |
| 2925 self._traces.append({ | |
| 2926 'cmd': cmd, | |
| 2927 'cwd': cwd, | |
| 2928 'output': out, | |
| 2929 'pid': child.pid, | |
| 2930 # Used to figure out the real process when process ids are reused. | |
| 2931 'thunk_cmd': child_cmd, | |
| 2932 'trace': tracename, | |
| 2933 }) | |
| 2934 | |
| 2935 return child.returncode, out | |
| 2936 | |
| 2937 def close(self, _timeout=None): | |
| 2938 """Stops the kernel log collection and converts the traces to text | |
| 2939 representation. | |
| 2940 """ | |
| 2941 with self._lock: | |
| 2942 try: | |
| 2943 super(LogmanTrace.Tracer, self).close() | |
| 2944 finally: | |
| 2945 cmd_stop = [ | |
| 2946 'logman.exe', | |
| 2947 'stop', | |
| 2948 'NT Kernel Logger', | |
| 2949 '-ets', # Sends the command directly to the kernel. | |
| 2950 ] | |
| 2951 logging.debug('Running: %s' % cmd_stop) | |
| 2952 subprocess.check_call( | |
| 2953 cmd_stop, | |
| 2954 stdin=subprocess.PIPE, | |
| 2955 stdout=subprocess.PIPE, | |
| 2956 stderr=subprocess.STDOUT) | |
| 2957 | |
| 2958 def post_process_log(self): | |
| 2959 """Converts the .etl file into .csv then into .json.""" | |
| 2960 super(LogmanTrace.Tracer, self).post_process_log() | |
| 2961 logformat = 'csv' | |
| 2962 self._convert_log(logformat) | |
| 2963 self._trim_log(logformat) | |
| 2964 | |
| 2965 def _gen_logdata(self): | |
| 2966 return { | |
| 2967 'format': 'csv', | |
| 2968 'traces': self._traces, | |
| 2969 } | |
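| # The resulting log metadata JSON looks roughly like this (values are | |
| # illustrative only): | |
| #   { | |
| #     "format": "csv", | |
| #     "traces": [ | |
| #       {"cmd": [...], "cwd": "...", "output": null, "pid": 123, | |
| #        "thunk_cmd": [...], "trace": "default"} | |
| #     ] | |
| #   } | |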
| 2970 | |
| 2971 def _trim_log(self, logformat): | |
| 2972 """Reduces the amount of data in original log by generating a 'reduced' | |
| 2973 log. | |
| 2974 """ | |
| 2975 if logformat == 'csv_utf16': | |
| 2976 file_handle = codecs.open( | |
| 2977 self._logname + '.' + logformat, 'r', encoding='utf-16') | |
| 2978 | |
| 2979 elif logformat == 'csv': | |
| 2980 assert sys.getfilesystemencoding() == 'mbcs' | |
| 2981 file_handle = codecs.open( | |
| 2982 self._logname + '.' + logformat, 'r', | |
| 2983 encoding=get_current_encoding()) | |
| 2984 | |
| 2985 supported_events = LogmanTrace.Context.supported_events() | |
| 2986 | |
| 2987 def trim(generator): | |
| 2988 """Loads items from the generator and returns the interesting data. | |
| 2989 | |
| 2990 It filters out any uninteresting lines and reduces the amount of data in | |
| 2991 the trace. | |
| 2992 """ | |
| 2993 for index, line in enumerate(generator): | |
| 2994 if not index: | |
| 2995 if line != self.EXPECTED_HEADER: | |
| 2996 raise TracingFailure( | |
| 2997 'Found malformed header: %s' % line, | |
| 2998 None, None, None) | |
| 2999 continue | |
| 3000 # Sanity check the columns that are expected to hold constant values: | |
| 3001 if (line[2] != '0' or # Event ID | |
| 3002 line[3] not in ('2', '3') or # Version | |
| 3003 line[4] != '0' or # Channel | |
| 3004 line[5] != '0' or # Level | |
| 3005 line[7] != '0' or # Task | |
| 3006 line[8] != '0x0000000000000000' or # Keyword | |
| 3007 line[12] != '' or # Instance ID | |
| 3008 line[13] != '' or # Parent Instance ID | |
| 3009 line[14] != self.NULL_GUID or # Activity ID | |
| 3010 line[15] != ''): # Related Activity ID | |
| 3011 raise TracingFailure( | |
| 3012 'Found unexpected values in line: %s' % ' '.join(line), | |
| 3013 None, None, None) | |
| 3014 | |
| 3015 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: | |
| 3016 continue | |
| 3017 | |
| 3018 yield [ | |
| 3019 line[self.EVENT_NAME], | |
| 3020 line[self.TYPE], | |
| 3021 line[self.PID], | |
| 3022 line[self.TID], | |
| 3023 line[self.PROCESSOR_ID], | |
| 3024 line[self.TIMESTAMP], | |
| 3025 ] + line[self.USER_DATA:] | |
| 3026 | |
| 3027 # Must not convert the trim() call into a list, since it would use too much | |
| 3028 # memory for large traces. Use a CSV file as a workaround since the JSON | |
| 3029 # parser requires a complete in-memory file. | |
| 3030 with open('%s.preprocessed' % self._logname, 'wb') as f: | |
| 3031 # $ and * can't be used in file names on Windows, reducing the likelihood | |
| 3032 # of having to escape a string. | |
| 3033 out = csv.writer( | |
| 3034 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL) | |
| 3035 for line in trim(self.CsvReader(file_handle)): | |
| 3036 out.writerow([s.encode('utf-8') for s in line]) | |
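| # Reading the '.preprocessed' file back mirrors parse_log() below; roughly: | |
| #   with open(self._logname + '.preprocessed', 'rb') as f: | |
| #     rows = csv.reader( | |
| #         f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL) | |
| #     lines = ([s.decode('utf-8') for s in row] for row in rows) | |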
| 3037 | |
| 3038 def _convert_log(self, logformat): | |
| 3039 """Converts the ETL trace to text representation. | |
| 3040 | |
| 3041 Normally, 'csv' is sufficient. If non-ASCII characters are expected | |
| 3042 (e.g. East Asian scripts), use 'csv_utf16'. If localization gets in the way, use 'xml'. | |
| 3043 | |
| 3044 Arguments: | |
| 3045 - logformat: Text format to be generated, csv, csv_utf16 or xml. | |
| 3046 | |
| 3047 Use "tracerpt -?" for help. | |
| 3048 """ | |
| 3049 LOCALE_INVARIANT = 0x7F | |
| 3050 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) | |
| 3051 cmd_convert = [ | |
| 3052 'tracerpt.exe', | |
| 3053 '-l', self._logname + '.etl', | |
| 3054 '-o', self._logname + '.' + logformat, | |
| 3055 '-gmt', # Use UTC | |
| 3056 '-y', # No prompt | |
| 3057 # Use -of XML to get the header of each items after column 19, e.g. all | |
| 3058 # the actual headers of 'User Data'. | |
| 3059 ] | |
| 3060 | |
| 3061 if logformat == 'csv': | |
| 3062 # tracerpt localizes the 'Type' column, which needlessly complicates | |
| 3063 # parsing; there is no apparent reason for it. | |
| 3064 cmd_convert.extend(['-of', 'CSV']) | |
| 3065 elif logformat == 'csv_utf16': | |
| 3066 # This causes it to use UTF-16, which doubles the log size but ensures | |
| 3067 # the log is readable for non-ASCII characters. | |
| 3068 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) | |
| 3069 elif logformat == 'xml': | |
| 3070 cmd_convert.extend(['-of', 'XML']) | |
| 3071 else: | |
| 3072 raise ValueError('Unexpected log format \'%s\'' % logformat) | |
| 3073 logging.debug('Running: %s' % cmd_convert) | |
| 3074 # This can take tens of minutes for large logs. | |
| 3075 # Redirects all output to stderr. | |
| 3076 subprocess.check_call( | |
| 3077 cmd_convert, | |
| 3078 stdin=subprocess.PIPE, | |
| 3079 stdout=sys.stderr, | |
| 3080 stderr=sys.stderr) | |
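| # For the default 'csv' format, the call above is roughly equivalent to: | |
| #   tracerpt.exe -l <logname>.etl -o <logname>.csv -gmt -y -of CSV | |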
| 3081 | |
| 3082 def __init__(self, use_sudo=False): # pylint: disable=W0613 | |
| 3083 super(LogmanTrace, self).__init__() | |
| 3084 # Ignore use_sudo. It's irrelevant on Windows but kept to simplify the API. | |
| 3085 | |
| 3086 @staticmethod | |
| 3087 def clean_trace(logname): | |
| 3088 for ext in ('', '.csv', '.etl', '.json', '.xml', '.preprocessed'): | |
| 3089 if os.path.isfile(logname + ext): | |
| 3090 os.remove(logname + ext) | |
| 3091 | |
| 3092 @classmethod | |
| 3093 def parse_log(cls, logname, blacklist, trace_name): | |
| 3094 logging.info('parse_log(%s, ..., %s)', logname, trace_name) | |
| 3095 assert os.path.isabs(logname) | |
| 3096 | |
| 3097 def blacklist_more(filepath): | |
| 3098 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that. | |
| 3099 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath) | |
| 3100 | |
| 3101 # Create a list of (Context, result_dict) tuples. This is necessary because | |
| 3102 # the csv file may be larger than the amount of available memory. | |
| 3103 contexts = [ | |
| 3104 ( | |
| 3105 cls.Context( | |
| 3106 blacklist_more, item['pid'], item['trace'], item['thunk_cmd']), | |
| 3107 { | |
| 3108 'output': item['output'], | |
| 3109 'trace': item['trace'], | |
| 3110 }, | |
| 3111 ) | |
| 3112 for item in read_json(logname)['traces'] | |
| 3113 if not trace_name or item['trace'] == trace_name | |
| 3114 ] | |
| 3115 | |
| 3116 # The log may be too large to fit in memory and it is not efficient to read | |
| 3117 # it multiple times, so multiplex the contexts instead, which is slightly | |
| 3118 # more awkward. | |
| 3119 with open('%s.preprocessed' % logname, 'rb') as f: | |
| 3120 lines = csv.reader( | |
| 3121 f, delimiter='$', quotechar='*', quoting=csv.QUOTE_MINIMAL) | |
| 3122 for encoded in lines: | |
| 3123 line = [s.decode('utf-8') for s in encoded] | |
| 3124 # Convert the PID in-place from hex. | |
| 3125 line[cls.Context.PID] = int(line[cls.Context.PID], 16) | |
| 3126 for context in contexts: | |
| 3127 if 'exception' in context[1]: | |
| 3128 continue | |
| 3129 try: | |
| 3130 context[0].on_line(line) | |
| 3131 except TracingFailure: | |
| 3132 context[1]['exception'] = sys.exc_info() | |
| 3133 | |
| 3134 for context in contexts: | |
| 3135 if 'exception' in context[1]: | |
| 3136 continue | |
| 3137 context[1]['results'] = context[0].to_results() | |
| 3138 | |
| 3139 return [context[1] for context in contexts] | |
| 3140 | |
| 3141 | |
| 3142 def get_api(**kwargs): | |
| 3143 """Returns the correct implementation for the current OS.""" | |
| 3144 if sys.platform == 'cygwin': | |
| 3145 raise NotImplementedError( | |
| 3146 'Not implemented for cygwin, start the script from Win32 python') | |
| 3147 flavors = { | |
| 3148 'win32': LogmanTrace, | |
| 3149 'darwin': Dtrace, | |
| 3150 'sunos5': Dtrace, | |
| 3151 'freebsd7': Dtrace, | |
| 3152 'freebsd8': Dtrace, | |
| 3153 } | |
| 3154 # Defaults to strace. | |
| 3155 return flavors.get(sys.platform, Strace)(**kwargs) | |
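| # Minimal sketch of using this module as a library (paths, command and | |
| # blacklist are hypothetical; CMDtrace/CMDread below show the real flow): | |
| #   api = get_api() | |
| #   returncode, _ = trace( | |
| #       '/tmp/example.log', ['/usr/bin/python', 'foo.py'], os.getcwd(), | |
| #       api, False) | |
| #   for item in api.parse_log('/tmp/example.log', gen_blacklist([]), None): | |
| #     if 'results' in item: | |
| #       print item['results'].files | |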
| 3156 | |
| 3157 | |
| 3158 def extract_directories(root_dir, files, blacklist): | |
| 3159 """Detects if all the files in a directory are in |files| and if so, replace | |
| 3160 the individual files by a Results.Directory instance. | |
| 3161 | |
| 3162 Takes a list of Results.File instances and returns a shorter list of | |
| 3163 Results.File and Results.Directory instances. | |
| 3164 | |
| 3165 Arguments: | |
| 3166 - root_dir: Optional base directory that shouldn't be searched further. | |
| 3167 - files: list of Results.File instances. | |
| 3168 - blacklist: lambda to reject unneeded files, for example r'.+\.pyc'. | |
| 3169 """ | |
| 3170 logging.info( | |
| 3171 'extract_directories(%s, %d files, ...)' % (root_dir, len(files))) | |
| 3172 assert not (root_dir or '').endswith(os.path.sep), root_dir | |
| 3173 # It is important for root_dir to not be a symlinked path, make sure to call | |
| 3174 # os.path.realpath() as needed. | |
| 3175 assert not root_dir or ( | |
| 3176 os.path.realpath(file_path.get_native_path_case(root_dir)) == root_dir) | |
| 3177 assert not any(isinstance(f, Results.Directory) for f in files) | |
| 3178 # Remove non-existent files. | |
| 3179 files = [f for f in files if f.existent] | |
| 3180 if not files: | |
| 3181 return files | |
| 3182 # All files must share the same root, which can be None. | |
| 3183 assert len(set(f.root for f in files)) == 1, set(f.root for f in files) | |
| 3184 | |
| 3185 # Creates a {directory: {filename: File}} mapping, up to root. | |
| 3186 buckets = {} | |
| 3187 if root_dir: | |
| 3188 buckets[root_dir] = {} | |
| 3189 for fileobj in files: | |
| 3190 path = fileobj.full_path | |
| 3191 directory = os.path.dirname(path) | |
| 3192 assert directory | |
| 3193 # Do not use os.path.basename() so trailing os.path.sep is kept. | |
| 3194 basename = path[len(directory)+1:] | |
| 3195 files_in_directory = buckets.setdefault(directory, {}) | |
| 3196 files_in_directory[basename] = fileobj | |
| 3197 # Add all the directories recursively up to root. | |
| 3198 while True: | |
| 3199 old_d = directory | |
| 3200 directory = os.path.dirname(directory) | |
| 3201 if directory + os.path.sep == root_dir or directory == old_d: | |
| 3202 break | |
| 3203 buckets.setdefault(directory, {}) | |
| 3204 | |
| 3205 root_prefix = len(root_dir) + 1 if root_dir else 0 | |
| 3206 for directory in sorted(buckets, reverse=True): | |
| 3207 if not os.path.isdir(directory): | |
| 3208 logging.debug( | |
| 3209 '%s was a directory but doesn\'t exist anymore; ignoring', directory) | |
| 3210 continue | |
| 3211 actual = set(f for f in os.listdir(directory) if not blacklist(f)) | |
| 3212 expected = set(buckets[directory]) | |
| 3213 if not (actual - expected): | |
| 3214 parent = os.path.dirname(directory) | |
| 3215 buckets[parent][os.path.basename(directory)] = Results.Directory( | |
| 3216 root_dir, | |
| 3217 directory[root_prefix:], | |
| 3218 False, | |
| 3219 sum(f.size for f in buckets[directory].itervalues()), | |
| 3220 sum(f.nb_files for f in buckets[directory].itervalues())) | |
| 3221 # Remove the whole bucket. | |
| 3222 del buckets[directory] | |
| 3223 | |
| 3224 # Reverse the mapping with what remains. The original instances are returned, | |
| 3225 # so the cached metadata is kept. | |
| 3226 files = sum((x.values() for x in buckets.itervalues()), []) | |
| 3227 return sorted(files, key=lambda x: x.path) | |
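| # For instance (hypothetical layout): if the trace touched every | |
| # non-blacklisted file under <root_dir>/third_party/lib/, the individual | |
| # Results.File entries are collapsed into one Results.Directory entry for | |
| # 'third_party/lib/' by the loop above. | |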
| 3228 | |
| 3229 | |
| 3230 def trace(logfile, cmd, cwd, api, output): | |
| 3231 """Traces an executable. Returns (returncode, output) from api. | |
| 3232 | |
| 3233 Arguments: | |
| 3234 - logfile: file to write to. | |
| 3235 - cmd: command to run. | |
| 3236 - cwd: current directory to start the process in. | |
| 3237 - api: a tracing api instance. | |
| 3238 - output: if True, returns the output, otherwise prints it to the console. | |
| 3239 """ | |
| 3240 cmd = tools.fix_python_path(cmd) | |
| 3241 api.clean_trace(logfile) | |
| 3242 with api.get_tracer(logfile) as tracer: | |
| 3243 return tracer.trace(cmd, cwd, 'default', output) | |
| 3244 | |
| 3245 | |
| 3246 def CMDclean(parser, args): | |
| 3247 """Cleans up traces.""" | |
| 3248 options, args = parser.parse_args(args) | |
| 3249 api = get_api() | |
| 3250 api.clean_trace(options.log) | |
| 3251 return 0 | |
| 3252 | |
| 3253 | |
| 3254 def CMDtrace(parser, args): | |
| 3255 """Traces an executable.""" | |
| 3256 parser.allow_interspersed_args = False | |
| 3257 parser.add_option( | |
| 3258 '-q', '--quiet', action='store_true', | |
| 3259 help='Redirects traced executable output to /dev/null') | |
| 3260 parser.add_option( | |
| 3261 '-s', '--sudo', action='store_true', | |
| 3262 help='Use sudo when shelling out the tracer tool (ignored on Windows)') | |
| 3263 parser.add_option( | |
| 3264 '-n', '--no-sudo', action='store_false', dest='sudo', | |
| 3265 help='Don\'t use sudo') | |
| 3266 options, args = parser.parse_args(args) | |
| 3267 | |
| 3268 if not args: | |
| 3269 parser.error('Please provide a command to run') | |
| 3270 | |
| 3271 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK): | |
| 3272 args[0] = os.path.abspath(args[0]) | |
| 3273 | |
| 3274 # options.sudo default value is None, which is to do whatever tracer defaults | |
| 3275 # do. | |
| 3276 api = get_api(use_sudo=options.sudo) | |
| 3277 return trace(options.log, args, os.getcwd(), api, options.quiet)[0] | |
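| # Typical invocations (log path and traced command are hypothetical): | |
| #   trace_inputs.py trace -l /tmp/example.log /usr/bin/python foo.py | |
| #   trace_inputs.py read  -l /tmp/example.log --root-dir /path/to/checkout | |
| #   trace_inputs.py clean -l /tmp/example.log | |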
| 3278 | |
| 3279 | |
| 3280 def CMDread(parser, args): | |
| 3281 """Reads the logs and prints the result.""" | |
| 3282 parser.add_option( | |
| 3283 '-V', '--variable', | |
| 3284 nargs=2, | |
| 3285 action='append', | |
| 3286 dest='variables', | |
| 3287 metavar='VAR_NAME directory', | |
| 3288 default=[], | |
| 3289 help=('Variables to replace relative directories against. Example: ' | |
| 3290 '"-v \'$HOME\' \'/home/%s\'" will replace all occurence of your ' | |
| 3291 'home dir with $HOME') % getpass.getuser()) | |
| 3292 parser.add_option( | |
| 3293 '--root-dir', | |
| 3294 help='Root directory to base everything off of. Anything outside of this ' | |
| 3295 'directory will not be reported') | |
| 3296 parser.add_option( | |
| 3297 '--trace-name', | |
| 3298 help='Only reads one of the traces. Defaults to reading all traces') | |
| 3299 parser.add_option( | |
| 3300 '-j', '--json', action='store_true', | |
| 3301 help='Outputs raw result data as json') | |
| 3302 parser.add_option( | |
| 3303 '--trace-blacklist', action='append', default=[], | |
| 3304 help='List of regexp to use as blacklist filter') | |
| 3305 options, args = parser.parse_args(args) | |
| 3306 | |
| 3307 if options.root_dir: | |
| 3308 options.root_dir = file_path.get_native_path_case( | |
| 3309 unicode(os.path.abspath(options.root_dir))) | |
| 3310 | |
| 3311 variables = dict(options.variables) | |
| 3312 api = get_api() | |
| 3313 blacklist = gen_blacklist(options.trace_blacklist) | |
| 3314 data = api.parse_log(options.log, blacklist, options.trace_name) | |
| 3315 # Process each trace. | |
| 3316 output_as_json = [] | |
| 3317 try: | |
| 3318 for item in data: | |
| 3319 if 'exception' in item: | |
| 3320 # Do not abort the other traces. | |
| 3321 print >> sys.stderr, ( | |
| 3322 'Trace %s: Got an exception: %s' % ( | |
| 3323 item['trace'], item['exception'][1])) | |
| 3324 continue | |
| 3325 results = item['results'] | |
| 3326 if options.root_dir: | |
| 3327 results = results.strip_root(options.root_dir) | |
| 3328 | |
| 3329 if options.json: | |
| 3330 output_as_json.append(results.flatten()) | |
| 3331 else: | |
| 3332 simplified = extract_directories( | |
| 3333 options.root_dir, results.files, blacklist) | |
| 3334 simplified = [f.replace_variables(variables) for f in simplified] | |
| 3335 if len(data) > 1: | |
| 3336 print('Trace: %s' % item['trace']) | |
| 3337 print('Total: %d' % len(results.files)) | |
| 3338 print('Non existent: %d' % len(results.non_existent)) | |
| 3339 for f in results.non_existent: | |
| 3340 print(' %s' % f.path) | |
| 3341 print( | |
| 3342 'Interesting: %d reduced to %d' % ( | |
| 3343 len(results.existent), len(simplified))) | |
| 3344 for f in simplified: | |
| 3345 print(' %s' % f.path) | |
| 3346 | |
| 3347 if options.json: | |
| 3348 write_json(sys.stdout, output_as_json, False) | |
| 3349 except KeyboardInterrupt: | |
| 3350 return 1 | |
| 3351 except IOError as e: | |
| 3352 if e.errno == errno.EPIPE: | |
| 3353 # Do not print a stack trace when the output is piped to less and the user | |
| 3354 # quits before the whole output was written. | |
| 3355 return 1 | |
| 3356 raise | |
| 3357 return 0 | |
| 3358 | |
| 3359 | |
| 3360 class OptionParserTraceInputs(tools.OptionParserWithLogging): | |
| 3361 """Adds automatic --log handling.""" | |
| 3362 | |
| 3363 # Disable --log-file options since both --log and --log-file options are | |
| 3364 # confusing. | |
| 3365 # TODO(vadimsh): Rename --log-file or --log to something else. | |
| 3366 enable_log_file = False | |
| 3367 | |
| 3368 def __init__(self, **kwargs): | |
| 3369 tools.OptionParserWithLogging.__init__(self, **kwargs) | |
| 3370 self.add_option( | |
| 3371 '-l', '--log', help='Log file to generate or read, required') | |
| 3372 | |
| 3373 def parse_args(self, *args, **kwargs): | |
| 3374 """Makes sure the paths make sense. | |
| 3375 | |
| 3376 On Windows, / and \ are often mixed together in a path. | |
| 3377 """ | |
| 3378 options, args = tools.OptionParserWithLogging.parse_args( | |
| 3379 self, *args, **kwargs) | |
| 3380 if not options.log: | |
| 3381 self.error('Must supply a log file with -l') | |
| 3382 options.log = os.path.abspath(options.log) | |
| 3383 return options, args | |
| 3384 | |
| 3385 | |
| 3386 def main(argv): | |
| 3387 dispatcher = subcommand.CommandDispatcher(__name__) | |
| 3388 try: | |
| 3389 return dispatcher.execute( | |
| 3390 OptionParserTraceInputs(version=__version__), argv) | |
| 3391 except TracingFailure, e: | |
| 3392 sys.stderr.write('\nError: ') | |
| 3393 sys.stderr.write(str(e)) | |
| 3394 sys.stderr.write('\n') | |
| 3395 return 1 | |
| 3396 | |
| 3397 | |
| 3398 if __name__ == '__main__': | |
| 3399 fix_encoding.fix_encoding() | |
| 3400 tools.disable_buffering() | |
| 3401 colorama.init() | |
| 3402 sys.exit(main(sys.argv[1:])) | |