OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # coding=utf-8 |
| 3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 4 # Use of this source code is governed by a BSD-style license that can be |
| 5 # found in the LICENSE file. |
| 6 |
| 7 """Traces an executable and its child processes and extract the files accessed |
| 8 by them. |
| 9 |
| 10 The implementation uses OS-specific APIs. The native kernel logger and the ETL |
| 11 interface are used on Windows. DTrace is used on OSX. strace is used otherwise. |
| 12 The OS-specific implementation is hidden in an 'API' interface. |
| 13 |
| 14 The results are embedded in a Results instance. The tracing is done in two |
| 15 phases: the first runs the actual trace and generates an |
| 16 implementation-specific log file. The log file is then parsed to extract the |
| 17 information, including the individual child processes and the files they |
| 18 accessed, from the log. |
| 19 """ |
| 20 |
| 21 import codecs |
| 22 import csv |
| 23 import getpass |
| 24 import glob |
| 25 import json |
| 26 import logging |
| 27 import optparse |
| 28 import os |
| 29 import re |
| 30 import subprocess |
| 31 import sys |
| 32 import tempfile |
| 33 import threading |
| 34 import time |
| 35 import weakref |
| 36 |
| 37 ## OS-specific imports |
| 38 |
| 39 if sys.platform == 'win32': |
| 40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p |
| 41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611 |
| 42 from ctypes.wintypes import GetLastError # pylint: disable=E0611 |
| 43 elif sys.platform == 'darwin': |
| 44 import Carbon.File # pylint: disable=F0401 |
| 45 import MacOS # pylint: disable=F0401 |
| 46 |
| 47 |
| 48 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) |
| 49 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) |
| 50 |
| 51 |
| 52 class TracingFailure(Exception): |
| 53 """An exception occured during tracing.""" |
| 54 def __init__(self, description, pid, line_number, line, *args): |
| 55 super(TracingFailure, self).__init__( |
| 56 description, pid, line_number, line, *args) |
| 57 self.description = description |
| 58 self.pid = pid |
| 59 self.line_number = line_number |
| 60 self.line = line |
| 61 self.extra = args |
| 62 |
| 63 def __str__(self): |
| 64 out = self.description |
| 65 if self.pid: |
| 66 out += '\npid: %d' % self.pid |
| 67 if self.line_number: |
| 68 out += '\nline: %d' % self.line_number |
| 69 if self.line: |
| 70 out += '\n%s' % self.line |
| 71 if self.extra: |
| 72 out += '\n' + ', '.join(map(str, filter(None, self.extra))) |
| 73 return out |
| 74 |
| 75 |
| 76 ## OS-specific functions |
| 77 |
| 78 if sys.platform == 'win32': |
| 79 def QueryDosDevice(drive_letter): |
| 80 """Returns the Windows 'native' path for a DOS drive letter.""" |
| 81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter |
| 82 # Guesswork. QueryDosDeviceW never returns the required number of bytes. |
| 83 chars = 1024 |
| 84 drive_letter = unicode(drive_letter) |
| 85 p = create_unicode_buffer(chars) |
| 86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars): |
| 87 err = GetLastError() |
| 88 if err: |
| 89 # pylint: disable=E0602 |
| 90 raise WindowsError( |
| 91 err, |
| 92 'QueryDosDevice(%s): %s (%d)' % ( |
| 93 str(drive_letter), FormatError(err), err)) |
| 94 return p.value |
| 95 |
| 96 |
| 97 def GetShortPathName(long_path): |
| 98 """Returns the Windows short path equivalent for a 'long' path.""" |
| 99 long_path = unicode(long_path) |
| 100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is |
| 101 # not enforced. |
| 102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'): |
| 103 long_path = '\\\\?\\' + long_path |
| 104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0) |
| 105 if chars: |
| 106 p = create_unicode_buffer(chars) |
| 107 if windll.kernel32.GetShortPathNameW(long_path, p, chars): |
| 108 return p.value |
| 109 |
| 110 err = GetLastError() |
| 111 if err: |
| 112 # pylint: disable=E0602 |
| 113 raise WindowsError( |
| 114 err, |
| 115 'GetShortPathName(%s): %s (%d)' % ( |
| 116 str(long_path), FormatError(err), err)) |
| 117 |
| 118 |
| 119 def GetLongPathName(short_path): |
| 120 """Returns the Windows long path equivalent for a 'short' path.""" |
| 121 short_path = unicode(short_path) |
| 122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is |
| 123 # not enforced. |
| 124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'): |
| 125 short_path = '\\\\?\\' + short_path |
| 126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0) |
| 127 if chars: |
| 128 p = create_unicode_buffer(chars) |
| 129 if windll.kernel32.GetLongPathNameW(short_path, p, chars): |
| 130 return p.value |
| 131 |
| 132 err = GetLastError() |
| 133 if err: |
| 134 # pylint: disable=E0602 |
| 135 raise WindowsError( |
| 136 err, |
| 137 'GetLongPathName(%s): %s (%d)' % ( |
| 138 str(short_path), FormatError(err), err)) |
| 139 |
| 140 |
| 141 def get_current_encoding(): |
| 142 """Returns the 'ANSI' code page associated to the process.""" |
| 143 return 'cp%d' % int(windll.kernel32.GetACP()) |
| 144 |
| 145 |
| 146 class DosDriveMap(object): |
| 147 """Maps \Device\HarddiskVolumeN to N: on Windows.""" |
| 148 # Keep one global cache. |
| 149 _MAPPING = {} |
| 150 |
| 151 def __init__(self): |
| 152 """Lazy loads the cache.""" |
| 153 if not self._MAPPING: |
| 154 # This is related to the UNC resolver on Windows. Ignore it. |
| 155 self._MAPPING['\\Device\\Mup'] = None |
| 156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot'] |
| 157 |
| 158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)): |
| 159 try: |
| 160 letter = '%s:' % letter |
| 161 mapped = QueryDosDevice(letter) |
| 162 if mapped in self._MAPPING: |
| 163 logging.warn( |
| 164 ('Two drives, \'%s\' and \'%s\', are mapped to the same disk' |
| 165 '. Drive letters are a user-mode concept and the kernel ' |
| 166 'traces only have NT paths, so all accesses will be ' |
| 167 'associated with the first drive letter, independent of the ' |
| 168 'actual letter used by the code') % ( |
| 169 self._MAPPING[mapped], letter)) |
| 170 else: |
| 171 self._MAPPING[mapped] = letter |
| 172 except WindowsError: # pylint: disable=E0602 |
| 173 pass |
| 174 |
| 175 def to_win32(self, path): |
| 176 """Converts a native NT path to Win32/DOS compatible path.""" |
| 177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path) |
| 178 if not match: |
| 179 raise ValueError( |
| 180 'Can\'t convert %s into a Win32 compatible path' % path, |
| 181 path) |
| 182 if not match.group(1) in self._MAPPING: |
| 183 # Unmapped partitions may be accessed by Windows for the |
| 184 # fun of it while the test is running. Discard these. |
| 185 return None |
| 186 drive = self._MAPPING[match.group(1)] |
| 187 if not drive or not match.group(2): |
| 188 return drive |
| 189 return drive + match.group(2) |
| 190 |
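A minimal sketch of how the mapping is meant to be used (editor's example, not part of the patch; the HarddiskVolume1 -> C: association below is an assumption, the real mapping comes from QueryDosDevice() at runtime):
# mapper = DosDriveMap()
# mapper.to_win32('\\Device\\HarddiskVolume1\\Windows\\notepad.exe')
#   -> 'C:\\Windows\\notepad.exe'   (assuming HarddiskVolume1 is mapped to C:)
# mapper.to_win32('\\Device\\HarddiskVolume9\\foo')
#   -> None   (volume without a drive letter; such accesses are discarded)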
| 191 |
| 192 def isabs(path): |
| 193 """Accepts X: as an absolute path, unlike python's os.path.isabs().""" |
| 194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':' |
| 195 |
| 196 |
| 197 def get_native_path_case(p): |
| 198 """Returns the native path case for an existing file. |
| 199 |
| 200 On Windows, removes any leading '\\?\'. |
| 201 """ |
| 202 if not isabs(p): |
| 203 raise ValueError( |
| 204 'Can\'t get native path case for a non-absolute path: %s' % p, |
| 205 p) |
| 206 # Windows used to have an option to turn on case sensitivity on the non-Win32 |
| 207 # subsystems, but that's out of scope here and isn't supported anymore. |
| 208 # Go figure why GetShortPathName() is needed. |
| 209 try: |
| 210 out = GetLongPathName(GetShortPathName(p)) |
| 211 except OSError, e: |
| 212 if e.args[0] in (2, 3, 5): |
| 213 # The path does not exist. Try to recurse and reconstruct the path. |
| 214 base = os.path.dirname(p) |
| 215 rest = os.path.basename(p) |
| 216 return os.path.join(get_native_path_case(base), rest) |
| 217 raise |
| 218 if out.startswith('\\\\?\\'): |
| 219 out = out[4:] |
| 220 # Always upper case the first letter since GetLongPathName() will return the |
| 221 # drive letter in the case it was given. |
| 222 return out[0].upper() + out[1:] |
| 223 |
| 224 |
| 225 def CommandLineToArgvW(command_line): |
| 226 """Splits a commandline into argv using CommandLineToArgvW().""" |
| 227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx |
| 228 size = c_int() |
| 229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size)) |
| 230 try: |
| 231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)] |
| 232 finally: |
| 233 windll.kernel32.LocalFree(ptr) |
| 234 |
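For reference, a hedged illustration of the splitting done by the Win32 CommandLineToArgvW() call wrapped above (the command line is illustrative):
# CommandLineToArgvW(u'child.exe "foo bar" baz')
#   -> [u'child.exe', u'foo bar', u'baz']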
| 235 |
| 236 elif sys.platform == 'darwin': |
| 237 |
| 238 |
| 239 # On non-Windows, keep the stdlib behavior. |
| 240 isabs = os.path.isabs |
| 241 |
| 242 |
| 243 def _find_item_native_case(root_path, item): |
| 244 """Gets the native path case of a single item based at root_path. |
| 245 |
| 246 There is no API to get the native path case of symlinks on OSX. So it |
| 247 needs to be done the slow way. |
| 248 """ |
| 249 item = item.lower() |
| 250 for element in os.listdir(root_path): |
| 251 if element.lower() == item: |
| 252 return element |
| 253 |
| 254 |
| 255 def _native_case(p): |
| 256 """Gets the native path case. Warning: this function resolves symlinks.""" |
| 257 logging.debug('native_case(%s)' % p) |
| 258 try: |
| 259 rel_ref, _ = Carbon.File.FSPathMakeRef(p) |
| 260 out = rel_ref.FSRefMakePath() |
| 261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep): |
| 262 return out + os.path.sep |
| 263 return out |
| 264 except MacOS.Error, e: |
| 265 if e.args[0] in (-43, -120): |
| 266 # The path does not exist. Try to recurse and reconstruct the path. |
| 267 # -43 means file not found. |
| 268 # -120 means directory not found. |
| 269 base = os.path.dirname(p) |
| 270 rest = os.path.basename(p) |
| 271 return os.path.join(_native_case(base), rest) |
| 272 raise OSError( |
| 273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1]) |
| 274 |
| 275 |
| 276 def _split_at_symlink_native(base_path, rest): |
| 277 """Returns the native path for a symlink.""" |
| 278 base, symlink, rest = split_at_symlink(base_path, rest) |
| 279 if symlink: |
| 280 if not base_path: |
| 281 base_path = base |
| 282 else: |
| 283 base_path = safe_join(base_path, base) |
| 284 symlink = _find_item_native_case(base_path, symlink) |
| 285 return base, symlink, rest |
| 286 |
| 287 |
| 288 def get_native_path_case(path): |
| 289 """Returns the native path case for an existing file. |
| 290 |
| 291 Technically, it's only HFS+ on OSX that is case preserving and |
| 292 insensitive. It's the default setting on HFS+ but can be changed. |
| 293 """ |
| 294 if not isabs(path): |
| 295 raise ValueError( |
| 296 'Can\'t get native path case for a non-absolute path: %s' % path, |
| 297 path) |
| 298 if path.startswith('/dev'): |
| 299 # /dev is not visible from Carbon, causing an exception. |
| 300 return path |
| 301 |
| 302 # Starts assuming there is no symlink along the path. |
| 303 resolved = _native_case(path) |
| 304 if resolved.lower() == path.lower(): |
| 305 # This code path is much faster. |
| 306 return resolved |
| 307 |
| 308 # There was a symlink, process it. |
| 309 base, symlink, rest = _split_at_symlink_native(None, path) |
| 310 assert symlink, (path, base, symlink, rest, resolved) |
| 311 prev = base |
| 312 base = safe_join(_native_case(base), symlink) |
| 313 assert len(base) > len(prev) |
| 314 while rest: |
| 315 prev = base |
| 316 relbase, symlink, rest = _split_at_symlink_native(base, rest) |
| 317 base = safe_join(base, relbase) |
| 318 assert len(base) > len(prev), (prev, base, symlink) |
| 319 if symlink: |
| 320 base = safe_join(base, symlink) |
| 321 assert len(base) > len(prev), (prev, base, symlink) |
| 322 # Make sure no symlink was resolved. |
| 323 assert base.lower() == path.lower(), (base, path) |
| 324 return base |
| 325 |
| 326 |
| 327 else: # OSes other than Windows and OSX. |
| 328 |
| 329 |
| 330 # On non-Windows, keep the stdlib behavior. |
| 331 isabs = os.path.isabs |
| 332 |
| 333 |
| 334 def get_native_path_case(path): |
| 335 """Returns the native path case for an existing file. |
| 336 |
| 337 On OSes other than OSX and Windows, assume the file system is |
| 338 case-sensitive. |
| 339 |
| 340 TODO(maruel): This is not strictly true. Implement if necessary. |
| 341 """ |
| 342 if not isabs(path): |
| 343 raise ValueError( |
| 344 'Can\'t get native path case for a non-absolute path: %s' % path, |
| 345 path) |
| 346 # Give up on cygwin, as GetLongPathName() can't be called. |
| 347 # Linux traces tend not to be normalized, so take this opportunity to |
| 348 # normalize the path. The implementations for the other OSes already |
| 349 # normalize it, so it needs to be done here too to be coherent across OSes. |
| 350 out = os.path.normpath(path) |
| 351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep): |
| 352 return out + os.path.sep |
| 353 return out |
| 354 |
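On these OSes the function boils down to path normalization; a small sketch of the expected behavior (editor's example):
# get_native_path_case('/tmp/./foo//bar') -> '/tmp/foo/bar'
# get_native_path_case('/tmp/foo/')       -> '/tmp/foo/'   (trailing separator preserved)
# get_native_path_case('relative/path')   -> raises ValueError (path must be absolute)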
| 355 |
| 356 if sys.platform != 'win32': # All non-Windows OSes. |
| 357 |
| 358 |
| 359 def safe_join(*args): |
| 360 """Joins path elements like os.path.join() but doesn't abort on absolute |
| 361 path. |
| 362 |
| 363 os.path.join('foo', '/bar') == '/bar' |
| 364 but safe_join('foo', '/bar') == 'foo/bar'. |
| 365 """ |
| 366 out = '' |
| 367 for element in args: |
| 368 if element.startswith(os.path.sep): |
| 369 if out.endswith(os.path.sep): |
| 370 out += element[1:] |
| 371 else: |
| 372 out += element |
| 373 else: |
| 374 if out.endswith(os.path.sep): |
| 375 out += element |
| 376 else: |
| 377 out += os.path.sep + element |
| 378 return out |
| 379 |
| 380 |
| 381 def split_at_symlink(base_dir, relfile): |
| 382 """Scans each component of relfile and cut the string at the symlink if |
| 383 there is any. |
| 384 |
| 385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if |
| 386 no symlink was found. |
| 387 """ |
| 388 if base_dir: |
| 389 assert relfile |
| 390 assert os.path.isabs(base_dir) |
| 391 index = 0 |
| 392 else: |
| 393 assert os.path.isabs(relfile) |
| 394 index = 1 |
| 395 |
| 396 def at_root(rest): |
| 397 if base_dir: |
| 398 return safe_join(base_dir, rest) |
| 399 return rest |
| 400 |
| 401 while True: |
| 402 try: |
| 403 index = relfile.index(os.path.sep, index) |
| 404 except ValueError: |
| 405 index = len(relfile) |
| 406 full = at_root(relfile[:index]) |
| 407 if os.path.islink(full): |
| 408 # A symlink! |
| 409 base = os.path.dirname(relfile[:index]) |
| 410 symlink = os.path.basename(relfile[:index]) |
| 411 rest = relfile[index:] |
| 412 logging.debug( |
| 413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' % |
| 414 (base_dir, relfile, base, symlink, rest)) |
| 415 return base, symlink, rest |
| 416 if index == len(relfile): |
| 417 break |
| 418 index += 1 |
| 419 return relfile, None, None |
| 420 |
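A hedged illustration of the return value (editor's sketch; it assumes '/base/lib' is a symlink on the local file system):
# split_at_symlink('/base', 'lib/libc.so') -> ('', 'lib', '/libc.so')
# split_at_symlink('/base', 'src/main.c')  -> ('src/main.c', None, None)   (no symlink found)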
| 421 |
| 422 def fix_python_path(cmd): |
| 423 """Returns the fixed command line to call the right python executable.""" |
| 424 out = cmd[:] |
| 425 if out[0] == 'python': |
| 426 out[0] = sys.executable |
| 427 elif out[0].endswith('.py'): |
| 428 out.insert(0, sys.executable) |
| 429 return out |
| 430 |
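Editor's example of the two rewrite cases (results reference sys.executable rather than literal paths):
# fix_python_path(['python', 'build.py', '-v'])  -> [sys.executable, 'build.py', '-v']
# fix_python_path(['build.py', '-v'])            -> [sys.executable, 'build.py', '-v']
# fix_python_path(['/bin/ls', '-l'])             -> ['/bin/ls', '-l']   (unchanged)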
| 431 |
| 432 def create_thunk(): |
| 433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py') |
| 434 os.write( |
| 435 handle, |
| 436 ( |
| 437 'import subprocess\n' |
| 438 'import sys\n' |
| 439 'sys.exit(subprocess.call(sys.argv[2:]))\n' |
| 440 )) |
| 441 os.close(handle) |
| 442 return name |
| 443 |
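The generated thunk simply re-execs sys.argv[2:] and ignores sys.argv[1]; a hedged usage sketch (what the real caller puts in argv[1], e.g. a log file path, is an assumption here):
# thunk = create_thunk()
# subprocess.call([sys.executable, thunk, '/tmp/ignored.log', '/bin/echo', 'hi'])
# os.remove(thunk)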
| 444 |
| 445 def strace_process_quoted_arguments(text): |
| 446 """Extracts quoted arguments on a string and return the arguments as a list. |
| 447 |
| 448 Implemented as an automaton. Supports incomplete strings in the form |
| 449 '"foo"...'. |
| 450 |
| 451 Example: |
| 452 With text = '"foo", "bar"', the function will return ['foo', 'bar'] |
| 453 |
| 454 TODO(maruel): Implement escaping. |
| 455 """ |
| 456 # All the possible states of the DFA. |
| 457 ( NEED_QUOTE, # Beginning of a new argument. |
| 458 INSIDE_STRING, # Inside an argument. |
| 459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is. |
| 460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be |
| 461 # a series of 3 dots or a comma. |
| 462 NEED_SPACE, # Right after a comma. |
| 463 NEED_DOT_2, # Found a dot, need a second one. |
| 464 NEED_DOT_3, # Found second dot, need a third one. |
| 465 NEED_COMMA, # Found third dot, need a comma. |
| 466 ) = range(8) |
| 467 |
| 468 state = NEED_QUOTE |
| 469 out = [] |
| 470 for index, char in enumerate(text): |
| 471 if char == '"': |
| 472 if state == NEED_QUOTE: |
| 473 state = INSIDE_STRING |
| 474 # A new argument was found. |
| 475 out.append('') |
| 476 elif state == INSIDE_STRING: |
| 477 # The argument is now closed. |
| 478 state = NEED_COMMA_OR_DOT |
| 479 elif state == ESCAPED: |
| 480 out[-1] += char |
| 481 state = INSIDE_STRING |
| 482 else: |
| 483 raise ValueError( |
| 484 'Can\'t process char at column %d for: %r' % (index, text), |
| 485 index, |
| 486 text) |
| 487 elif char == ',': |
| 488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA): |
| 489 state = NEED_SPACE |
| 490 elif state == INSIDE_STRING: |
| 491 out[-1] += char |
| 492 elif state == ESCAPED: |
| 493 out[-1] += char |
| 494 state = INSIDE_STRING |
| 495 else: |
| 496 raise ValueError( |
| 497 'Can\'t process char at column %d for: %r' % (index, text), |
| 498 index, |
| 499 text) |
| 500 elif char == ' ': |
| 501 if state == NEED_SPACE: |
| 502 state = NEED_QUOTE |
| 503 elif state == INSIDE_STRING: |
| 504 out[-1] += char |
| 505 elif state == ESCAPED: |
| 506 out[-1] += char |
| 507 state = INSIDE_STRING |
| 508 else: |
| 509 raise ValueError( |
| 510 'Can\'t process char at column %d for: %r' % (index, text), |
| 511 index, |
| 512 text) |
| 513 elif char == '.': |
| 514 if state == NEED_COMMA_OR_DOT: |
| 515 # The string is incomplete; this means the strace -s flag should be |
| 516 # increased. |
| 517 state = NEED_DOT_2 |
| 518 elif state == NEED_DOT_2: |
| 519 state = NEED_DOT_3 |
| 520 elif state == NEED_DOT_3: |
| 521 state = NEED_COMMA |
| 522 elif state == INSIDE_STRING: |
| 523 out[-1] += char |
| 524 elif state == ESCAPED: |
| 525 out[-1] += char |
| 526 state = INSIDE_STRING |
| 527 else: |
| 528 raise ValueError( |
| 529 'Can\'t process char at column %d for: %r' % (index, text), |
| 530 index, |
| 531 text) |
| 532 elif char == '\\': |
| 533 if state == ESCAPED: |
| 534 out[-1] += char |
| 535 state = INSIDE_STRING |
| 536 elif state == INSIDE_STRING: |
| 537 state = ESCAPED |
| 538 else: |
| 539 raise ValueError( |
| 540 'Can\'t process char at column %d for: %r' % (index, text), |
| 541 index, |
| 542 text) |
| 543 else: |
| 544 if state == INSIDE_STRING: |
| 545 out[-1] += char |
| 546 else: |
| 547 raise ValueError( |
| 548 'Can\'t process char at column %d for: %r' % (index, text), |
| 549 index, |
| 550 text) |
| 551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT): |
| 552 raise ValueError( |
| 553 'String is incorrectly terminated: %r' % text, |
| 554 text) |
| 555 return out |
| 556 |
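Editor's examples, following the docstring above:
# strace_process_quoted_arguments('"foo", "bar"') -> ['foo', 'bar']
# strace_process_quoted_arguments('"/usr/lo"...') -> ['/usr/lo']   (string truncated by strace -s)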
| 557 |
| 558 def read_json(filepath): |
| 559 with open(filepath, 'r') as f: |
| 560 return json.load(f) |
| 561 |
| 562 |
| 563 def write_json(filepath_or_handle, data, dense): |
| 564 """Writes data into filepath or file handle encoded as json. |
| 565 |
| 566 If dense is True, the json is packed. Otherwise, it is human readable. |
| 567 """ |
| 568 if hasattr(filepath_or_handle, 'write'): |
| 569 if dense: |
| 570 filepath_or_handle.write( |
| 571 json.dumps(data, sort_keys=True, separators=(',',':'))) |
| 572 else: |
| 573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2)) |
| 574 else: |
| 575 with open(filepath_or_handle, 'wb') as f: |
| 576 if dense: |
| 577 json.dump(data, f, sort_keys=True, separators=(',',':')) |
| 578 else: |
| 579 json.dump(data, f, sort_keys=True, indent=2) |
| 580 |
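A small round-trip sketch (editor's example; the /tmp path is illustrative):
# write_json('/tmp/example.json', {'traces': []}, False)   # human readable, indented
# write_json('/tmp/example.json', {'traces': []}, True)    # dense/packed
# read_json('/tmp/example.json')  -> {u'traces': []}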
| 581 |
| 582 class Results(object): |
| 583 """Results of a trace session.""" |
| 584 |
| 585 class _TouchedObject(object): |
| 586 """Something, a file or a directory, that was accessed.""" |
| 587 def __init__(self, root, path, tainted, size, nb_files): |
| 588 logging.debug( |
| 589 '%s(%s, %s, %s, %s, %s)' % |
| 590 (self.__class__.__name__, root, path, tainted, size, nb_files)) |
| 591 self.root = root |
| 592 self.path = path |
| 593 self.tainted = tainted |
| 594 self.nb_files = nb_files |
| 595 # Can be used as a cache or a default value, depending on context. |
| 596 self._size = size |
| 597 # These are cache only. |
| 598 self._real_path = None |
| 599 |
| 600 # Check internal consistency. |
| 601 assert path, path |
| 602 assert tainted or bool(root) != bool(isabs(path)), (root, path) |
| 603 assert tainted or ( |
| 604 not os.path.exists(self.full_path) or |
| 605 (self.full_path == get_native_path_case(self.full_path))), ( |
| 606 tainted, self.full_path, get_native_path_case(self.full_path)) |
| 607 |
| 608 @property |
| 609 def existent(self): |
| 610 return self.size != -1 |
| 611 |
| 612 @property |
| 613 def full_path(self): |
| 614 if self.root: |
| 615 return os.path.join(self.root, self.path) |
| 616 return self.path |
| 617 |
| 618 @property |
| 619 def real_path(self): |
| 620 """Returns the path with symlinks resolved.""" |
| 621 if not self._real_path: |
| 622 self._real_path = os.path.realpath(self.full_path) |
| 623 return self._real_path |
| 624 |
| 625 @property |
| 626 def size(self): |
| 627 """File's size. -1 is not existent.""" |
| 628 if self._size is None and not self.tainted: |
| 629 try: |
| 630 self._size = os.stat(self.full_path).st_size |
| 631 except OSError: |
| 632 self._size = -1 |
| 633 return self._size |
| 634 |
| 635 def flatten(self): |
| 636 """Returns a dict representing this object. |
| 637 |
| 638 A 'size' of 0 means the file was only touched and not read. |
| 639 """ |
| 640 return { |
| 641 'path': self.path, |
| 642 'size': self.size, |
| 643 } |
| 644 |
| 645 def replace_variables(self, variables): |
| 646 """Replaces the root of this File with one of the variables if it matches. |
| 647 |
| 648 If a variable replacement occurs, the cloned object becomes tainted. |
| 649 """ |
| 650 for variable, root_path in variables.iteritems(): |
| 651 if self.path.startswith(root_path): |
| 652 return self._clone( |
| 653 self.root, variable + self.path[len(root_path):], True) |
| 654 # No need to clone, return self. |
| 655 return self |
| 656 |
| 657 def strip_root(self, root): |
| 658 """Returns a clone of itself with 'root' stripped off.""" |
| 659 # Check internal consistency. |
| 660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root |
| 661 if not self.full_path.startswith(root): |
| 662 # Now try to resolve the symlinks to see if it can be reached this way. |
| 663 # Only try *after* trying without resolving symlink. |
| 664 if not self.real_path.startswith(root): |
| 665 return None |
| 666 path = self.real_path |
| 667 else: |
| 668 path = self.full_path |
| 669 return self._clone(root, path[len(root):], self.tainted) |
| 670 |
| 671 def _clone(self, new_root, new_path, tainted): |
| 672 raise NotImplementedError(self.__class__.__name__) |
| 673 |
| 674 class File(_TouchedObject): |
| 675 """A file that was accessed. May not be present anymore. |
| 676 |
| 677 If tainted is true, it means it is not a real path anymore because a variable |
| 678 replacement occurred. |
| 679 |
| 680 If touched_only is True, the file was probed for existence and does exist, |
| 681 but it was never _opened_. In other words, a touched_only file must have |
| 682 existed at the time of the probe. |
| 683 """ |
| 684 def __init__(self, root, path, tainted, size): |
| 685 super(Results.File, self).__init__(root, path, tainted, size, 1) |
| 686 |
| 687 def _clone(self, new_root, new_path, tainted): |
| 688 """Clones itself keeping meta-data.""" |
| 689 # Keep the self.size and self._real_path caches for performance reasons. It |
| 690 # is also important when the file becomes tainted (with a variable instead |
| 691 # of the real path) since self.path is not an on-disk path anymore so |
| 692 # out._size cannot be updated. |
| 693 out = self.__class__(new_root, new_path, tainted, self.size) |
| 694 out._real_path = self._real_path |
| 695 return out |
| 696 |
| 697 class Directory(_TouchedObject): |
| 698 """A directory of files. Must exist.""" |
| 699 def __init__(self, root, path, tainted, size, nb_files): |
| 700 """path='.' is a valid value and must be handled appropriately.""" |
| 701 assert not path.endswith(os.path.sep), path |
| 702 super(Results.Directory, self).__init__( |
| 703 root, path + os.path.sep, tainted, size, nb_files) |
| 704 # For a Directory instance, self.size is not a cache, it's an actual value |
| 705 # that is never modified and represents the total size of the files |
| 706 # contained in this directory. It is possible that the directory is empty |
| 707 # so that size == 0; this happens if there's only an invalid symlink in |
| 708 # it. |
| 709 |
| 710 def flatten(self): |
| 711 out = super(Results.Directory, self).flatten() |
| 712 out['nb_files'] = self.nb_files |
| 713 return out |
| 714 |
| 715 def _clone(self, new_root, new_path, tainted): |
| 716 """Clones itself keeping meta-data.""" |
| 717 out = self.__class__( |
| 718 new_root, |
| 719 new_path.rstrip(os.path.sep), |
| 720 tainted, |
| 721 self.size, |
| 722 self.nb_files) |
| 723 out._real_path = self._real_path |
| 724 return out |
| 725 |
| 726 class Process(object): |
| 727 """A process that was traced. |
| 728 |
| 729 Contains references to the files accessed by this process and its children. |
| 730 """ |
| 731 def __init__(self, pid, files, executable, command, initial_cwd, children): |
| 732 logging.debug('Process(%s, %d, ...)' % (pid, len(files))) |
| 733 self.pid = pid |
| 734 self.files = sorted(files, key=lambda x: x.path) |
| 735 self.children = children |
| 736 self.executable = executable |
| 737 self.command = command |
| 738 self.initial_cwd = initial_cwd |
| 739 |
| 740 # Check internal consistency. |
| 741 assert len(set(f.path for f in self.files)) == len(self.files), sorted( |
| 742 f.path for f in self.files) |
| 743 assert isinstance(self.children, list) |
| 744 assert isinstance(self.files, list) |
| 745 |
| 746 @property |
| 747 def all(self): |
| 748 for child in self.children: |
| 749 for i in child.all: |
| 750 yield i |
| 751 yield self |
| 752 |
| 753 def flatten(self): |
| 754 return { |
| 755 'children': [c.flatten() for c in self.children], |
| 756 'command': self.command, |
| 757 'executable': self.executable, |
| 758 'files': [f.flatten() for f in self.files], |
| 759 'initial_cwd': self.initial_cwd, |
| 760 'pid': self.pid, |
| 761 } |
| 762 |
| 763 def strip_root(self, root): |
| 764 assert isabs(root) and root.endswith(os.path.sep), root |
| 765 # Loads the files after since they are constructed as objects. |
| 766 out = self.__class__( |
| 767 self.pid, |
| 768 filter(None, (f.strip_root(root) for f in self.files)), |
| 769 self.executable, |
| 770 self.command, |
| 771 self.initial_cwd, |
| 772 [c.strip_root(root) for c in self.children]) |
| 773 logging.debug( |
| 774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files))) |
| 775 return out |
| 776 |
| 777 def __init__(self, process): |
| 778 self.process = process |
| 779 # Cache. |
| 780 self._files = None |
| 781 |
| 782 def flatten(self): |
| 783 return { |
| 784 'root': self.process.flatten(), |
| 785 } |
| 786 |
| 787 @property |
| 788 def files(self): |
| 789 if self._files is None: |
| 790 self._files = sorted( |
| 791 sum((p.files for p in self.process.all), []), |
| 792 key=lambda x: x.path) |
| 793 return self._files |
| 794 |
| 795 @property |
| 796 def existent(self): |
| 797 return [f for f in self.files if f.existent] |
| 798 |
| 799 @property |
| 800 def non_existent(self): |
| 801 return [f for f in self.files if not f.existent] |
| 802 |
| 803 def strip_root(self, root): |
| 804 """Returns a clone with all the files outside the directory |root| removed |
| 805 and converts all the path to be relative paths. |
| 806 """ |
| 807 # Resolve any symlink |
| 808 root = os.path.realpath(root) |
| 809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep |
| 810 logging.debug('strip_root(%s)' % root) |
| 811 return Results(self.process.strip_root(root)) |
| 812 |
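A minimal sketch of the data model above (editor's example; all values are made up):
# proc = Results.Process(
#     pid=42, files=[], executable='/bin/true', command=['/bin/true'],
#     initial_cwd='/', children=[])
# results = Results(proc)
# results.flatten()['root']['pid']   -> 42
# results.flatten()['root']['files'] -> []
# results.existent                   -> []   (no files were recorded)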
| 813 |
| 814 class ApiBase(object): |
| 815 """OS-agnostic API to trace a process and its children.""" |
| 816 class Context(object): |
| 817 """Processes one log line at a time and keeps the list of traced processes. |
| 818 |
| 819 The parsing is complicated by the fact that logs are emitted out of order for |
| 820 strace but in order for dtrace and logman. In addition, on Windows it is |
| 821 very frequent that process ids are reused, so a flat list cannot be used. But |
| 822 at the same time, it is impossible to faithfully construct a graph when the |
| 823 logs are processed out of order. So both a tree and a flat mapping are used: |
| 824 the tree is the real process tree, while the flat mapping stores the last |
| 825 valid process for the corresponding process id. For the strace case, the |
| 826 tree's head is guessed at the last moment. |
| 827 """ |
| 828 class Process(object): |
| 829 """Keeps context for one traced child process. |
| 830 |
| 831 Logs all the files this process touched. Ignores directories. |
| 832 """ |
| 833 def __init__(self, blacklist, pid, initial_cwd): |
| 834 # Check internal consistency. |
| 835 assert isinstance(pid, int), repr(pid) |
| 836 self.pid = pid |
| 837 # children are Process instances. |
| 838 self.children = [] |
| 839 self.initial_cwd = initial_cwd |
| 840 self.cwd = None |
| 841 self.files = set() |
| 842 self.only_touched = set() |
| 843 self.executable = None |
| 844 self.command = None |
| 845 self._blacklist = blacklist |
| 846 |
| 847 def to_results_process(self): |
| 848 """Resolves file case sensitivity and or late-bound strings.""" |
| 849 # When resolving files, it's normal to get dupe because a file could be |
| 850 # opened multiple times with different case. Resolve the deduplication |
| 851 # here. |
| 852 def fix_path(x): |
| 853 """Returns the native file path case. |
| 854 |
| 855 Converts late-bound strings. |
| 856 """ |
| 857 if not x: |
| 858 # Do not convert None instance to 'None'. |
| 859 return x |
| 860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't |
| 861 # know the file path encoding so they must be treated as bytes. |
| 862 x = unicode(x) |
| 863 if os.path.isabs(x): |
| 864 # If the path is not absolute, which tends to happen occasionally on |
| 865 # Windows, it is not possible to get the native path case so ignore |
| 866 # that trace. It mostly happens for 'executable' value. |
| 867 x = get_native_path_case(x) |
| 868 return x |
| 869 |
| 870 def fix_and_blacklist_path(x): |
| 871 x = fix_path(x) |
| 872 if not x: |
| 873 return |
| 874 # The blacklist needs to be reapplied, since path casing could |
| 875 # influence blacklisting. |
| 876 if self._blacklist(x): |
| 877 return |
| 878 return x |
| 879 |
| 880 # Filters out directories. Some may have passed through. |
| 881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f) |
| 882 only_touched = set( |
| 883 f for f in map(fix_and_blacklist_path, self.only_touched) if f) |
| 884 only_touched -= files |
| 885 |
| 886 files = [ |
| 887 Results.File(None, f, False, None) for f in files |
| 888 if not os.path.isdir(f) |
| 889 ] |
| 890 # Using 0 as size means the file's content is ignored since the file was |
| 891 # never opened for I/O. |
| 892 files.extend( |
| 893 Results.File(None, f, False, 0) for f in only_touched |
| 894 if not os.path.isdir(f) |
| 895 ) |
| 896 return Results.Process( |
| 897 self.pid, |
| 898 files, |
| 899 fix_path(self.executable), |
| 900 self.command, |
| 901 fix_path(self.initial_cwd), |
| 902 [c.to_results_process() for c in self.children]) |
| 903 |
| 904 def add_file(self, filepath, touch_only): |
| 905 """Adds a file if it passes the blacklist.""" |
| 906 if self._blacklist(unicode(filepath)): |
| 907 return |
| 908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only)) |
| 909 # Note that filepath and not unicode(filepath) is added. It is because |
| 910 # filepath could be something other than a string, like a RelativePath |
| 911 # instance for dtrace logs. |
| 912 if touch_only: |
| 913 self.only_touched.add(filepath) |
| 914 else: |
| 915 self.files.add(filepath) |
| 916 |
| 917 def __init__(self, blacklist): |
| 918 self.blacklist = blacklist |
| 919 # Initial process. |
| 920 self.root_process = None |
| 921 # dict to accelerate process lookup, so we don't have to walk the whole graph |
| 922 # each time. |
| 923 self._process_lookup = {} |
| 924 |
| 925 class Tracer(object): |
| 926 """During it's lifetime, the tracing subsystem is enabled.""" |
| 927 def __init__(self, logname): |
| 928 self._logname = logname |
| 929 self._lock = threading.Lock() |
| 930 self._traces = [] |
| 931 self._initialized = True |
| 932 |
| 933 def trace(self, cmd, cwd, tracename, output): |
| 934 """Runs the OS-specific trace program on an executable. |
| 935 |
| 936 Arguments: |
| 937 - cmd: The command (a list) to run. |
| 938 - cwd: Current directory to start the child process in. |
| 939 - tracename: Name of the trace in the logname file. |
| 940 - output: If False, redirects output to PIPEs. |
| 941 |
| 942 Returns a tuple (resultcode, output) and updates the internal trace |
| 943 entries. |
| 944 """ |
| 945 # The implementation adds an item to self._traces. |
| 946 raise NotImplementedError(self.__class__.__name__) |
| 947 |
| 948 def close(self, _timeout=None): |
| 949 """Saves the meta-data in the logname file. |
| 950 |
| 951 For kernel-based tracing, stops the tracing subsystem. |
| 952 |
| 953 Must not be used manually when using 'with' construct. |
| 954 """ |
| 955 with self._lock: |
| 956 assert self._initialized |
| 957 try: |
| 958 data = { |
| 959 'traces': self._traces, |
| 960 } |
| 961 write_json(self._logname, data, False) |
| 962 finally: |
| 963 self._initialized = False |
| 964 |
| 965 def post_process_log(self): |
| 966 """Post-processes the log so it becomes faster to load afterward. |
| 967 |
| 968 Must not be used manually when using 'with' construct. |
| 969 """ |
| 970 assert not self._initialized, 'Must stop tracing first.' |
| 971 |
| 972 def __enter__(self): |
| 973 """Enables 'with' statement.""" |
| 974 return self |
| 975 |
| 976 def __exit__(self, exc_type, exc_value, traceback): |
| 977 """Enables 'with' statement.""" |
| 978 self.close() |
| 979 # If an exception was thrown, do not process logs. |
| 980 if not exc_type: |
| 981 self.post_process_log() |
| 982 |
| 983 def get_tracer(self, logname): |
| 984 """Returns an ApiBase.Tracer instance. |
| 985 |
| 986 Initializes the tracing subsystem, which is a requirement for kernel-based |
| 987 tracers. Only one tracer instance should be live at a time! |
| 988 |
| 989 logname is the filepath to the json file that will contain the meta-data |
| 990 about the logs. |
| 991 """ |
| 992 return self.Tracer(logname) |
| 993 |
| 994 @staticmethod |
| 995 def clean_trace(logname): |
| 996 """Deletes an old log.""" |
| 997 raise NotImplementedError() |
| 998 |
| 999 @classmethod |
| 1000 def parse_log(cls, logname, blacklist): |
| 1001 """Processes trace logs and returns the files opened and the files that do |
| 1002 not exist. |
| 1003 |
| 1004 It does not track directories. |
| 1005 |
| 1006 Most of the time, files that do not exist are temporary test files that |
| 1007 should be put in /tmp instead. See http://crbug.com/116251. |
| 1008 |
| 1009 Returns a list of dicts with keys: |
| 1010 - results: A Results instance. |
| 1011 - trace: The corresponding tracename parameter provided to |
| 1012 get_tracer().trace(). |
| 1013 - output: Output gathered during execution, if get_tracer().trace(..., |
| 1014 output=False) was used. |
| 1015 """ |
| 1016 raise NotImplementedError(cls.__name__) |
| 1017 |
| 1018 |
| 1019 class Strace(ApiBase): |
| 1020 """strace implies linux.""" |
| 1021 class Context(ApiBase.Context): |
| 1022 """Processes a strace log line and keeps the list of existent and non |
| 1023 existent files accessed. |
| 1024 |
| 1025 Ignores directories. |
| 1026 |
| 1027 Uses late-binding to process the cwd of each process. The problem is that |
| 1028 strace generates one log file per process it traced but doesn't give any |
| 1029 information about which process was started when and by whom, so we don't |
| 1030 even know which process is the initial one. Therefore the logs are processed |
| 1031 out of order and late binding with RelativePath is used to deduce the |
| 1032 initial directory of each process once all the logs are parsed. |
| 1033 """ |
| 1034 class Process(ApiBase.Context.Process): |
| 1035 """Represents the state of a process. |
| 1036 |
| 1037 Contains all the information retrieved from the pid-specific log. |
| 1038 """ |
| 1039 # Function names are using ([a-z_0-9]+) |
| 1040 # This is the most common format. function(args) = result |
| 1041 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$') |
| 1042 # An interrupted function call, only grab the minimal header. |
| 1043 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$') |
| 1044 # A resumed function call. |
| 1045 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$') |
| 1046 # A process received a signal. |
| 1047 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---') |
| 1048 # A process didn't handle a signal. Ignore any junk appearing before it, |
| 1049 # because the process was forcibly killed so it won't open any new files. |
| 1050 RE_KILLED = re.compile( |
| 1051 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$') |
| 1052 # The process has exited. |
| 1053 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+') |
| 1054 # A call was canceled. Ignore any prefix. |
| 1055 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$') |
| 1056 # Happens when strace fails to even get the function name. |
| 1057 UNNAMED_FUNCTION = '????' |
| 1058 |
| 1059 # Corner-case in python, a class member function decorator must not be |
| 1060 # @staticmethod. |
| 1061 def parse_args(regexp, expect_zero): # pylint: disable=E0213 |
| 1062 """Automatically convert the str 'args' into a list of processed |
| 1063 arguments. |
| 1064 |
| 1065 Arguments: |
| 1066 - regexp is used to parse args. |
| 1067 - expect_zero: one of True, False or None. |
| 1068 - True: will check for result.startswith('0') first and will ignore |
| 1069 the trace line completely otherwise. This is important because for |
| 1070 many functions, the regexp will not match if the call failed. |
| 1071 - False: will check for not result.startswith(('?', '-1')) for the |
| 1072 same reason as with True. |
| 1073 - None: ignore result. |
| 1074 """ |
| 1075 def meta_hook(function): |
| 1076 assert function.__name__.startswith('handle_') |
| 1077 def hook(self, args, result): |
| 1078 if expect_zero is True and not result.startswith('0'): |
| 1079 return |
| 1080 if expect_zero is False and result.startswith(('?', '-1')): |
| 1081 return |
| 1082 match = re.match(regexp, args) |
| 1083 if not match: |
| 1084 raise TracingFailure( |
| 1085 'Failed to parse %s(%s) = %s' % |
| 1086 (function.__name__[len('handle_'):], args, result), |
| 1087 None, None, None) |
| 1088 return function(self, match.groups(), result) |
| 1089 return hook |
| 1090 return meta_hook |
| 1091 |
| 1092 class RelativePath(object): |
| 1093 """A late-bound relative path.""" |
| 1094 def __init__(self, parent, value): |
| 1095 self.parent = parent |
| 1096 self.value = value |
| 1097 |
| 1098 def render(self): |
| 1099 """Returns the current directory this instance is representing. |
| 1100 |
| 1101 This function is used to return the late-bound value. |
| 1102 """ |
| 1103 if self.value and self.value.startswith(u'/'): |
| 1104 # An absolute path. |
| 1105 return self.value |
| 1106 parent = self.parent.render() if self.parent else u'<None>' |
| 1107 if self.value: |
| 1108 return os.path.normpath(os.path.join(parent, self.value)) |
| 1109 return parent |
| 1110 |
| 1111 def __unicode__(self): |
| 1112 """Acts as a string whenever needed.""" |
| 1113 return unicode(self.render()) |
| 1114 |
| 1115 def __str__(self): |
| 1116 """Acts as a string whenever needed.""" |
| 1117 return str(self.render()) |
| 1118 |
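Editor's sketch of the late-binding behavior (the fake parent below stands in for a Context or Process, both of which expose render()):
# class FakeRoot(object):
#   def render(self):
#     return u'/home/user/src'
# rel = Strace.Context.Process.RelativePath(FakeRoot(), u'out/Release')
# unicode(rel)  -> u'/home/user/src/out/Release'
# absolute = Strace.Context.Process.RelativePath(None, u'/usr/lib')
# unicode(absolute)  -> u'/usr/lib'   (absolute values ignore the parent)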
| 1119 def __init__(self, root, pid): |
| 1120 """Keeps enough information to be able to guess the original process |
| 1121 root. |
| 1122 |
| 1123 strace doesn't store which process was the initial process. So more |
| 1124 information needs to be kept so the graph can be reconstructed from the |
| 1125 flat map. |
| 1126 """ |
| 1127 logging.info('%s(%d)' % (self.__class__.__name__, pid)) |
| 1128 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None) |
| 1129 assert isinstance(root, ApiBase.Context) |
| 1130 self._root = weakref.ref(root) |
| 1131 # The dict key is the function name of the pending call, like 'open' |
| 1132 # or 'execve'. |
| 1133 self._pending_calls = {} |
| 1134 self._line_number = 0 |
| 1135 # Current directory when the process started. |
| 1136 self.initial_cwd = self.RelativePath(self._root(), None) |
| 1137 self.parentid = None |
| 1138 |
| 1139 def get_cwd(self): |
| 1140 """Returns the best known value of cwd.""" |
| 1141 return self.cwd or self.initial_cwd |
| 1142 |
| 1143 def render(self): |
| 1144 """Returns the string value of the RelativePath() object. |
| 1145 |
| 1146 Used by RelativePath. Returns the initial directory and not the |
| 1147 current one, since the current directory 'cwd' is only valid for a limited time. |
| 1148 |
| 1149 The validity is only guaranteed once all the logs are processed. |
| 1150 """ |
| 1151 return self.initial_cwd.render() |
| 1152 |
| 1153 def on_line(self, line): |
| 1154 self._line_number += 1 |
| 1155 if self.RE_SIGNAL.match(line): |
| 1156 # Ignore signals. |
| 1157 return |
| 1158 |
| 1159 try: |
| 1160 match = self.RE_KILLED.match(line) |
| 1161 if match: |
| 1162 # Converts a '+++ killed by Foo +++' trace into an exit_group(). |
| 1163 self.handle_exit_group(match.group(1), None) |
| 1164 return |
| 1165 |
| 1166 match = self.RE_PROCESS_EXITED.match(line) |
| 1167 if match: |
| 1168 # Converts a '+++ exited with 1 +++' trace into an exit_group() |
| 1169 self.handle_exit_group(match.group(1), None) |
| 1170 return |
| 1171 |
| 1172 match = self.RE_UNFINISHED.match(line) |
| 1173 if match: |
| 1174 if match.group(1) in self._pending_calls: |
| 1175 raise TracingFailure( |
| 1176 'Found two unfinished calls for the same function', |
| 1177 None, None, None, |
| 1178 self._pending_calls) |
| 1179 self._pending_calls[match.group(1)] = ( |
| 1180 match.group(1) + match.group(2)) |
| 1181 return |
| 1182 |
| 1183 match = self.RE_UNAVAILABLE.match(line) |
| 1184 if match: |
| 1185 # This usually means a process was killed and a pending call was |
| 1186 # canceled. |
| 1187 # TODO(maruel): Look up the last exit_group() trace just above and |
| 1188 # make sure any self._pending_calls[anything] is properly flushed. |
| 1189 return |
| 1190 |
| 1191 match = self.RE_RESUMED.match(line) |
| 1192 if match: |
| 1193 if match.group(1) not in self._pending_calls: |
| 1194 raise TracingFailure( |
| 1195 'Found a resumed call that was not logged as unfinished', |
| 1196 None, None, None, |
| 1197 self._pending_calls) |
| 1198 pending = self._pending_calls.pop(match.group(1)) |
| 1199 # Reconstruct the line. |
| 1200 line = pending + match.group(2) |
| 1201 |
| 1202 match = self.RE_HEADER.match(line) |
| 1203 if not match: |
| 1204 raise TracingFailure( |
| 1205 'Found an invalid line: %s' % line, |
| 1206 None, None, None) |
| 1207 if match.group(1) == self.UNNAMED_FUNCTION: |
| 1208 return |
| 1209 |
| 1210 # It's a valid line, handle it. |
| 1211 handler = getattr(self, 'handle_%s' % match.group(1), None) |
| 1212 if not handler: |
| 1213 self._handle_unknown(match.group(1), match.group(2), match.group(3)) |
| 1214 return handler(match.group(2), match.group(3)) |
| 1215 except TracingFailure, e: |
| 1216 # Hack in the values since the handler could be a static function. |
| 1217 e.pid = self.pid |
| 1218 e.line = line |
| 1219 e.line_number = self._line_number |
| 1220 # Re-raise the modified exception. |
| 1221 raise |
| 1222 except (KeyError, NotImplementedError, ValueError), e: |
| 1223 raise TracingFailure( |
| 1224 'Trace generated a %s exception: %s' % ( |
| 1225 e.__class__.__name__, str(e)), |
| 1226 self.pid, |
| 1227 self._line_number, |
| 1228 line, |
| 1229 e) |
| 1230 |
| 1231 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True) |
| 1232 def handle_access(self, args, _result): |
| 1233 self._handle_file(args[0], True) |
| 1234 |
| 1235 @parse_args(r'^\"(.+?)\"$', True) |
| 1236 def handle_chdir(self, args, _result): |
| 1237 """Updates cwd.""" |
| 1238 self.cwd = self.RelativePath(self, args[0]) |
| 1239 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd)) |
| 1240 |
| 1241 def handle_clone(self, _args, result): |
| 1242 """Transfers cwd.""" |
| 1243 if result.startswith(('?', '-1')): |
| 1244 # The call failed. |
| 1245 return |
| 1246 # Update the other process right away. |
| 1247 childpid = int(result) |
| 1248 child = self._root().get_or_set_proc(childpid) |
| 1249 if child.parentid is not None or childpid in self.children: |
| 1250 raise TracingFailure( |
| 1251 'Found internal inconsistency in process lifetime detection ' |
| 1252 'during a clone() call', |
| 1253 None, None, None) |
| 1254 |
| 1255 # Copy the cwd object. |
| 1256 child.initial_cwd = self.get_cwd() |
| 1257 child.parentid = self.pid |
| 1258 # It is necessary because the logs are processed out of order. |
| 1259 self.children.append(child) |
| 1260 |
| 1261 def handle_close(self, _args, _result): |
| 1262 pass |
| 1263 |
| 1264 def handle_chmod(self, _args, _result): |
| 1265 pass |
| 1266 |
| 1267 def handle_creat(self, _args, _result): |
| 1268 # Ignore files created, since they didn't need to exist. |
| 1269 pass |
| 1270 |
| 1271 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True) |
| 1272 def handle_execve(self, args, _result): |
| 1273 # Even if in practice execve() doesn't return when it succeeds, strace |
| 1274 # still prints '0' as the result. |
| 1275 filepath = args[0] |
| 1276 self._handle_file(filepath, False) |
| 1277 self.executable = self.RelativePath(self.get_cwd(), filepath) |
| 1278 self.command = strace_process_quoted_arguments(args[1]) |
| 1279 |
| 1280 def handle_exit_group(self, _args, _result): |
| 1281 """Removes cwd.""" |
| 1282 self.cwd = None |
| 1283 |
| 1284 def handle_fork(self, args, result): |
| 1285 self._handle_unknown('fork', args, result) |
| 1286 |
| 1287 def handle_getcwd(self, _args, _result): |
| 1288 pass |
| 1289 |
| 1290 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) |
| 1291 def handle_link(self, args, _result): |
| 1292 self._handle_file(args[0], False) |
| 1293 self._handle_file(args[1], False) |
| 1294 |
| 1295 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) |
| 1296 def handle_lstat(self, args, _result): |
| 1297 self._handle_file(args[0], True) |
| 1298 |
| 1299 def handle_mkdir(self, _args, _result): |
| 1300 pass |
| 1301 |
| 1302 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) |
| 1303 def handle_open(self, args, _result): |
| 1304 if 'O_DIRECTORY' in args[1]: |
| 1305 return |
| 1306 self._handle_file(args[0], False) |
| 1307 |
| 1308 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) |
| 1309 def handle_openat(self, args, _result): |
| 1310 if 'O_DIRECTORY' in args[2]: |
| 1311 return |
| 1312 if args[0] == 'AT_FDCWD': |
| 1313 self._handle_file(args[1], False) |
| 1314 else: |
| 1315 # TODO(maruel): Implement relative open if necessary instead of the |
| 1316 # AT_FDCWD flag, let's hope not since this means tracking all active |
| 1317 # directory handles. |
| 1318 raise Exception('Relative open via openat not implemented.') |
| 1319 |
| 1320 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False) |
| 1321 def handle_readlink(self, args, _result): |
| 1322 self._handle_file(args[0], False) |
| 1323 |
| 1324 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) |
| 1325 def handle_rename(self, args, _result): |
| 1326 self._handle_file(args[0], False) |
| 1327 self._handle_file(args[1], False) |
| 1328 |
| 1329 def handle_rmdir(self, _args, _result): |
| 1330 pass |
| 1331 |
| 1332 def handle_setxattr(self, _args, _result): |
| 1333 pass |
| 1334 |
| 1335 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) |
| 1336 def handle_stat(self, args, _result): |
| 1337 self._handle_file(args[0], True) |
| 1338 |
| 1339 def handle_symlink(self, _args, _result): |
| 1340 pass |
| 1341 |
| 1342 def handle_unlink(self, _args, _result): |
| 1343 # In theory, the file had to be created anyway. |
| 1344 pass |
| 1345 |
| 1346 def handle_statfs(self, _args, _result): |
| 1347 pass |
| 1348 |
| 1349 def handle_vfork(self, args, result): |
| 1350 self._handle_unknown('vfork', args, result) |
| 1351 |
| 1352 @staticmethod |
| 1353 def _handle_unknown(function, args, result): |
| 1354 raise TracingFailure( |
| 1355 'Unexpected/unimplemented trace %s(%s)= %s' % |
| 1356 (function, args, result), |
| 1357 None, None, None) |
| 1358 |
| 1359 def _handle_file(self, filepath, touch_only): |
| 1360 filepath = self.RelativePath(self.get_cwd(), filepath) |
| 1361 #assert not touch_only, unicode(filepath) |
| 1362 self.add_file(filepath, touch_only) |
| 1363 |
| 1364 def __init__(self, blacklist, initial_cwd): |
| 1365 super(Strace.Context, self).__init__(blacklist) |
| 1366 self.initial_cwd = initial_cwd |
| 1367 |
| 1368 def render(self): |
| 1369 """Returns the string value of the initial cwd of the root process. |
| 1370 |
| 1371 Used by RelativePath. |
| 1372 """ |
| 1373 return self.initial_cwd |
| 1374 |
| 1375 def on_line(self, pid, line): |
| 1376 """Transfers control into the Process.on_line() function.""" |
| 1377 self.get_or_set_proc(pid).on_line(line.strip()) |
| 1378 |
| 1379 def to_results(self): |
| 1380 """Finds back the root process and verify consistency.""" |
| 1381 # TODO(maruel): Absolutely unnecessary, fix me. |
| 1382 root = [p for p in self._process_lookup.itervalues() if not p.parentid] |
| 1383 if len(root) != 1: |
| 1384 raise TracingFailure( |
| 1385 'Found internal inconsistency in process lifetime detection ' |
| 1386 'while finding the root process', |
| 1387 None, |
| 1388 None, |
| 1389 None, |
| 1390 sorted(p.pid for p in root)) |
| 1391 self.root_process = root[0] |
| 1392 process = self.root_process.to_results_process() |
| 1393 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): |
| 1394 raise TracingFailure( |
| 1395 'Found internal inconsistency in process lifetime detection ' |
| 1396 'while looking for len(tree) == len(list)', |
| 1397 None, |
| 1398 None, |
| 1399 None, |
| 1400 sorted(self._process_lookup), |
| 1401 sorted(p.pid for p in process.all)) |
| 1402 return Results(process) |
| 1403 |
| 1404 def get_or_set_proc(self, pid): |
| 1405 """Returns the Context.Process instance for this pid or creates a new one. |
| 1406 """ |
| 1407 if not pid or not isinstance(pid, int): |
| 1408 raise TracingFailure( |
| 1409 'Unexpected value for pid: %r' % pid, |
| 1410 pid, |
| 1411 None, |
| 1412 None, |
| 1413 pid) |
| 1414 if pid not in self._process_lookup: |
| 1415 self._process_lookup[pid] = self.Process(self, pid) |
| 1416 return self._process_lookup[pid] |
| 1417 |
| 1418 @classmethod |
| 1419 def traces(cls): |
| 1420 """Returns the list of all handled traces to pass this as an argument to |
| 1421 strace. |
| 1422 """ |
| 1423 prefix = 'handle_' |
| 1424 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)] |
| 1425 |
| 1426 class Tracer(ApiBase.Tracer): |
| 1427 MAX_LEN = 256 |
| 1428 |
| 1429 def trace(self, cmd, cwd, tracename, output): |
| 1430 """Runs strace on an executable.""" |
| 1431 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) |
| 1432 assert os.path.isabs(cmd[0]), cmd[0] |
| 1433 assert os.path.isabs(cwd), cwd |
| 1434 assert os.path.normpath(cwd) == cwd, cwd |
| 1435 with self._lock: |
| 1436 if not self._initialized: |
| 1437 raise TracingFailure( |
| 1438 'Called Tracer.trace() on an uninitialized object', |
| 1439 None, None, None, tracename) |
| 1440 assert tracename not in (i['trace'] for i in self._traces) |
| 1441 stdout = stderr = None |
| 1442 if output: |
| 1443 stdout = subprocess.PIPE |
| 1444 stderr = subprocess.STDOUT |
| 1445 # Ensure all file related APIs are hooked. |
| 1446 traces = ','.join(Strace.Context.traces() + ['file']) |
| 1447 trace_cmd = [ |
| 1448 'strace', |
| 1449 '-ff', |
| 1450 '-s', '%d' % self.MAX_LEN, |
| 1451 '-e', 'trace=%s' % traces, |
| 1452 '-o', self._logname + '.' + tracename, |
| 1453 ] |
| 1454 child = subprocess.Popen( |
| 1455 trace_cmd + cmd, |
| 1456 cwd=cwd, |
| 1457 stdin=subprocess.PIPE, |
| 1458 stdout=stdout, |
| 1459 stderr=stderr) |
| 1460 out = child.communicate()[0] |
| 1461 # TODO(maruel): Walking the logs to figure out the root process would |
| 1462 # simplify parsing the logs a *lot*. |
| 1463 with self._lock: |
| 1464 assert tracename not in (i['trace'] for i in self._traces) |
| 1465 self._traces.append( |
| 1466 { |
| 1467 'cmd': cmd, |
| 1468 'cwd': cwd, |
| 1469 # The pid of strace process, not very useful. |
| 1470 'pid': child.pid, |
| 1471 'trace': tracename, |
| 1472 'output': out, |
| 1473 }) |
| 1474 return child.returncode, out |
| 1475 |
| 1476 @staticmethod |
| 1477 def clean_trace(logname): |
| 1478 if os.path.isfile(logname): |
| 1479 os.remove(logname) |
| 1480 # Also delete any pid specific file from previous traces. |
| 1481 for i in glob.iglob(logname + '.*'): |
| 1482 if i.rsplit('.', 1)[1].isdigit(): |
| 1483 os.remove(i) |
| 1484 |
| 1485 @classmethod |
| 1486 def parse_log(cls, logname, blacklist): |
| 1487 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
| 1488 data = read_json(logname) |
| 1489 out = [] |
| 1490 for item in data['traces']: |
| 1491 result = { |
| 1492 'trace': item['trace'], |
| 1493 'output': item['output'], |
| 1494 } |
| 1495 try: |
| 1496 context = cls.Context(blacklist, item['cwd']) |
| 1497 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])): |
| 1498 pid = pidfile.rsplit('.', 1)[1] |
| 1499 if pid.isdigit(): |
| 1500 pid = int(pid) |
| 1501 # TODO(maruel): Load as utf-8 |
| 1502 for line in open(pidfile, 'rb'): |
| 1503 context.on_line(pid, line) |
| 1504 result['results'] = context.to_results() |
| 1505 except TracingFailure, e: |
| 1506 result['exception'] = e |
| 1507 out.append(result) |
| 1508 return out |
| 1509 |
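End-to-end sketch of how the Strace backend above fits together (editor's example; the paths and command are illustrative, strace must be installed, and the results depend on the local system):
# api = Strace()
# api.clean_trace('/tmp/example_trace.json')
# with api.get_tracer('/tmp/example_trace.json') as tracer:
#   returncode, output = tracer.trace(['/bin/ls', '/'], '/', 'run1', True)
# for item in Strace.parse_log('/tmp/example_trace.json', lambda path: False):
#   print item.get('exception') or item['results'].existent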
| 1510 |
| 1511 class Dtrace(ApiBase): |
| 1512 """Uses DTrace framework through dtrace. Requires root access. |
| 1513 |
| 1514 Implies Mac OSX. |
| 1515 |
| 1516 dtruss can't be used because it has compatibility issues with python. |
| 1517 |
| 1518 Also, the pid->cwd handling needs to be done manually since OSX has no way to |
| 1519 get the absolute path of the 'cwd' dtrace variable from the probe. |
| 1520 |
| 1521 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see |
| 1522 https://discussions.apple.com/thread/1980539. So resort to handling execve() |
| 1523 manually. |
| 1524 |
| 1525 errno is not printed in the log since this implementation currently only cares |
| 1526 about files that were successfully opened. |
| 1527 """ |
| 1528 class Context(ApiBase.Context): |
| 1529 # Format: index pid function(args) |
| 1530 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$') |
| 1531 |
| 1532 # Arguments parsing. |
| 1533 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$') |
| 1534 RE_CHDIR = re.compile(r'^\"(.+?)\"$') |
| 1535 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$') |
| 1536 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$') |
| 1537 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$') |
| 1538 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$') |
| 1539 |
| 1540 O_DIRECTORY = 0x100000 |
| 1541 |
| 1542 class Process(ApiBase.Context.Process): |
| 1543 def __init__(self, *args): |
| 1544 super(Dtrace.Context.Process, self).__init__(*args) |
| 1545 self.cwd = self.initial_cwd |
| 1546 |
| 1547 def __init__(self, blacklist, tracer_pid, initial_cwd): |
| 1548 logging.info( |
| 1549 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd)) |
| 1550 super(Dtrace.Context, self).__init__(blacklist) |
| 1551 # Process ID of the temporary script created by create_thunk(). |
| 1552 self._tracer_pid = tracer_pid |
| 1553 self._initial_cwd = initial_cwd |
| 1554 self._line_number = 0 |
| 1555 |
| 1556 def on_line(self, line): |
| 1557 self._line_number += 1 |
| 1558 match = self.RE_HEADER.match(line) |
| 1559 if not match: |
| 1560 raise TracingFailure( |
| 1561 'Found malformed line: %s' % line, |
| 1562 None, |
| 1563 self._line_number, |
| 1564 line) |
| 1565 fn = getattr( |
| 1566 self, |
| 1567 'handle_%s' % match.group(2).replace('-', '_'), |
| 1568 self._handle_ignored) |
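| # e.g. an 'open_nocancel' event dispatches to handle_open_nocancel();
| # unmatched events fall back to _handle_ignored(), which raises.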
| 1569 # It is guaranteed to succeed because of the regexp. Or at least I thought |
| 1570 # it would. |
| 1571 pid = int(match.group(1)) |
| 1572 try: |
| 1573 return fn(pid, match.group(3)) |
| 1574 except TracingFailure, e: |
| 1575 # Hack in the values since the handler could be a static function. |
| 1576 e.pid = pid |
| 1577 e.line = line |
| 1578 e.line_number = self._line_number |
| 1579 # Re-raise the modified exception. |
| 1580 raise |
| 1581 except (KeyError, NotImplementedError, ValueError), e: |
| 1582 raise TracingFailure( |
| 1583 'Trace generated a %s exception: %s' % ( |
| 1584 e.__class__.__name__, str(e)), |
| 1585 pid, |
| 1586 self._line_number, |
| 1587 line, |
| 1588 e) |
| 1589 |
| 1590 def to_results(self): |
| 1591 process = self.root_process.to_results_process() |
| 1592 # Internal consistency check.
| 1593 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): |
| 1594 raise TracingFailure( |
| 1595 'Found internal inconsistency in process lifetime detection '
| 1596 'while looking for len(tree) == len(list)', |
| 1597 None, |
| 1598 None, |
| 1599 None, |
| 1600 sorted(self._process_lookup), |
| 1601 sorted(p.pid for p in process.all)) |
| 1602 return Results(process) |
| 1603 |
| 1604 def handle_dtrace_BEGIN(self, _pid, args): |
| 1605 if not self.RE_DTRACE_BEGIN.match(args): |
| 1606 raise TracingFailure( |
| 1607 'Found internal inconsistency in dtrace_BEGIN log line',
| 1608 None, None, None) |
| 1609 |
| 1610 def handle_proc_start(self, pid, args): |
| 1611 """Transfers cwd. |
| 1612 |
| 1613 The dtrace script already takes care of only tracing the processes that |
| 1614 are child of the traced processes so there is no need to verify the |
| 1615 process hierarchy. |
| 1616 """ |
| 1617 if pid in self._process_lookup: |
| 1618 raise TracingFailure( |
| 1619 'Found internal inconsistency in proc_start: %d started twice' %
| 1620 pid, |
| 1621 None, None, None) |
| 1622 match = self.RE_PROC_START.match(args) |
| 1623 if not match: |
| 1624 raise TracingFailure( |
| 1625 'Failed to parse arguments: %s' % args, |
| 1626 None, None, None) |
| 1627 ppid = int(match.group(1)) |
| 1628 if ppid == self._tracer_pid and not self.root_process: |
| 1629 proc = self.root_process = self.Process( |
| 1630 self.blacklist, pid, self._initial_cwd) |
| 1631 elif ppid in self._process_lookup: |
| 1632 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd) |
| 1633 self._process_lookup[ppid].children.append(proc) |
| 1634 else: |
| 1635 # Another process tree, ignore. |
| 1636 return |
| 1637 self._process_lookup[pid] = proc |
| 1638 logging.debug( |
| 1639 'New child: %s -> %d cwd:%s' % |
| 1640 (ppid, pid, unicode(proc.initial_cwd))) |
| 1641 |
| 1642 def handle_proc_exit(self, pid, _args): |
| 1643 """Removes cwd.""" |
| 1644 if pid in self._process_lookup: |
| 1645 # self._tracer_pid is not traced itself, nor are processes from other traces.
| 1646 self._process_lookup[pid].cwd = None |
| 1647 |
| 1648 def handle_execve(self, pid, args): |
| 1649 """Sets the process' executable. |
| 1650 |
| 1651 TODO(maruel): Read command line arguments. See |
| 1652 https://discussions.apple.com/thread/1980539 for an example. |
| 1653 https://gist.github.com/1242279 |
| 1654 |
| 1655 Will have to put the answer at http://stackoverflow.com/questions/7556249. |
| 1656 :) |
| 1657 """ |
| 1658 if pid not in self._process_lookup:
| 1659 # Another process tree, ignore. |
| 1660 return |
| 1661 match = self.RE_EXECVE.match(args) |
| 1662 if not match: |
| 1663 raise TracingFailure( |
| 1664 'Failed to parse arguments: %r' % args, |
| 1665 None, None, None) |
| 1666 proc = self._process_lookup[pid] |
| 1667 proc.executable = match.group(1) |
| 1668 proc.command = self.process_escaped_arguments(match.group(3)) |
| 1669 if int(match.group(2)) != len(proc.command): |
| 1670 raise TracingFailure( |
| 1671 'Failed to parse execve() arguments: %s' % args, |
| 1672 None, None, None) |
| 1673 |
| 1674 def handle_chdir(self, pid, args): |
| 1675 """Updates cwd.""" |
| 1676 if pid not in self._process_lookup: |
| 1677 # Another process tree, ignore. |
| 1678 return |
| 1679 cwd = self.RE_CHDIR.match(args).group(1) |
| 1680 if not cwd.startswith('/'): |
| 1681 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd) |
| 1682 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2)) |
| 1683 else: |
| 1684 logging.debug('handle_chdir(%d, %s)' % (pid, cwd)) |
| 1685 cwd2 = cwd |
| 1686 self._process_lookup[pid].cwd = cwd2 |
| 1687 |
| 1688 def handle_open_nocancel(self, pid, args): |
| 1689 """Redirects to handle_open().""" |
| 1690 return self.handle_open(pid, args) |
| 1691 |
| 1692 def handle_open(self, pid, args): |
| 1693 if pid not in self._process_lookup: |
| 1694 # Another process tree, ignore. |
| 1695 return |
| 1696 match = self.RE_OPEN.match(args) |
| 1697 if not match: |
| 1698 raise TracingFailure( |
| 1699 'Failed to parse arguments: %s' % args, |
| 1700 None, None, None) |
| 1701 flag = int(match.group(2), 16) |
| 1702 if self.O_DIRECTORY & flag == self.O_DIRECTORY: |
| 1703 # Ignore directories. |
| 1704 return |
| 1705 self._handle_file(pid, match.group(1)) |
| 1706 |
| 1707 def handle_rename(self, pid, args): |
| 1708 if pid not in self._process_lookup: |
| 1709 # Another process tree, ignore. |
| 1710 return |
| 1711 match = self.RE_RENAME.match(args) |
| 1712 if not match: |
| 1713 raise TracingFailure( |
| 1714 'Failed to parse arguments: %s' % args, |
| 1715 None, None, None) |
| 1716 self._handle_file(pid, match.group(1)) |
| 1717 self._handle_file(pid, match.group(2)) |
| 1718 |
| 1719 def _handle_file(self, pid, filepath): |
| 1720 if not filepath.startswith('/'): |
| 1721 filepath = os.path.join(self._process_lookup[pid].cwd, filepath) |
| 1722 # We can get '..' in the path. |
| 1723 filepath = os.path.normpath(filepath) |
| 1724 # Sadly, still need to filter out directories here; |
| 1725 # saw open_nocancel(".", 0, 0) = 0 lines. |
| 1726 if os.path.isdir(filepath): |
| 1727 return |
| 1728 self._process_lookup[pid].add_file(filepath, False) |
| 1729 |
| 1730 def handle_ftruncate(self, pid, args): |
| 1731 """Just used as a signal to kill dtrace, ignoring.""" |
| 1732 pass |
| 1733 |
| 1734 @staticmethod |
| 1735 def _handle_ignored(pid, args): |
| 1736 """Is called for all the event traces that are not handled.""" |
| 1737 raise NotImplementedError('Please implement me') |
| 1738 |
| 1739 @staticmethod |
| 1740 def process_escaped_arguments(text): |
| 1741 """Extracts escaped arguments on a string and return the arguments as a |
| 1742 list. |
| 1743 |
| 1744 Implemented as an automaton. |
| 1745 |
| 1746 Example: |
| 1747 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the |
| 1748 function will return ['python2.7', '-c', 'print("hi")']
| 1749 """ |
| 1750 if not text.endswith('\\0'): |
| 1751 raise ValueError('String is not null terminated: %r' % text, text) |
| 1752 text = text[:-2] |
| 1753 |
| 1754 def unescape(x): |
| 1755 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?.""" |
| 1756 out = [] |
| 1757 escaped = False |
| 1758 for i in x: |
| 1759 if i == '\\' and not escaped: |
| 1760 escaped = True |
| 1761 continue |
| 1762 escaped = False |
| 1763 out.append(i) |
| 1764 return ''.join(out) |
| 1765 |
| 1766 return [unescape(i) for i in text.split('\\001')] |
| 1767 |
| 1768 class Tracer(ApiBase.Tracer): |
| 1769 # pylint: disable=C0301 |
| 1770 # |
| 1771 # To understand the following code, you'll want to take a look at: |
| 1772 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
| 1773 # https://wikis.oracle.com/display/DTrace/Variables |
| 1774 # http://docs.oracle.com/cd/E19205-01/820-4221/ |
| 1775 # |
| 1776 # 0. Dump all the valid probes into a text file. It is important, you |
| 1777 # want to redirect into a file and you don't want to constantly 'sudo'. |
| 1778 # $ sudo dtrace -l > probes.txt |
| 1779 # |
| 1780 # 1. Count the number of probes: |
| 1781 # $ wc -l probes.txt |
| 1782 # 81823 # On OSX 10.7, including 1 header line. |
| 1783 # |
| 1784 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the |
| 1785 # likes and skipping the header with NR>1: |
| 1786 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]' |
| 1787 # dtrace |
| 1788 # fbt |
| 1789 # io |
| 1790 # ip |
| 1791 # lockstat |
| 1792 # mach_trap |
| 1793 # proc |
| 1794 # profile |
| 1795 # sched |
| 1796 # syscall |
| 1797 # tcp |
| 1798 # vminfo |
| 1799 # |
| 1800 # 3. List of valid probes: |
| 1801 # $ grep syscall probes.txt | less |
| 1802 # or use dtrace directly: |
| 1803 # $ sudo dtrace -l -P syscall | less |
| 1804 # |
| 1805 # trackedpid is an associative array where its value can be 0, 1 or 2. |
| 1806 # 0 is for untracked processes and is the default value for items not |
| 1807 # in the associative array. |
| 1808 # 1 is for tracked processes. |
| 1809 # 2 is for the script created by create_thunk() only. It is not tracked |
| 1810 # itself but all its descendants are.
| 1811 # |
| 1812 # The script will kill itself only once waiting_to_die == 1 and
| 1813 # current_processes == 0, i.e. once the ftruncate() stop signal was
| 1814 # received and all traced processes exited.
| 1815 # |
| 1816 # TODO(maruel): Use cacheable predicates. See |
| 1817 # https://wikis.oracle.com/display/DTrace/Performance+Considerations |
| 1818 D_CODE = """ |
| 1819 dtrace:::BEGIN { |
| 1820 waiting_to_die = 0; |
| 1821 current_processes = 0; |
| 1822 logindex = 0; |
| 1823 printf("%d %d %s_%s(\\"%s\\")\\n", |
| 1824 logindex, PID, probeprov, probename, SCRIPT); |
| 1825 logindex++; |
| 1826 } |
| 1827 |
| 1828 proc:::start /trackedpid[ppid]/ { |
| 1829 trackedpid[pid] = 1; |
| 1830 current_processes += 1; |
| 1831 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", |
| 1832 logindex, pid, probeprov, probename, |
| 1833 ppid, |
| 1834 execname, |
| 1835 current_processes); |
| 1836 logindex++; |
| 1837 } |
| 1838 /* Should use SCRIPT but there is no access to this variable at that |
| 1839 * point. */ |
| 1840 proc:::start /ppid == PID && execname == "Python"/ { |
| 1841 trackedpid[pid] = 2; |
| 1842 current_processes += 1; |
| 1843 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", |
| 1844 logindex, pid, probeprov, probename, |
| 1845 ppid, |
| 1846 execname, |
| 1847 current_processes); |
| 1848 logindex++; |
| 1849 } |
| 1850 proc:::exit /trackedpid[pid] && |
| 1851 current_processes == 1 && |
| 1852 waiting_to_die == 1/ { |
| 1853 trackedpid[pid] = 0; |
| 1854 current_processes -= 1; |
| 1855 printf("%d %d %s_%s(%d)\\n", |
| 1856 logindex, pid, probeprov, probename, |
| 1857 current_processes); |
| 1858 logindex++; |
| 1859 exit(0); |
| 1860 } |
| 1861 proc:::exit /trackedpid[pid]/ { |
| 1862 trackedpid[pid] = 0; |
| 1863 current_processes -= 1; |
| 1864 printf("%d %d %s_%s(%d)\\n", |
| 1865 logindex, pid, probeprov, probename, |
| 1866 current_processes); |
| 1867 logindex++; |
| 1868 } |
| 1869 |
| 1870 /* Use an arcane function to detect when we need to die */ |
| 1871 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ { |
| 1872 waiting_to_die = 1; |
| 1873 printf("%d %d %s()\\n", logindex, pid, probefunc); |
| 1874 logindex++; |
| 1875 } |
| 1876 syscall::ftruncate:entry / |
| 1877 pid == PID && arg0 == FILE_ID && current_processes == 0/ { |
| 1878 exit(0); |
| 1879 } |
| 1880 |
| 1881 syscall::open*:entry /trackedpid[pid] == 1/ { |
| 1882 self->open_arg0 = arg0; |
| 1883 self->open_arg1 = arg1; |
| 1884 self->open_arg2 = arg2; |
| 1885 } |
| 1886 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ { |
| 1887 this->open_arg0 = copyinstr(self->open_arg0); |
| 1888 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n", |
| 1889 logindex, pid, probefunc, |
| 1890 this->open_arg0, |
| 1891 self->open_arg1, |
| 1892 self->open_arg2); |
| 1893 logindex++; |
| 1894 this->open_arg0 = 0; |
| 1895 } |
| 1896 syscall::open*:return /trackedpid[pid] == 1/ { |
| 1897 self->open_arg0 = 0; |
| 1898 self->open_arg1 = 0; |
| 1899 self->open_arg2 = 0; |
| 1900 } |
| 1901 |
| 1902 syscall::rename:entry /trackedpid[pid] == 1/ { |
| 1903 self->rename_arg0 = arg0; |
| 1904 self->rename_arg1 = arg1; |
| 1905 } |
| 1906 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ { |
| 1907 this->rename_arg0 = copyinstr(self->rename_arg0); |
| 1908 this->rename_arg1 = copyinstr(self->rename_arg1); |
| 1909 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n", |
| 1910 logindex, pid, probefunc, |
| 1911 this->rename_arg0, |
| 1912 this->rename_arg1); |
| 1913 logindex++; |
| 1914 this->rename_arg0 = 0; |
| 1915 this->rename_arg1 = 0; |
| 1916 } |
| 1917 syscall::rename:return /trackedpid[pid] == 1/ { |
| 1918 self->rename_arg0 = 0; |
| 1919 self->rename_arg1 = 0; |
| 1920 } |
| 1921 |
| 1922 /* Track chdir; it's painful because it only receives relative paths.
| 1923 */
| 1924 syscall::chdir:entry /trackedpid[pid] == 1/ { |
| 1925 self->chdir_arg0 = arg0; |
| 1926 } |
| 1927 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ { |
| 1928 this->chdir_arg0 = copyinstr(self->chdir_arg0); |
| 1929 printf("%d %d %s(\\"%s\\")\\n", |
| 1930 logindex, pid, probefunc, |
| 1931 this->chdir_arg0); |
| 1932 logindex++; |
| 1933 this->chdir_arg0 = 0; |
| 1934 } |
| 1935 syscall::chdir:return /trackedpid[pid] == 1/ { |
| 1936 self->chdir_arg0 = 0; |
| 1937 } |
| 1938 """ |
| 1939 |
| 1940 # execve-specific code, tends to throw a lot of exceptions. |
| 1941 D_CODE_EXECVE = """ |
| 1942 /* Finally what we care about! */ |
| 1943 syscall::exec*:entry /trackedpid[pid]/ { |
| 1944 self->exec_arg0 = copyinstr(arg0); |
| 1945 /* Incrementally probe for a NULL in the argv parameter of execve() to |
| 1946 * figure out argc. */ |
| 1947 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was |
| 1948 * found. */ |
| 1949 self->exec_argc = 0; |
| 1950 /* Probe for argc==1 */ |
| 1951 this->exec_argv = (user_addr_t*)copyin( |
| 1952 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); |
| 1953 self->exec_argc = this->exec_argv[self->exec_argc] ? |
| 1954 (self->exec_argc + 1) : self->exec_argc; |
| 1955 |
| 1956 /* Probe for argc==2 */ |
| 1957 this->exec_argv = (user_addr_t*)copyin( |
| 1958 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); |
| 1959 self->exec_argc = this->exec_argv[self->exec_argc] ? |
| 1960 (self->exec_argc + 1) : self->exec_argc; |
| 1961 |
| 1962 /* Probe for argc==3 */ |
| 1963 this->exec_argv = (user_addr_t*)copyin( |
| 1964 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); |
| 1965 self->exec_argc = this->exec_argv[self->exec_argc] ? |
| 1966 (self->exec_argc + 1) : self->exec_argc; |
| 1967 |
| 1968 /* Probe for argc==4 */ |
| 1969 this->exec_argv = (user_addr_t*)copyin( |
| 1970 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); |
| 1971 self->exec_argc = this->exec_argv[self->exec_argc] ? |
| 1972 (self->exec_argc + 1) : self->exec_argc; |
| 1973 |
| 1974 /* Copy the input strings since there is no guarantee they'll be
| 1975 * present after the call completes. */
| 1976 self->exec_argv0 = (self->exec_argc > 0) ? |
| 1977 copyinstr(this->exec_argv[0]) : ""; |
| 1978 self->exec_argv1 = (self->exec_argc > 1) ? |
| 1979 copyinstr(this->exec_argv[1]) : ""; |
| 1980 self->exec_argv2 = (self->exec_argc > 2) ? |
| 1981 copyinstr(this->exec_argv[2]) : ""; |
| 1982 self->exec_argv3 = (self->exec_argc > 3) ? |
| 1983 copyinstr(this->exec_argv[3]) : ""; |
| 1984 this->exec_argv = 0; |
| 1985 } |
| 1986 syscall::exec*:return /trackedpid[pid] && errno == 0/ { |
| 1987 /* We need to join strings here, as using multiple printf() would |
| 1988 * cause tearing when multiple threads/processes are traced. |
| 1989 * Since it is impossible to escape a string and join it to another one, |
| 1990 * like sprintf("%s%S", previous, more), use hackery. |
| 1991 * Each element is separated with a \\1. \\0 cannot be used because
| 1992 * it is simply ignored. This will conflict with any program putting a
| 1993 * \\1 in its execve() string but this should be "rare enough". */
| 1994 this->args = ""; |
| 1995 /* Process exec_argv[0] */ |
| 1996 this->args = strjoin( |
| 1997 this->args, (self->exec_argc > 0) ? self->exec_argv0 : ""); |
| 1998 |
| 1999 /* Process exec_argv[1] */ |
| 2000 this->args = strjoin( |
| 2001 this->args, (self->exec_argc > 1) ? "\\1" : ""); |
| 2002 this->args = strjoin( |
| 2003 this->args, (self->exec_argc > 1) ? self->exec_argv1 : ""); |
| 2004 |
| 2005 /* Process exec_argv[2] */ |
| 2006 this->args = strjoin( |
| 2007 this->args, (self->exec_argc > 2) ? "\\1" : ""); |
| 2008 this->args = strjoin( |
| 2009 this->args, (self->exec_argc > 2) ? self->exec_argv2 : ""); |
| 2010 |
| 2011 /* Process exec_argv[3] */ |
| 2012 this->args = strjoin( |
| 2013 this->args, (self->exec_argc > 3) ? "\\1" : ""); |
| 2014 this->args = strjoin( |
| 2015 this->args, (self->exec_argc > 3) ? self->exec_argv3 : ""); |
| 2016 |
| 2017 /* Prints self->exec_argc to permit verifying the internal
| 2018 * consistency since this code is quite fishy. */ |
| 2019 printf("%d %d %s(\\"%s\\", [%d, %S])\\n", |
| 2020 logindex, pid, probefunc, |
| 2021 self->exec_arg0, |
| 2022 self->exec_argc, |
| 2023 this->args); |
| 2024 logindex++; |
| 2025 this->args = 0; |
| 2026 } |
| 2027 syscall::exec*:return /trackedpid[pid]/ { |
| 2028 self->exec_arg0 = 0; |
| 2029 self->exec_argc = 0; |
| 2030 self->exec_argv0 = 0; |
| 2031 self->exec_argv1 = 0; |
| 2032 self->exec_argv2 = 0; |
| 2033 self->exec_argv3 = 0; |
| 2034 } |
| 2035 """ |
| 2036 |
| 2037 # Code currently not used. |
| 2038 D_EXTRANEOUS = """ |
| 2039 /* This is a good learning experience, since it traces a lot of things |
| 2040 * related to the process and child processes. |
| 2041 * Warning: it generates a gigantic log. For example, tracing |
| 2042 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes |
| 2043 * several minutes to execute. |
| 2044 */ |
| 2045 /* |
| 2046 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { |
| 2047 printf("%d %d %s_%s() = %d\\n", |
| 2048 logindex, pid, probeprov, probefunc, errno); |
| 2049 logindex++; |
| 2050 } |
| 2051 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { |
| 2052 printf("%d %d %s_%s() = %d\\n", |
| 2053 logindex, pid, probeprov, probefunc, errno); |
| 2054 logindex++; |
| 2055 } |
| 2056 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { |
| 2057 printf("%d %d %s_%s() = %d\\n", |
| 2058 logindex, pid, probeprov, probefunc, errno); |
| 2059 logindex++; |
| 2060 } |
| 2061 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { |
| 2062 printf("%d %d %s_%s() = %d\\n", |
| 2063 logindex, pid, probeprov, probefunc, errno); |
| 2064 logindex++; |
| 2065 } |
| 2066 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { |
| 2067 printf("%d %d %s_%s() = %d\\n", |
| 2068 logindex, pid, probeprov, probefunc, errno); |
| 2069 logindex++; |
| 2070 } |
| 2071 */ |
| 2072 /* TODO(maruel): *stat* functions and friends |
| 2073 syscall::access:return, |
| 2074 syscall::chdir:return, |
| 2075 syscall::chflags:return, |
| 2076 syscall::chown:return, |
| 2077 syscall::chroot:return, |
| 2078 syscall::getattrlist:return, |
| 2079 syscall::getxattr:return, |
| 2080 syscall::lchown:return, |
| 2081 syscall::lstat64:return, |
| 2082 syscall::lstat:return, |
| 2083 syscall::mkdir:return, |
| 2084 syscall::pathconf:return, |
| 2085 syscall::readlink:return, |
| 2086 syscall::removexattr:return, |
| 2087 syscall::setxattr:return, |
| 2088 syscall::stat64:return, |
| 2089 syscall::stat:return, |
| 2090 syscall::truncate:return, |
| 2091 syscall::unlink:return, |
| 2092 syscall::utimes:return, |
| 2093 */ |
| 2094 """ |
| 2095 |
| 2096 def __init__(self, logname): |
| 2097 """Starts the log collection with dtrace. |
| 2098 |
| 2099 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so |
| 2100 this needs to wait for dtrace to be "warmed up". |
| 2101 """ |
| 2102 super(Dtrace.Tracer, self).__init__(logname) |
| 2103 self._script = create_thunk() |
| 2104 # This unique dummy temp file is used to signal the dtrace script that it |
| 2105 # should stop as soon as all the child processes are done. A bit hackish |
| 2106 # but it works well enough.
| 2107 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp( |
| 2108 prefix='trace_signal_file') |
| 2109 |
| 2110 # Note: do not use the -p flag. It's useless if the initial process quits |
| 2111 # too fast, resulting in missing traces from the grand-children. The D |
| 2112 # code manages the dtrace lifetime itself. |
| 2113 trace_cmd = [ |
| 2114 'sudo', |
| 2115 'dtrace', |
| 2116 # Use a larger buffer if getting 'out of scratch space' errors. |
| 2117 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables |
| 2118 '-b', '10m', |
| 2119 '-x', 'dynvarsize=10m', |
| 2120 #'-x', 'dtrace_global_maxsize=1m', |
| 2121 '-x', 'evaltime=exec', |
| 2122 '-o', '/dev/stderr', |
| 2123 '-q', |
| 2124 '-n', self._get_dtrace_code(), |
| 2125 ] |
| 2126 with open(self._logname + '.log', 'wb') as logfile: |
| 2127 self._dtrace = subprocess.Popen( |
| 2128 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) |
| 2129 logging.debug('Started dtrace pid: %d' % self._dtrace.pid) |
| 2130 |
| 2131 # Reads until one line is printed, which signifies dtrace is up and ready. |
| 2132 with open(self._logname + '.log', 'rb') as logfile: |
| 2133 while 'dtrace_BEGIN' not in logfile.readline(): |
| 2134 if self._dtrace.poll() is not None: |
| 2135 # Do a busy wait. :/ |
| 2136 break |
| 2137 logging.debug('dtrace started') |
| 2138 |
| 2139 def _get_dtrace_code(self): |
| 2140 """Setups the D code to implement child process tracking. |
| 2141 |
| 2142 Injects the cookie in the script so it knows when to stop. |
| 2143 |
| 2144 The script will detect any instance of the script created with |
| 2145 create_thunk() and will start tracing it. |
| 2146 """ |
| 2147 return ( |
| 2148 'inline int PID = %d;\n' |
| 2149 'inline string SCRIPT = "%s";\n' |
| 2150 'inline int FILE_ID = %d;\n' |
| 2151 '\n' |
| 2152 '%s') % ( |
| 2153 os.getpid(), |
| 2154 self._script, |
| 2155 self._dummy_file_id, |
| 2156 self.D_CODE) + self.D_CODE_EXECVE |
| 2157 |
| 2158 def trace(self, cmd, cwd, tracename, output): |
| 2159 """Runs dtrace on an executable. |
| 2160 |
| 2161 Since dtruss is broken both when it starts the process itself and when
| 2162 tracing child processes, this code starts a wrapper process generated
| 2163 with create_thunk(), which starts the executable to trace.
| 2164 """ |
| 2165 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) |
| 2166 assert os.path.isabs(cmd[0]), cmd[0] |
| 2167 assert os.path.isabs(cwd), cwd |
| 2168 assert os.path.normpath(cwd) == cwd, cwd |
| 2169 with self._lock: |
| 2170 if not self._initialized: |
| 2171 raise TracingFailure( |
| 2172 'Called Tracer.trace() on an uninitialized object',
| 2173 None, None, None, tracename) |
| 2174 assert tracename not in (i['trace'] for i in self._traces) |
| 2175 |
| 2176 # Starts the script wrapper to start the child process. This signals the |
| 2177 # dtrace script that this process is to be traced. |
| 2178 stdout = stderr = None |
| 2179 if output: |
| 2180 stdout = subprocess.PIPE |
| 2181 stderr = subprocess.STDOUT |
| 2182 child_cmd = [ |
| 2183 sys.executable, |
| 2184 self._script, |
| 2185 tracename, |
| 2186 ] |
| 2187 # Call a dummy function so that dtrace knows I'm about to launch a process |
| 2188 # that needs to be traced. |
| 2189 # Yummy. |
| 2190 child = subprocess.Popen( |
| 2191 child_cmd + fix_python_path(cmd), |
| 2192 stdin=subprocess.PIPE, |
| 2193 stdout=stdout, |
| 2194 stderr=stderr, |
| 2195 cwd=cwd) |
| 2196 logging.debug('Started child pid: %d' % child.pid) |
| 2197 |
| 2198 out = child.communicate()[0] |
| 2199 # This doesn't mean tracing is done; one of the grand-child processes may
| 2200 # still be alive. They will be tracked by the dtrace script.
| 2201 |
| 2202 with self._lock: |
| 2203 assert tracename not in (i['trace'] for i in self._traces) |
| 2204 self._traces.append( |
| 2205 { |
| 2206 'cmd': cmd, |
| 2207 'cwd': cwd, |
| 2208 # The pid of the wrapper script, not very useful.
| 2209 'pid': child.pid, |
| 2210 'trace': tracename, |
| 2211 'output': out, |
| 2212 }) |
| 2213 return child.returncode, out |
| 2214 |
| 2215 def close(self, timeout=None): |
| 2216 """Terminates dtrace.""" |
| 2217 logging.debug('close(%s)' % timeout) |
| 2218 try: |
| 2219 try: |
| 2220 super(Dtrace.Tracer, self).close(timeout) |
| 2221 # Signal dtrace that it should stop now. |
| 2222 os.ftruncate(self._dummy_file_id, 0) |
| 2223 if timeout: |
| 2224 start = time.time() |
| 2225 # Use polling. :/ |
| 2226 while (self._dtrace.poll() is None and |
| 2227 (time.time() - start) < timeout): |
| 2228 time.sleep(0.1) |
| 2229 self._dtrace.kill() |
| 2230 self._dtrace.wait() |
| 2231 finally: |
| 2232 # Make sure to kill it in any case. |
| 2233 if self._dtrace.poll() is None: |
| 2234 try: |
| 2235 self._dtrace.kill() |
| 2236 self._dtrace.wait() |
| 2237 except OSError: |
| 2238 pass |
| 2239 |
| 2240 if self._dtrace.returncode != 0: |
| 2241 # Warn about any dtrace failure but basically ignore it. |
| 2242 print 'dtrace failure: %s' % self._dtrace.returncode |
| 2243 finally: |
| 2244 os.close(self._dummy_file_id) |
| 2245 os.remove(self._dummy_file_name) |
| 2246 os.remove(self._script) |
| 2247 |
| 2248 def post_process_log(self): |
| 2249 """Sorts the log back in order when each call occured. |
| 2250 |
| 2251 dtrace doesn't save the buffer in strict order since it keeps one buffer |
| 2252 per CPU. |
| 2253 """ |
| 2254 super(Dtrace.Tracer, self).post_process_log() |
| 2255 logname = self._logname + '.log' |
| 2256 with open(logname, 'rb') as logfile: |
| 2257 lines = [l for l in logfile if l.strip()] |
| 2258 errors = [l for l in lines if l.startswith('dtrace:')] |
| 2259 if errors: |
| 2260 raise TracingFailure( |
| 2261 'Found errors in the trace: %s' % '\n'.join(errors), |
| 2262 None, None, None, logname) |
| 2263 try: |
| 2264 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) |
| 2265 except ValueError: |
| 2266 raise TracingFailure( |
| 2267 'Found errors in the trace: %s' % '\n'.join( |
| 2268 l for l in lines if l.split(' ', 1)[0].isdigit()), |
| 2269 None, None, None, logname) |
| 2270 with open(logname, 'wb') as logfile: |
| 2271 logfile.write(''.join(lines)) |
| 2272 |
| 2273 @staticmethod |
| 2274 def clean_trace(logname): |
| 2275 for ext in ('', '.log'): |
| 2276 if os.path.isfile(logname + ext): |
| 2277 os.remove(logname + ext) |
| 2278 |
| 2279 @classmethod |
| 2280 def parse_log(cls, logname, blacklist): |
| 2281 logging.info('parse_log(%s, ...)' % logname) |
| 2282 |
| 2283 def blacklist_more(filepath): |
| 2284 # All the HFS metadata is in the form /.vol/... |
| 2285 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath) |
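| # e.g. an HFS metadata path like '/.vol/12345/678' (illustrative) gets
| # filtered out here.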
| 2286 |
| 2287 data = read_json(logname) |
| 2288 out = [] |
| 2289 for item in data['traces']: |
| 2290 context = cls.Context(blacklist_more, item['pid'], item['cwd']) |
| 2291 for line in open(logname + '.log', 'rb'): |
| 2292 context.on_line(line) |
| 2293 out.append( |
| 2294 { |
| 2295 'results': context.to_results(), |
| 2296 'trace': item['trace'], |
| 2297 'output': item['output'], |
| 2298 }) |
| 2299 return out |
| 2300 |
| 2301 |
| 2302 class LogmanTrace(ApiBase): |
| 2303 """Uses the native Windows ETW based tracing functionality to trace a child |
| 2304 process. |
| 2305 |
| 2306 Caveat: this implementation doesn't track cwd or initial_cwd. This is because
| 2307 the Windows Kernel doesn't have a concept of 'current working directory' at
| 2308 all. A Win32 process has a map of current directories, one per drive letter,
| 2309 managed by the user mode kernel32.dll. In the kernel, a file is always
| 2310 opened relative to another file_object or as an absolute path. All the current |
| 2311 working directory logic is done in user mode. |
| 2312 """ |
| 2313 class Context(ApiBase.Context): |
| 2314 """Processes a ETW log line and keeps the list of existent and non |
| 2315 existent files accessed. |
| 2316 |
| 2317 Ignores directories. |
| 2318 """ |
| 2319 # These indexes are for the stripped version in json. |
| 2320 EVENT_NAME = 0 |
| 2321 TYPE = 1 |
| 2322 PID = 2 |
| 2323 TID = 3 |
| 2324 PROCESSOR_ID = 4 |
| 2325 TIMESTAMP = 5 |
| 2326 USER_DATA = 6 |
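| # i.e. the columns kept by Tracer.post_process_log(), which strips the raw
| # CSV down to [EVENT_NAME, TYPE, PID, TID, PROCESSOR_ID, TIMESTAMP] followed
| # by the event-specific 'User Data' columns.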
| 2327 |
| 2328 class Process(ApiBase.Context.Process): |
| 2329 def __init__(self, *args): |
| 2330 super(LogmanTrace.Context.Process, self).__init__(*args) |
| 2331 # Handle file objects that succeeded. |
| 2332 self.file_objects = {} |
| 2333 |
| 2334 def __init__(self, blacklist, tracer_pid): |
| 2335 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid)) |
| 2336 super(LogmanTrace.Context, self).__init__(blacklist) |
| 2337 self._drive_map = DosDriveMap() |
| 2338 # Threads mapping to the corresponding process id. |
| 2339 self._threads_active = {} |
| 2340 # Process ID of the tracer, e.g. trace_inputs.py
| 2341 self._tracer_pid = tracer_pid |
| 2342 self._line_number = 0 |
| 2343 |
| 2344 def on_line(self, line): |
| 2345 """Processes a json Event line.""" |
| 2346 self._line_number += 1 |
| 2347 try: |
| 2348 # By Opcode |
| 2349 handler = getattr( |
| 2350 self, |
| 2351 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]), |
| 2352 None) |
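| # e.g. an event with name 'FileIo' and type 'Create' dispatches to
| # handle_FileIo_Create().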
| 2353 if not handler: |
| 2354 raise TracingFailure( |
| 2355 'Unexpected event %s_%s' % ( |
| 2356 line[self.EVENT_NAME], line[self.TYPE]), |
| 2357 None, None, None) |
| 2358 handler(line) |
| 2359 except TracingFailure, e: |
| 2360 # Hack in the values since the handler could be a static function. |
| 2361 e.pid = line[self.PID] |
| 2362 e.line = line |
| 2363 e.line_number = self._line_number |
| 2364 # Re-raise the modified exception. |
| 2365 raise |
| 2366 except (KeyError, NotImplementedError, ValueError), e: |
| 2367 raise TracingFailure( |
| 2368 'Trace generated a %s exception: %s' % ( |
| 2369 e.__class__.__name__, str(e)), |
| 2370 line[self.PID], |
| 2371 self._line_number, |
| 2372 line, |
| 2373 e) |
| 2374 |
| 2375 def to_results(self): |
| 2376 if not self.root_process: |
| 2377 raise TracingFailure( |
| 2378 'Failed to detect the initial process', |
| 2379 None, None, None) |
| 2380 process = self.root_process.to_results_process() |
| 2381 return Results(process) |
| 2382 |
| 2383 def _thread_to_process(self, tid): |
| 2384 """Finds the process from the thread id.""" |
| 2385 tid = int(tid, 16) |
| 2386 pid = self._threads_active.get(tid) |
| 2387 if not pid or not self._process_lookup.get(pid): |
| 2388 return |
| 2389 return self._process_lookup[pid] |
| 2390 |
| 2391 @classmethod |
| 2392 def handle_EventTrace_Header(cls, line): |
| 2393 """Verifies no event was dropped, e.g. no buffer overrun occured.""" |
| 2394 BUFFER_SIZE = cls.USER_DATA |
| 2395 #VERSION = cls.USER_DATA + 1 |
| 2396 #PROVIDER_VERSION = cls.USER_DATA + 2 |
| 2397 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3 |
| 2398 #END_TIME = cls.USER_DATA + 4 |
| 2399 #TIMER_RESOLUTION = cls.USER_DATA + 5 |
| 2400 #MAX_FILE_SIZE = cls.USER_DATA + 6 |
| 2401 #LOG_FILE_MODE = cls.USER_DATA + 7 |
| 2402 #BUFFERS_WRITTEN = cls.USER_DATA + 8 |
| 2403 #START_BUFFERS = cls.USER_DATA + 9 |
| 2404 #POINTER_SIZE = cls.USER_DATA + 10 |
| 2405 EVENTS_LOST = cls.USER_DATA + 11 |
| 2406 #CPU_SPEED = cls.USER_DATA + 12 |
| 2407 #LOGGER_NAME = cls.USER_DATA + 13 |
| 2408 #LOG_FILE_NAME = cls.USER_DATA + 14 |
| 2409 #BOOT_TIME = cls.USER_DATA + 15 |
| 2410 #PERF_FREQ = cls.USER_DATA + 16 |
| 2411 #START_TIME = cls.USER_DATA + 17 |
| 2412 #RESERVED_FLAGS = cls.USER_DATA + 18 |
| 2413 #BUFFERS_LOST = cls.USER_DATA + 19 |
| 2414 #SESSION_NAME_STRING = cls.USER_DATA + 20 |
| 2415 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21 |
| 2416 if line[EVENTS_LOST] != '0': |
| 2417 raise TracingFailure( |
| 2418 ( '%s events were lost during trace, please increase the buffer ' |
| 2419 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]), |
| 2420 None, None, None) |
| 2421 |
| 2422 def handle_FileIo_Cleanup(self, line): |
| 2423 """General wisdom: if a file is closed, it's because it was opened. |
| 2424 |
| 2425 Note that FileIo_Close is not used since if a file was opened properly but |
| 2426 not closed before the process exits, only Cleanup will be logged. |
| 2427 """ |
| 2428 #IRP = self.USER_DATA |
| 2429 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. |
| 2430 FILE_OBJECT = self.USER_DATA + 2 |
| 2431 #FILE_KEY = self.USER_DATA + 3 |
| 2432 proc = self._thread_to_process(line[TTID]) |
| 2433 if not proc: |
| 2434 # Not a process we care about. |
| 2435 return |
| 2436 file_object = line[FILE_OBJECT] |
| 2437 if file_object in proc.file_objects: |
| 2438 proc.add_file(proc.file_objects.pop(file_object), False) |
| 2439 |
| 2440 def handle_FileIo_Create(self, line): |
| 2441 """Handles a file open. |
| 2442 |
| 2443 All FileIo events are described at |
| 2444 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx |
| 2445 for some value of 'description'. |
| 2446 |
| 2447 " (..) process and thread id values of the IO events (..) are not valid " |
| 2448 http://msdn.microsoft.com/magazine/ee358703.aspx |
| 2449 |
| 2450 The FileIo.Create event doesn't return if the CreateFile() call |
| 2451 succeeded, so keep track of the file_object and check that it is |
| 2452 eventually closed with FileIo_Cleanup. |
| 2453 """ |
| 2454 #IRP = self.USER_DATA |
| 2455 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. |
| 2456 FILE_OBJECT = self.USER_DATA + 2 |
| 2457 #CREATE_OPTIONS = self.USER_DATA + 3 |
| 2458 #FILE_ATTRIBUTES = self.USER_DATA + 4 |
| 2459 #SHARE_ACCESS = self.USER_DATA + 5
| 2460 OPEN_PATH = self.USER_DATA + 6 |
| 2461 |
| 2462 proc = self._thread_to_process(line[TTID]) |
| 2463 if not proc: |
| 2464 # Not a process we care about. |
| 2465 return |
| 2466 |
| 2467 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH]) |
| 2468 raw_path = match.group(1) |
| 2469 # Ignore directories and bare drive right away. |
| 2470 if raw_path.endswith(os.path.sep): |
| 2471 return |
| 2472 filepath = self._drive_map.to_win32(raw_path) |
| 2473 # Ignore bare drive right away. Some may still fall through with format |
| 2474 # like '\\?\X:' |
| 2475 if len(filepath) == 2: |
| 2476 return |
| 2477 file_object = line[FILE_OBJECT] |
| 2478 if os.path.isdir(filepath): |
| 2479 # There is no O_DIRECTORY equivalent on Windows. The closest is
| 2480 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So
| 2481 # simply discard directories as they are found.
| 2482 return |
| 2483 # Override any stale file object |
| 2484 proc.file_objects[file_object] = filepath |
| 2485 |
| 2486 def handle_FileIo_Rename(self, line): |
| 2487 # TODO(maruel): Handle? |
| 2488 pass |
| 2489 |
| 2490 def handle_Process_End(self, line): |
| 2491 pid = line[self.PID] |
| 2492 if self._process_lookup.get(pid): |
| 2493 logging.info('Terminated: %d' % pid) |
| 2494 self._process_lookup[pid] = None |
| 2495 else: |
| 2496 logging.debug('Terminated: %d' % pid) |
| 2497 |
| 2498 def handle_Process_Start(self, line): |
| 2499 """Handles a new child process started by PID.""" |
| 2500 #UNIQUE_PROCESS_KEY = self.USER_DATA |
| 2501 PROCESS_ID = self.USER_DATA + 1 |
| 2502 #PARENT_PID = self.USER_DATA + 2 |
| 2503 #SESSION_ID = self.USER_DATA + 3 |
| 2504 #EXIT_STATUS = self.USER_DATA + 4 |
| 2505 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5 |
| 2506 #USER_SID = self.USER_DATA + 6 |
| 2507 IMAGE_FILE_NAME = self.USER_DATA + 7 |
| 2508 COMMAND_LINE = self.USER_DATA + 8 |
| 2509 |
| 2510 ppid = line[self.PID] |
| 2511 pid = int(line[PROCESS_ID], 16) |
| 2512 logging.debug( |
| 2513 'New process %d->%d (%s) %s' % |
| 2514 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE])) |
| 2515 |
| 2516 if ppid == self._tracer_pid: |
| 2517 # Need to ignore processes we don't know about because the log is |
| 2518 # system-wide. self._tracer_pid shall start only one process. |
| 2519 if self.root_process: |
| 2520 raise TracingFailure( |
| 2521 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is ' |
| 2522 'already set') % (self._tracer_pid, self.root_process.pid), |
| 2523 None, None, None) |
| 2524 proc = self.Process(self.blacklist, pid, None) |
| 2525 self.root_process = proc |
| 2526 ppid = None |
| 2527 elif self._process_lookup.get(ppid): |
| 2528 proc = self.Process(self.blacklist, pid, None) |
| 2529 self._process_lookup[ppid].children.append(proc) |
| 2530 else: |
| 2531 # Ignore |
| 2532 return |
| 2533 self._process_lookup[pid] = proc |
| 2534 |
| 2535 if (not line[IMAGE_FILE_NAME].startswith('"') or |
| 2536 not line[IMAGE_FILE_NAME].endswith('"')): |
| 2537 raise TracingFailure( |
| 2538 'Image file name is not properly quoted: %s' % line[IMAGE_FILE_NAME],
| 2539 None, None, None) |
| 2540 |
| 2541 # TODO(maruel): Process escapes. |
| 2542 if (not line[COMMAND_LINE].startswith('"') or |
| 2543 not line[COMMAND_LINE].endswith('"')): |
| 2544 raise TracingFailure( |
| 2545 'Command line is not properly quoted: %s' % line[COMMAND_LINE], |
| 2546 None, None, None) |
| 2547 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1]) |
| 2548 proc.executable = line[IMAGE_FILE_NAME][1:-1] |
| 2549 # proc.command[0] may be the absolute path of 'executable' but it may be |
| 2550 # anything else too. If it happens that command[0] ends with executable, |
| 2551 # use it, otherwise it defaults to the base name.
| 2552 cmd0 = proc.command[0].lower() |
| 2553 if not cmd0.endswith('.exe'): |
| 2554 # TODO(maruel): That's not strictly true either. |
| 2555 cmd0 += '.exe' |
| 2556 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0): |
| 2557 # Fix the path. |
| 2558 cmd0 = cmd0.replace('/', os.path.sep) |
| 2559 cmd0 = os.path.normpath(cmd0) |
| 2560 proc.executable = get_native_path_case(cmd0) |
| 2561 logging.info( |
| 2562 'New child: %s -> %d %s' % (ppid, pid, proc.executable)) |
| 2563 |
| 2564 def handle_Thread_End(self, line): |
| 2565 """Has the same parameters as Thread_Start.""" |
| 2566 tid = int(line[self.TID], 16) |
| 2567 self._threads_active.pop(tid, None) |
| 2568 |
| 2569 def handle_Thread_Start(self, line): |
| 2570 """Handles a new thread created. |
| 2571 |
| 2572 Do not use self.PID here since a process' initial thread is created by |
| 2573 the parent process. |
| 2574 """ |
| 2575 PROCESS_ID = self.USER_DATA |
| 2576 TTHREAD_ID = self.USER_DATA + 1 |
| 2577 #STACK_BASE = self.USER_DATA + 2 |
| 2578 #STACK_LIMIT = self.USER_DATA + 3 |
| 2579 #USER_STACK_BASE = self.USER_DATA + 4 |
| 2580 #USER_STACK_LIMIT = self.USER_DATA + 5 |
| 2581 #AFFINITY = self.USER_DATA + 6 |
| 2582 #WIN32_START_ADDR = self.USER_DATA + 7 |
| 2583 #TEB_BASE = self.USER_DATA + 8 |
| 2584 #SUB_PROCESS_TAG = self.USER_DATA + 9 |
| 2585 #BASE_PRIORITY = self.USER_DATA + 10 |
| 2586 #PAGE_PRIORITY = self.USER_DATA + 11 |
| 2587 #IO_PRIORITY = self.USER_DATA + 12 |
| 2588 #THREAD_FLAGS = self.USER_DATA + 13 |
| 2589 # Do not use self.PID here since a process' initial thread is created by |
| 2590 # the parent process. |
| 2591 pid = int(line[PROCESS_ID], 16) |
| 2592 tid = int(line[TTHREAD_ID], 16) |
| 2593 logging.debug('New thread pid:%d, tid:%d' % (pid, tid)) |
| 2594 self._threads_active[tid] = pid |
| 2595 |
| 2596 @classmethod |
| 2597 def supported_events(cls): |
| 2598 """Returns all the procesed events.""" |
| 2599 out = [] |
| 2600 for member in dir(cls): |
| 2601 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) |
| 2602 if match: |
| 2603 out.append(match.groups()) |
| 2604 return out |
| 2605 |
| 2606 class Tracer(ApiBase.Tracer): |
| 2607 # The basic headers. |
| 2608 EXPECTED_HEADER = [ |
| 2609 u'Event Name', |
| 2610 u'Type', |
| 2611 u'Event ID', |
| 2612 u'Version', |
| 2613 u'Channel', |
| 2614 u'Level', # 5 |
| 2615 u'Opcode', |
| 2616 u'Task', |
| 2617 u'Keyword', |
| 2618 u'PID', |
| 2619 u'TID', # 10 |
| 2620 u'Processor Number', |
| 2621 u'Instance ID', |
| 2622 u'Parent Instance ID', |
| 2623 u'Activity ID', |
| 2624 u'Related Activity ID', # 15 |
| 2625 u'Clock-Time', |
| 2626 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them |
| 2627 u'User(ms)', # pretty much useless. |
| 2628 u'User Data', # Extra arguments that are event-specific. |
| 2629 ] |
| 2630 # Only the useful headers common to all entries are listed here. Any column
| 2631 # at 19 or higher is dependent on the specific event. |
| 2632 EVENT_NAME = 0 |
| 2633 TYPE = 1 |
| 2634 PID = 9 |
| 2635 TID = 10 |
| 2636 PROCESSOR_ID = 11 |
| 2637 TIMESTAMP = 16 |
| 2638 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' |
| 2639 USER_DATA = 19 |
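| # For example, line[self.PID] maps to the u'PID' column (index 9) in
| # EXPECTED_HEADER above, and columns 19+ hold the event-specific user data.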
| 2640 |
| 2641 def __init__(self, logname): |
| 2642 """Starts the log collection. |
| 2643 |
| 2644 Requires administrative access. logman.exe is synchronous so no need for a |
| 2645 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID |
| 2646 instead. The GUID constant name is SystemTraceControlGuid. Lovely. |
| 2647 |
| 2648 One can get the list of potentially interesting providers with: |
| 2649 "logman query providers | findstr /i file" |
| 2650 """ |
| 2651 super(LogmanTrace.Tracer, self).__init__(logname) |
| 2652 self._script = create_thunk() |
| 2653 cmd_start = [ |
| 2654 'logman.exe', |
| 2655 'start', |
| 2656 'NT Kernel Logger', |
| 2657 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}', |
| 2658 # splitio,fileiocompletion,syscall,file,cswitch,img |
| 2659 '(process,fileio,thread)', |
| 2660 '-o', self._logname + '.etl', |
| 2661 '-ets', # Send directly to kernel |
| 2662 # Values extracted out of thin air. |
| 2663 # Event Trace Session buffer size in kb. |
| 2664 '-bs', '10240', |
| 2665 # Number of Event Trace Session buffers. |
| 2666 '-nb', '16', '256', |
| 2667 ] |
| 2668 logging.debug('Running: %s' % cmd_start) |
| 2669 try: |
| 2670 subprocess.check_call( |
| 2671 cmd_start, |
| 2672 stdin=subprocess.PIPE, |
| 2673 stdout=subprocess.PIPE, |
| 2674 stderr=subprocess.STDOUT) |
| 2675 except subprocess.CalledProcessError, e: |
| 2676 if e.returncode == -2147024891: |
| 2677 print >> sys.stderr, 'Please restart with an elevated admin prompt' |
| 2678 elif e.returncode == -2144337737: |
| 2679 print >> sys.stderr, ( |
| 2680 'A kernel trace was already running, stop it and try again') |
| 2681 raise |
| 2682 |
| 2683 def trace(self, cmd, cwd, tracename, output): |
| 2684 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) |
| 2685 assert os.path.isabs(cmd[0]), cmd[0] |
| 2686 assert os.path.isabs(cwd), cwd |
| 2687 assert os.path.normpath(cwd) == cwd, cwd |
| 2688 with self._lock: |
| 2689 if not self._initialized: |
| 2690 raise TracingFailure( |
| 2691 'Called Tracer.trace() on an uninitialized object',
| 2692 None, None, None, tracename) |
| 2693 assert tracename not in (i['trace'] for i in self._traces) |
| 2694 |
| 2695 # Use "logman -?" for help. |
| 2696 |
| 2697 stdout = stderr = None |
| 2698 if output: |
| 2699 stdout = subprocess.PIPE |
| 2700 stderr = subprocess.STDOUT |
| 2701 |
| 2702 # Run the child process. |
| 2703 logging.debug('Running: %s' % cmd) |
| 2704 # Use the temporary script generated with create_thunk() so we have a |
| 2705 # clear pid owner. Since trace_inputs.py can be used as a library and |
| 2706 # could trace multiple processes simultaneously, it makes it more complex |
| 2707 # if the executable to be traced is executed directly here. It also solves |
| 2708 # issues related to logman.exe that needs to be executed to control the |
| 2709 # kernel trace. |
| 2710 child_cmd = [ |
| 2711 sys.executable, |
| 2712 self._script, |
| 2713 tracename, |
| 2714 ] |
| 2715 child = subprocess.Popen( |
| 2716 child_cmd + fix_python_path(cmd), |
| 2717 cwd=cwd, |
| 2718 stdin=subprocess.PIPE, |
| 2719 stdout=stdout, |
| 2720 stderr=stderr) |
| 2721 logging.debug('Started child pid: %d' % child.pid) |
| 2722 out = child.communicate()[0] |
| 2723 # This doesn't mean all the grand-children are done. Sadly, we don't have |
| 2724 # a good way to determine that. |
| 2725 |
| 2726 with self._lock: |
| 2727 assert tracename not in (i['trace'] for i in self._traces) |
| 2728 self._traces.append({ |
| 2729 'command': cmd, |
| 2730 'cwd': cwd, |
| 2731 'pid': child.pid, |
| 2732 'trace': tracename, |
| 2733 'output': out, |
| 2734 }) |
| 2735 |
| 2736 return child.returncode, out |
| 2737 |
| 2738 def close(self, _timeout=None): |
| 2739 """Stops the kernel log collection and converts the traces to text |
| 2740 representation. |
| 2741 """ |
| 2742 with self._lock: |
| 2743 if not self._initialized: |
| 2744 raise TracingFailure( |
| 2745 'Called Tracer.close() on an uninitialized object',
| 2746 None, None, None) |
| 2747 os.remove(self._script) |
| 2748 # Save metadata and add the 'format' key.
| 2749 data = { |
| 2750 'format': 'csv', |
| 2751 'traces': self._traces, |
| 2752 } |
| 2753 write_json(self._logname, data, False) |
| 2754 |
| 2755 cmd_stop = [ |
| 2756 'logman.exe', |
| 2757 'stop', |
| 2758 'NT Kernel Logger', |
| 2759 '-ets', # Sends the command directly to the kernel. |
| 2760 ] |
| 2761 logging.debug('Running: %s' % cmd_stop) |
| 2762 subprocess.check_call( |
| 2763 cmd_stop, |
| 2764 stdin=subprocess.PIPE, |
| 2765 stdout=subprocess.PIPE, |
| 2766 stderr=subprocess.STDOUT) |
| 2767 self._initialized = False |
| 2768 |
| 2769 def post_process_log(self): |
| 2770 """Converts the .etl file into .csv then into .json.""" |
| 2771 super(LogmanTrace.Tracer, self).post_process_log() |
| 2772 logformat = 'csv' |
| 2773 self._convert_log(logformat) |
| 2774 |
| 2775 if logformat == 'csv_utf16': |
| 2776 def load_file(): |
| 2777 def utf_8_encoder(unicode_csv_data): |
| 2778 """Encodes the unicode object as utf-8 encoded str instance""" |
| 2779 for line in unicode_csv_data: |
| 2780 yield line.encode('utf-8') |
| 2781 |
| 2782 def unicode_csv_reader(unicode_csv_data, **kwargs): |
| 2783 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. |
| 2784 """ |
| 2785 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) |
| 2786 for row in csv_reader: |
| 2787 # Decode str utf-8 instances back to unicode instances, cell by |
| 2788 # cell: |
| 2789 yield [cell.decode('utf-8') for cell in row] |
| 2790 |
| 2791 # The CSV file is UTF-16 so use codecs.open() to load the file into |
| 2792 # the python internal unicode format (utf-8). Then explicitly |
| 2793 # re-encode as utf8 as str instances so csv can parse it fine. Then |
| 2794 # decode the utf-8 str back into python unicode instances. This |
| 2795 # sounds about right. |
| 2796 for line in unicode_csv_reader( |
| 2797 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')): |
| 2798 # line is a list of unicode objects |
| 2799 # So much white space! |
| 2800 yield [i.strip() for i in line] |
| 2801 |
| 2802 elif logformat == 'csv': |
| 2803 def load_file(): |
| 2804 def ansi_csv_reader(ansi_csv_data, **kwargs): |
| 2805 """Loads an 'ANSI' code page and returns unicode() objects.""" |
| 2806 assert sys.getfilesystemencoding() == 'mbcs' |
| 2807 encoding = get_current_encoding() |
| 2808 for row in csv.reader(ansi_csv_data, **kwargs): |
| 2809 # Decode str 'ansi' instances to unicode instances, cell by cell: |
| 2810 yield [cell.decode(encoding) for cell in row] |
| 2811 |
| 2812 # The fastest and smallest format but only supports 'ANSI' file paths. |
| 2813 # E.g. the filenames are encoded in the 'current' encoding.
| 2814 for line in ansi_csv_reader(open(self._logname + '.' + logformat)): |
| 2815 # line is a list of unicode objects. |
| 2816 yield [i.strip() for i in line] |
| 2817 |
| 2818 supported_events = LogmanTrace.Context.supported_events() |
| 2819 |
| 2820 def trim(generator): |
| 2821 for index, line in enumerate(generator): |
| 2822 if not index: |
| 2823 if line != self.EXPECTED_HEADER: |
| 2824 raise TracingFailure( |
| 2825 'Found malformed header: %s' % ' '.join(line), |
| 2826 None, None, None) |
| 2827 continue |
| 2828 # As you can see, the CSV is full of useful non-redundant information: |
| 2829 if (line[2] != '0' or # Event ID |
| 2830 line[3] not in ('2', '3') or # Version |
| 2831 line[4] != '0' or # Channel |
| 2832 line[5] != '0' or # Level |
| 2833 line[7] != '0' or # Task |
| 2834 line[8] != '0x0000000000000000' or # Keyword |
| 2835 line[12] != '' or # Instance ID |
| 2836 line[13] != '' or # Parent Instance ID |
| 2837 line[14] != self.NULL_GUID or # Activity ID |
| 2838 line[15] != ''): # Related Activity ID |
| 2839 raise TracingFailure( |
| 2840 'Found unexpected values in line: %s' % ' '.join(line), |
| 2841 None, None, None) |
| 2842 |
| 2843 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: |
| 2844 continue |
| 2845 |
| 2846 # Convert the PID in-place from hex. |
| 2847 line[self.PID] = int(line[self.PID], 16) |
| 2848 |
| 2849 yield [ |
| 2850 line[self.EVENT_NAME], |
| 2851 line[self.TYPE], |
| 2852 line[self.PID], |
| 2853 line[self.TID], |
| 2854 line[self.PROCESSOR_ID], |
| 2855 line[self.TIMESTAMP], |
| 2856 ] + line[self.USER_DATA:] |
| 2857 |
| 2858 write_json('%s.json' % self._logname, list(trim(load_file())), True) |
| 2859 |
| 2860 def _convert_log(self, logformat): |
| 2861 """Converts the ETL trace to text representation. |
| 2862 |
| 2863 Normally, 'csv' is sufficient. If complex scripts are used (like eastern |
| 2864 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. |
| 2865 |
| 2866 Arguments: |
| 2867 - logformat: Text format to be generated, csv, csv_utf16 or xml. |
| 2868 |
| 2869 Use "tracerpt -?" for help. |
| 2870 """ |
| 2871 LOCALE_INVARIANT = 0x7F |
| 2872 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) |
| 2873 cmd_convert = [ |
| 2874 'tracerpt.exe', |
| 2875 '-l', self._logname + '.etl', |
| 2876 '-o', self._logname + '.' + logformat, |
| 2877 '-gmt', # Use UTC |
| 2878 '-y', # No prompt |
| 2879 # Use -of XML to get the header of each items after column 19, e.g. all |
| 2880 # the actual headers of 'User Data'. |
| 2881 ] |
| 2882 |
| 2883 if logformat == 'csv': |
| 2884 # tracerpt localizes the 'Type' column, for major brainfuck |
| 2885 # entertainment. I can't imagine any sane reason to do that. |
| 2886 cmd_convert.extend(['-of', 'CSV']) |
| 2887 elif logformat == 'csv_utf16': |
| 2888 # This causes it to use UTF-16, which doubles the log size but ensures |
| 2889 # the log is readable for non-ASCII characters. |
| 2890 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) |
| 2891 elif logformat == 'xml': |
| 2892 cmd_convert.extend(['-of', 'XML']) |
| 2893 else: |
| 2894 raise ValueError('Unexpected log format \'%s\'' % logformat) |
| 2895 logging.debug('Running: %s' % cmd_convert) |
| 2896 # This can take tens of minutes for large logs.
| 2897 # Redirects all output to stderr. |
| 2898 subprocess.check_call( |
| 2899 cmd_convert, |
| 2900 stdin=subprocess.PIPE, |
| 2901 stdout=sys.stderr, |
| 2902 stderr=sys.stderr) |
| 2903 |
| 2904 @staticmethod |
| 2905 def clean_trace(logname): |
| 2906 for ext in ('', '.csv', '.etl', '.json', '.xml'): |
| 2907 if os.path.isfile(logname + ext): |
| 2908 os.remove(logname + ext) |
| 2909 |
| 2910 @classmethod |
| 2911 def parse_log(cls, logname, blacklist): |
| 2912 logging.info('parse_log(%s, %s)' % (logname, blacklist)) |
| 2913 |
| 2914 def blacklist_more(filepath): |
| 2915 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that. |
| 2916 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath) |
| 2917 |
| 2918 data = read_json(logname) |
| 2919 lines = read_json(logname + '.json') |
| 2920 out = [] |
| 2921 for item in data['traces']: |
| 2922 context = cls.Context(blacklist_more, item['pid']) |
| 2923 for line in lines: |
| 2924 context.on_line(line) |
| 2925 out.append( |
| 2926 { |
| 2927 'results': context.to_results(), |
| 2928 'trace': item['trace'], |
| 2929 'output': item['output'], |
| 2930 }) |
| 2931 return out |
| 2932 |
| 2933 |
| 2934 def get_api(): |
| 2935 """Returns the correct implementation for the current OS.""" |
| 2936 if sys.platform == 'cygwin': |
| 2937 raise NotImplementedError( |
| 2938 'Not implemented for cygwin, start the script from Win32 python') |
| 2939 flavors = { |
| 2940 'win32': LogmanTrace, |
| 2941 'darwin': Dtrace, |
| 2942 'sunos5': Dtrace, |
| 2943 'freebsd7': Dtrace, |
| 2944 'freebsd8': Dtrace, |
| 2945 } |
| 2946 # Defaults to strace. |
| 2947 return flavors.get(sys.platform, Strace)() |
| 2948 |
| 2949 |
| 2950 def extract_directories(root_dir, files, blacklist): |
| 2951 """Detects if all the files in a directory are in |files| and if so, replace |
| 2952 the individual files by a Results.Directory instance. |
| 2953 |
| 2954 Takes a list of Results.File instances and returns a shorter list of |
| 2955 Results.File and Results.Directory instances. |
| 2956 |
| 2957 Arguments: |
| 2958 - root_dir: Optional base directory that shouldn't be searched further.
| 2959 - files: list of Results.File instances. |
| 2960 - blacklist: regexp of files to ignore, for example r'.+\.pyc'. |
| 2961 """ |
| 2962 logging.info( |
| 2963 'extract_directories(%s, %d files, ...)' % (root_dir, len(files))) |
| 2964 assert not (root_dir or '').endswith(os.path.sep), root_dir |
| 2965 assert not root_dir or (get_native_path_case(root_dir) == root_dir) |
| 2966 assert not any(isinstance(f, Results.Directory) for f in files) |
| 2967 # Remove non existent files. |
| 2968 files = [f for f in files if f.existent] |
| 2969 if not files: |
| 2970 return files |
| 2971 # All files must share the same root, which can be None. |
| 2972 assert len(set(f.root for f in files)) == 1, set(f.root for f in files) |
| 2973 |
| 2974 # Creates a {directory: {filename: File}} mapping, up to root. |
| 2975 buckets = {} |
| 2976 if root_dir: |
| 2977 buckets[root_dir] = {} |
| 2978 for fileobj in files: |
| 2979 path = fileobj.full_path |
| 2980 directory = os.path.dirname(path) |
| 2981 assert directory |
| 2982 # Do not use os.path.basename() so trailing os.path.sep is kept. |
| 2983 basename = path[len(directory)+1:] |
| 2984 files_in_directory = buckets.setdefault(directory, {}) |
| 2985 files_in_directory[basename] = fileobj |
| 2986 # Add all the directories recursively up to root. |
| 2987 while True: |
| 2988 old_d = directory |
| 2989 directory = os.path.dirname(directory) |
| 2990 if directory + os.path.sep == root_dir or directory == old_d: |
| 2991 break |
| 2992 buckets.setdefault(directory, {}) |
| 2993 |
| 2994 root_prefix = len(root_dir) + 1 if root_dir else 0 |
| 2995 for directory in sorted(buckets, reverse=True): |
| 2996 actual = set(f for f in os.listdir(directory) if not blacklist(f)) |
| 2997 expected = set(buckets[directory]) |
| 2998 if not (actual - expected): |
| 2999 parent = os.path.dirname(directory) |
| 3000 buckets[parent][os.path.basename(directory)] = Results.Directory( |
| 3001 root_dir, |
| 3002 directory[root_prefix:], |
| 3003 False, |
| 3004 sum(f.size for f in buckets[directory].itervalues()), |
| 3005 sum(f.nb_files for f in buckets[directory].itervalues())) |
| 3006 # Remove the whole bucket. |
| 3007 del buckets[directory] |
| 3008 |
| 3009 # Reverse the mapping with what remains. The original instances are returned, |
| 3010 # so the cached meta data is kept. |
| 3011 files = sum((x.values() for x in buckets.itervalues()), []) |
| 3012 return sorted(files, key=lambda x: x.path) |
| 3013 |
| 3014 |
| 3015 def trace(logfile, cmd, cwd, api, output): |
| 3016 """Traces an executable. Returns (returncode, output) from api. |
| 3017 |
| 3018 Arguments: |
| 3019 - logfile: file to write to. |
| 3020 - cmd: command to run. |
| 3021 - cwd: current directory to start the process in. |
| 3022 - api: a tracing api instance. |
| 3023 - output: if True, returns output, otherwise prints it at the console. |
| 3024 """ |
| 3025 cmd = fix_python_path(cmd) |
| 3026 api.clean_trace(logfile) |
| 3027 with api.get_tracer(logfile) as tracer: |
| 3028 return tracer.trace(cmd, cwd, 'default', output) |
| 3029 |
| 3030 |
| 3031 def load_trace(logfile, root_dir, api, blacklist): |
| 3032 """Loads a trace file and returns the Results instance. |
| 3033 |
| 3034 Arguments: |
| 3035 - logfile: File to load. |
| 3036 - root_dir: Root directory to use to determine if a file is relevant to the |
| 3037 trace or not. |
| 3038 - api: A tracing api instance. |
| 3039 - blacklist: Optional blacklist function to filter out unimportant files. |
| 3040 """ |
| 3041 data = api.parse_log(logfile, (blacklist or (lambda _: False))) |
| 3042 assert len(data) == 1, 'Expected exactly one trace, got %d' % len(data)
| 3043 if 'exception' in data[0]: |
| 3044 # It got an exception, raise it. |
| 3045 raise data[0]['exception'] |
| 3046 results = data[0]['results'] |
| 3047 if root_dir: |
| 3048 results = results.strip_root(root_dir) |
| 3049 return results |
| 3050 |
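| # Follow-up sketch (illustrative only): parse the log written by trace() above,
| # keeping only files under a hypothetical checkout directory.
| #   results = load_trace('/tmp/app.log', '/path/to/checkout', get_api(), None)
| #   print('\n'.join(f.path for f in results.existent))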
| 3051 |
| 3052 def CMDclean(args): |
| 3053 """Cleans up traces.""" |
| 3054 parser = OptionParserTraceInputs(command='clean') |
| 3055 options, args = parser.parse_args(args) |
| 3056 api = get_api() |
| 3057 api.clean_trace(options.log) |
| 3058 return 0 |
| 3059 |
| 3060 |
| 3061 def CMDtrace(args): |
| 3062 """Traces an executable.""" |
| 3063 parser = OptionParserTraceInputs(command='trace') |
| 3064 parser.allow_interspersed_args = False |
| 3065 parser.add_option( |
| 3066 '-q', '--quiet', action='store_true', |
| 3067 help='Redirects traced executable output to /dev/null') |
| 3068 options, args = parser.parse_args(args) |
| 3069 |
| 3070 if not args: |
| 3071 parser.error('Please provide a command to run') |
| 3072 |
| 3073 if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK): |
| 3074 args[0] = os.path.abspath(args[0]) |
| 3075 |
| 3076 api = get_api() |
| 3077 return trace(options.log, args, os.getcwd(), api, options.quiet)[0] |
| 3078 |
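| # Example invocation (hypothetical log path and executable); -l is required and
| # everything after the traced executable is passed to it verbatim:
| #   %prog trace -l /tmp/app.log ./app --some-flag
| # Pass -q to redirect the traced executable's output to /dev/null.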
| 3079 |
| 3080 def CMDread(args): |
| 3081 """Reads the logs and prints the result.""" |
| 3082 parser = OptionParserTraceInputs(command='read') |
| 3083 parser.add_option( |
| 3084 '-V', '--variable', |
| 3085 nargs=2, |
| 3086 action='append', |
| 3087 dest='variables', |
| 3088 metavar='VAR_NAME directory', |
| 3089 default=[], |
| 3090 help=('Variables to replace directory prefixes with in the results. Example: '
| 3091 '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of your '
| 3092 'home dir with $HOME') % getpass.getuser())
| 3093 parser.add_option( |
| 3094 '--root-dir', |
| 3095 help='Root directory to base everything off of. Anything outside of '
| 3096 'this directory will not be reported')
| 3097 parser.add_option( |
| 3098 '-j', '--json', action='store_true', |
| 3099 help='Outputs raw result data as json') |
| 3100 parser.add_option( |
| 3101 '-b', '--blacklist', action='append', default=[], |
| 3102 help='List of regexps to use as a blacklist filter')
| 3103 options, args = parser.parse_args(args) |
| 3104 |
| 3105 if options.root_dir: |
| 3106 options.root_dir = os.path.abspath(options.root_dir) |
| 3107 |
| 3108 variables = dict(options.variables) |
| 3109 api = get_api() |
| 3110 def blacklist(f): |
| 3111 return any(re.match(b, f) for b in options.blacklist) |
| 3112 data = api.parse_log(options.log, blacklist) |
| 3113 # Process each trace. |
| 3114 output_as_json = [] |
| 3115 for item in data: |
| 3116 if 'exception' in item: |
| 3117 print >> sys.stderr, ( |
| 3118 'Trace %s: Got an exception: %s' % (item['trace'], item['exception'])) |
| 3119 continue |
| 3120 results = item['results'] |
| 3121 if options.root_dir: |
| 3122 results = results.strip_root(options.root_dir) |
| 3123 |
| 3124 if options.json: |
| 3125 output_as_json.append(results.flatten()) |
| 3126 else: |
| 3127 simplified = extract_directories( |
| 3128 options.root_dir, results.files, blacklist) |
| 3129 simplified = [f.replace_variables(variables) for f in simplified] |
| 3130 if len(data) > 1: |
| 3131 print('Trace: %s' % item['trace']) |
| 3132 print('Total: %d' % len(results.files)) |
| 3133 print('Non existent: %d' % len(results.non_existent)) |
| 3134 for f in results.non_existent: |
| 3135 print(' %s' % f.path) |
| 3136 print( |
| 3137 'Interesting: %d reduced to %d' % ( |
| 3138 len(results.existent), len(simplified))) |
| 3139 for f in simplified: |
| 3140 print(' %s' % f.path) |
| 3141 |
| 3142 if options.json: |
| 3143 write_json(sys.stdout, output_as_json, False) |
| 3144 return 0 |
| 3145 |
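| # Example invocations (paths are hypothetical), combining the flags defined
| # above:
| #   %prog read -l /tmp/app.log --root-dir /path/to/checkout
| #   %prog read -l /tmp/app.log -j -b '.+\.pyc'
| #   %prog read -l /tmp/app.log -V '$HOME' /home/user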
| 3146 |
| 3147 class OptionParserWithLogging(optparse.OptionParser): |
| 3148 """Adds --verbose option.""" |
| 3149 def __init__(self, verbose=0, **kwargs): |
| 3150 optparse.OptionParser.__init__(self, **kwargs) |
| 3151 self.add_option( |
| 3152 '-v', '--verbose', |
| 3153 action='count', |
| 3154 default=verbose, |
| 3155 help='Use multiple times to increase verbosity') |
| 3156 |
| 3157 def parse_args(self, *args, **kwargs): |
| 3158 options, args = optparse.OptionParser.parse_args(self, *args, **kwargs) |
| 3159 levels = [logging.ERROR, logging.INFO, logging.DEBUG] |
| 3160 logging.basicConfig( |
| 3161 level=levels[min(len(levels)-1, options.verbose)], |
| 3162 format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s') |
| 3163 return options, args |
| 3164 |
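| # Illustrative mapping of the --verbose count onto the `levels` list above:
| #   parser = OptionParserWithLogging()
| #   parser.parse_args(['-v', '-v'])  # 0 -> ERROR, 1 -> INFO, 2 or more -> DEBUG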
| 3165 |
| 3166 class OptionParserWithNiceDescription(OptionParserWithLogging): |
| 3167 """Generates the description with the command's docstring.""" |
| 3168 def __init__(self, **kwargs): |
| 3169 """Sets 'description' and 'usage' if not already specified.""" |
| 3170 command = kwargs.pop('command', 'help') |
| 3171 kwargs.setdefault( |
| 3172 'description', |
| 3173 re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__)) |
| 3174 kwargs.setdefault('usage', '%%prog %s [options]' % command) |
| 3175 OptionParserWithLogging.__init__(self, **kwargs) |
| 3176 |
| 3177 |
| 3178 class OptionParserTraceInputs(OptionParserWithNiceDescription): |
| 3179 """Adds automatic --log handling.""" |
| 3180 def __init__(self, **kwargs): |
| 3181 OptionParserWithNiceDescription.__init__(self, **kwargs) |
| 3182 self.add_option( |
| 3183 '-l', '--log', help='Log file to generate or read, required') |
| 3184 |
| 3185 def parse_args(self, *args, **kwargs): |
| 3186 """Makes sure the paths make sense. |
| 3187 |
| 3188 On Windows, / and \ are often mixed together in a path. |
| 3189 """ |
| 3190 options, args = OptionParserWithNiceDescription.parse_args( |
| 3191 self, *args, **kwargs) |
| 3192 if not options.log: |
| 3193 self.error('Must supply a log file with -l') |
| 3194 options.log = os.path.abspath(options.log) |
| 3195 return options, args |
| 3196 |
| 3197 |
| 3198 def extract_documentation(): |
| 3199 """Returns a dict {command: description} for each of documented command.""" |
| 3200 commands = ( |
| 3201 fn[3:] |
| 3202 for fn in dir(sys.modules['__main__']) |
| 3203 if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__) |
| 3204 return dict((fn, get_command_handler(fn).__doc__) for fn in commands) |
| 3205 |
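| # For this module the resulting dict (sketch) maps each CMD* handler defined
| # here to its docstring, e.g.:
| #   {'clean': 'Cleans up traces.', 'trace': 'Traces an executable.', ...}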
| 3206 |
| 3207 def CMDhelp(args): |
| 3208 """Prints list of commands or help for a specific command.""" |
| 3209 doc = extract_documentation() |
| 3210 # Calculates the optimal offset. |
| 3211 offset = max(len(cmd) for cmd in doc) |
| 3212 format_str = ' %-' + str(offset + 2) + 's %s' |
| 3213 # Generates a one-line description of each command.
| 3214 commands_description = '\n'.join( |
| 3215 format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc)) |
| 3216 |
| 3217 parser = OptionParserWithNiceDescription( |
| 3218 usage='%prog <command> [options]', |
| 3219 description='Commands are:\n%s\n' % commands_description) |
| 3220 parser.format_description = lambda _: parser.description |
| 3221 |
| 3222 # Strip out any -h or --help argument. |
| 3223 _, args = parser.parse_args([i for i in args if i not in ('-h', '--help')])
| 3224 if len(args) == 1: |
| 3225 if not get_command_handler(args[0]): |
| 3226 parser.error('Unknown command %s' % args[0]) |
| 3227 # The command was "%prog help command"; re-run as "%prog command --help"
| 3228 # so the command's help is printed correctly.
| 3229 return main(args + ['--help']) |
| 3230 elif args: |
| 3231 parser.error('Unknown argument "%s"' % ' '.join(args)) |
| 3232 parser.print_help() |
| 3233 return 0 |
| 3234 |
| 3235 |
| 3236 def get_command_handler(name): |
| 3237 """Returns the command handler or CMDhelp if it doesn't exist.""" |
| 3238 return getattr(sys.modules['__main__'], 'CMD%s' % name, None) |
| 3239 |
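| # Illustrative lookups against the CMD* functions defined in this module:
| #   get_command_handler('trace')  # -> CMDtrace
| #   get_command_handler('bogus')  # -> None (hypothetical, unknown command)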
| 3240 |
| 3241 def main_impl(argv): |
| 3242 command = get_command_handler(argv[0] if argv else 'help') |
| 3243 if not command: |
| 3244 return CMDhelp(argv) |
| 3245 return command(argv[1:]) |
| 3246 |
| 3247 def main(argv): |
| 3248 try: |
| 3249 return main_impl(argv)
| 3250 except TracingFailure, e: |
| 3251 sys.stderr.write('\nError: ') |
| 3252 sys.stderr.write(str(e)) |
| 3253 sys.stderr.write('\n') |
| 3254 return 1 |
| 3255 |
| 3256 |
| 3257 if __name__ == '__main__': |
| 3258 sys.exit(main(sys.argv[1:])) |