OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # coding=utf-8 | |
3 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
4 # Use of this source code is governed by a BSD-style license that can be | |
5 # found in the LICENSE file. | |
6 | |
7 """Traces an executable and its child processes and extract the files accessed | |
8 by them. | |
9 | |
10 The implementation uses OS-specific API. The native Kernel logger and the ETL | |
11 interface is used on Windows. Dtrace is used on OSX. Strace is used otherwise. | |
12 The OS-specific implementation is hidden in an 'API' interface. | |
13 | |
14 The results are embedded in a Results instance. The tracing is done in two | |
15 phases, the first is to do the actual trace and generate an | |
16 implementation-specific log file. Then the log file is parsed to extract the | |
17 information, including the individual child processes and the files accessed | |
18 from the log. | |
19 """ | |
20 | |
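# A minimal usage sketch of the two-phase flow described above, assuming a | |
# strace-based run on Linux (the paths and the trace name are illustrative | |
# only): | |
# | |
#   api = Strace() | |
#   with api.get_tracer('/tmp/trace.json') as tracer: | |
#     tracer.trace(['/bin/ls', '/'], '/tmp', 'mytrace', True) | |
#   for item in Strace.parse_log('/tmp/trace.json', lambda _: False): | |
#     print item['results'].files | |
| |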
21 import codecs | |
22 import csv | |
23 import getpass | |
24 import glob | |
25 import json | |
26 import logging | |
27 import optparse | |
28 import os | |
29 import re | |
30 import subprocess | |
31 import sys | |
32 import tempfile | |
33 import threading | |
34 import time | |
35 import weakref | |
36 | |
37 ## OS-specific imports | |
38 | |
39 if sys.platform == 'win32': | |
40 from ctypes.wintypes import byref, create_unicode_buffer, c_int, c_wchar_p | |
41 from ctypes.wintypes import windll, FormatError # pylint: disable=E0611 | |
42 from ctypes.wintypes import GetLastError # pylint: disable=E0611 | |
43 elif sys.platform == 'darwin': | |
44 import Carbon.File # pylint: disable=F0401 | |
45 import MacOS # pylint: disable=F0401 | |
46 | |
47 | |
48 BASE_DIR = os.path.dirname(os.path.abspath(__file__)) | |
49 ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR)) | |
50 | |
51 | |
52 class TracingFailure(Exception): | |
53 """An exception occured during tracing.""" | |
54 def __init__(self, description, pid, line_number, line, *args): | |
55 super(TracingFailure, self).__init__( | |
56 description, pid, line_number, line, *args) | |
57 self.description = description | |
58 self.pid = pid | |
59 self.line_number = line_number | |
60 self.line = line | |
61 self.extra = args | |
62 | |
63 def __str__(self): | |
64 out = self.description | |
65 if self.pid: | |
66 out += '\npid: %d' % self.pid | |
67 if self.line_number: | |
68 out += '\nline: %d' % self.line_number | |
69 if self.line: | |
70 out += '\n%s' % self.line | |
71 if self.extra: | |
72 out += '\n' + ', '.join(map(str, filter(None, self.extra))) | |
73 return out | |
74 | |
75 | |
76 ## OS-specific functions | |
77 | |
78 if sys.platform == 'win32': | |
79 def QueryDosDevice(drive_letter): | |
80 """Returns the Windows 'native' path for a DOS drive letter.""" | |
81 assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter | |
82 # Guesswork. QueryDosDeviceW never returns the required number of bytes. | |
83 chars = 1024 | |
84 drive_letter = unicode(drive_letter) | |
85 p = create_unicode_buffer(chars) | |
86 if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars): | |
87 err = GetLastError() | |
88 if err: | |
89 # pylint: disable=E0602 | |
90 raise WindowsError( | |
91 err, | |
92 'QueryDosDevice(%s): %s (%d)' % ( | |
93 str(drive_letter), FormatError(err), err)) | |
94 return p.value | |
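# For reference (illustrative; the volume number varies per machine), | |
# QueryDosDevice('C:') typically returns something like | |
# u'\Device\HarddiskVolume1'. | |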
95 | |
96 | |
97 def GetShortPathName(long_path): | |
98 """Returns the Windows short path equivalent for a 'long' path.""" | |
99 long_path = unicode(long_path) | |
100 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is | |
101 # not enforced. | |
102 if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'): | |
103 long_path = '\\\\?\\' + long_path | |
104 chars = windll.kernel32.GetShortPathNameW(long_path, None, 0) | |
105 if chars: | |
106 p = create_unicode_buffer(chars) | |
107 if windll.kernel32.GetShortPathNameW(long_path, p, chars): | |
108 return p.value | |
109 | |
110 err = GetLastError() | |
111 if err: | |
112 # pylint: disable=E0602 | |
113 raise WindowsError( | |
114 err, | |
115 'GetShortPathName(%s): %s (%d)' % ( | |
116 str(long_path), FormatError(err), err)) | |
117 | |
118 | |
119 def GetLongPathName(short_path): | |
120 """Returns the Windows long path equivalent for a 'short' path.""" | |
121 short_path = unicode(short_path) | |
122 # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is | |
123 # not enforced. | |
124 if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'): | |
125 short_path = '\\\\?\\' + short_path | |
126 chars = windll.kernel32.GetLongPathNameW(short_path, None, 0) | |
127 if chars: | |
128 p = create_unicode_buffer(chars) | |
129 if windll.kernel32.GetLongPathNameW(short_path, p, chars): | |
130 return p.value | |
131 | |
132 err = GetLastError() | |
133 if err: | |
134 # pylint: disable=E0602 | |
135 raise WindowsError( | |
136 err, | |
137 'GetLongPathName(%s): %s (%d)' % ( | |
138 str(short_path), FormatError(err), err)) | |
139 | |
140 | |
141 def get_current_encoding(): | |
142 """Returns the 'ANSI' code page associated to the process.""" | |
143 return 'cp%d' % int(windll.kernel32.GetACP()) | |
144 | |
145 | |
146 class DosDriveMap(object): | |
147 """Maps \Device\HarddiskVolumeN to N: on Windows.""" | |
148 # Keep one global cache. | |
149 _MAPPING = {} | |
150 | |
151 def __init__(self): | |
152 """Lazy loads the cache.""" | |
153 if not self._MAPPING: | |
154 # This is related to UNC resolver on windows. Ignore that. | |
155 self._MAPPING['\\Device\\Mup'] = None | |
156 self._MAPPING['\\SystemRoot'] = os.environ['SystemRoot'] | |
157 | |
158 for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)): | |
159 try: | |
160 letter = '%s:' % letter | |
161 mapped = QueryDosDevice(letter) | |
162 if mapped in self._MAPPING: | |
163 logging.warn( | |
164 ('Two drives, \'%s\' and \'%s\', are mapped to the same disk. ' | |
165 'Drive letters are a user-mode concept and the kernel ' | |
166 'traces only have NT paths, so all accesses will be ' | |
167 'associated with the first drive letter, independent of the ' | |
168 'actual letter used by the code') % ( | |
169 self._MAPPING[mapped], letter)) | |
170 else: | |
171 self._MAPPING[mapped] = letter | |
172 except WindowsError: # pylint: disable=E0602 | |
173 pass | |
174 | |
175 def to_win32(self, path): | |
176 """Converts a native NT path to Win32/DOS compatible path.""" | |
177 match = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path) | |
178 if not match: | |
179 raise ValueError( | |
180 'Can\'t convert %s into a Win32 compatible path' % path, | |
181 path) | |
182 if not match.group(1) in self._MAPPING: | |
183 # Unmapped partitions may be accessed by windows for the | |
184 # fun of it while the test is running. Discard these. | |
185 return None | |
186 drive = self._MAPPING[match.group(1)] | |
187 if not drive or not match.group(2): | |
188 return drive | |
189 return drive + match.group(2) | |
190 | |
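# Illustrative only: with a mapping {'\Device\HarddiskVolume1': 'C:'}, calling | |
# to_win32('\Device\HarddiskVolume1\Windows\win.ini') returns | |
# 'C:\Windows\win.ini'. | |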
191 | |
192 def isabs(path): | |
193 """Accepts X: as an absolute path, unlike python's os.path.isabs().""" | |
194 return os.path.isabs(path) or len(path) == 2 and path[1] == ':' | |
195 | |
196 | |
197 def get_native_path_case(p): | |
198 """Returns the native path case for an existing file. | |
199 | |
200 On Windows, removes any leading '\\?\'. | |
201 """ | |
202 if not isabs(p): | |
203 raise ValueError( | |
204 'Can\'t get native path case for a non-absolute path: %s' % p, | |
205 p) | |
206 # Windows used to have an option to turn on case sensitivity for the non-Win32 | |
207 # subsystem, but that's out of scope here and isn't supported anymore. | |
208 # Go figure why GetShortPathName() is needed. | |
209 try: | |
210 out = GetLongPathName(GetShortPathName(p)) | |
211 except OSError, e: | |
212 if e.args[0] in (2, 3, 5): | |
213 # The path does not exist. Try to recurse and reconstruct the path. | |
214 base = os.path.dirname(p) | |
215 rest = os.path.basename(p) | |
216 return os.path.join(get_native_path_case(base), rest) | |
217 raise | |
218 if out.startswith('\\\\?\\'): | |
219 out = out[4:] | |
220 # Always upper case the first letter since GetLongPathName() will return the | |
221 # drive letter in the case it was given. | |
222 return out[0].upper() + out[1:] | |
223 | |
224 | |
225 def CommandLineToArgvW(command_line): | |
226 """Splits a commandline into argv using CommandLineToArgvW().""" | |
227 # http://msdn.microsoft.com/library/windows/desktop/bb776391.aspx | |
228 size = c_int() | |
229 ptr = windll.shell32.CommandLineToArgvW(unicode(command_line), byref(size)) | |
230 try: | |
231 return [arg for arg in (c_wchar_p * size.value).from_address(ptr)] | |
232 finally: | |
233 windll.kernel32.LocalFree(ptr) | |
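# Illustrative only: CommandLineToArgvW(u'foo.exe "my file.txt" /b') is | |
# expected to yield [u'foo.exe', u'my file.txt', u'/b']. | |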
234 | |
235 | |
236 elif sys.platform == 'darwin': | |
237 | |
238 | |
239 # On non-windows, keep the stdlib behavior. | |
240 isabs = os.path.isabs | |
241 | |
242 | |
243 def _find_item_native_case(root_path, item): | |
244 """Gets the native path case of a single item based at root_path. | |
245 | |
246 There is no API to get the native path case of symlinks on OSX. So it | |
247 needs to be done the slow way. | |
248 """ | |
249 item = item.lower() | |
250 for element in os.listdir(root_path): | |
251 if element.lower() == item: | |
252 return element | |
253 | |
254 | |
255 def _native_case(p): | |
256 """Gets the native path case. Warning: this function resolves symlinks.""" | |
257 logging.debug('native_case(%s)' % p) | |
258 try: | |
259 rel_ref, _ = Carbon.File.FSPathMakeRef(p) | |
260 out = rel_ref.FSRefMakePath() | |
261 if p.endswith(os.path.sep) and not out.endswith(os.path.sep): | |
262 return out + os.path.sep | |
263 return out | |
264 except MacOS.Error, e: | |
265 if e.args[0] in (-43, -120): | |
266 # The path does not exist. Try to recurse and reconstruct the path. | |
267 # -43 means file not found. | |
268 # -120 means directory not found. | |
269 base = os.path.dirname(p) | |
270 rest = os.path.basename(p) | |
271 return os.path.join(_native_case(base), rest) | |
272 raise OSError( | |
273 e.args[0], 'Failed to get native path for %s' % p, p, e.args[1]) | |
274 | |
275 | |
276 def _split_at_symlink_native(base_path, rest): | |
277 """Returns the native path for a symlink.""" | |
278 base, symlink, rest = split_at_symlink(base_path, rest) | |
279 if symlink: | |
280 if not base_path: | |
281 base_path = base | |
282 else: | |
283 base_path = safe_join(base_path, base) | |
284 symlink = _find_item_native_case(base_path, symlink) | |
285 return base, symlink, rest | |
286 | |
287 | |
288 def get_native_path_case(path): | |
289 """Returns the native path case for an existing file. | |
290 | |
291 Technically, only HFS+ on OSX is case-preserving and case-insensitive. | |
292 This is the default setting on HFS+ but it can be changed. | |
293 """ | |
294 if not isabs(path): | |
295 raise ValueError( | |
296 'Can\'t get native path case for a non-absolute path: %s' % path, | |
297 path) | |
298 if path.startswith('/dev'): | |
299 # /dev is not visible from Carbon, causing an exception. | |
300 return path | |
301 | |
302 # Starts assuming there is no symlink along the path. | |
303 resolved = _native_case(path) | |
304 if resolved.lower() == path.lower(): | |
305 # This code path is considerably faster. | |
306 return resolved | |
307 | |
308 # There was a symlink, process it. | |
309 base, symlink, rest = _split_at_symlink_native(None, path) | |
310 assert symlink, (path, base, symlink, rest, resolved) | |
311 prev = base | |
312 base = safe_join(_native_case(base), symlink) | |
313 assert len(base) > len(prev) | |
314 while rest: | |
315 prev = base | |
316 relbase, symlink, rest = _split_at_symlink_native(base, rest) | |
317 base = safe_join(base, relbase) | |
318 assert len(base) > len(prev), (prev, base, symlink) | |
319 if symlink: | |
320 base = safe_join(base, symlink) | |
321 assert len(base) > len(prev), (prev, base, symlink) | |
322 # Make sure no symlink was resolved. | |
323 assert base.lower() == path.lower(), (base, path) | |
324 return base | |
325 | |
326 | |
327 else: # OSes other than Windows and OSX. | |
328 | |
329 | |
330 # On non-windows, keep the stdlib behavior. | |
331 isabs = os.path.isabs | |
332 | |
333 | |
334 def get_native_path_case(path): | |
335 """Returns the native path case for an existing file. | |
336 | |
337 On OSes other than OSX and Windows, assume the file system is | |
338 case-sensitive. | |
339 | |
340 TODO(maruel): This is not strictly true. Implement if necessary. | |
341 """ | |
342 if not isabs(path): | |
343 raise ValueError( | |
344 'Can\'t get native path case for a non-absolute path: %s' % path, | |
345 path) | |
346 # Give up on cygwin, as GetLongPathName() can't be called. | |
347 # Linux traces tend not to be normalized, so use this occasion to normalize | |
348 # the path. The implementations for the other OSes already normalize the | |
349 # path, so it needs to be done here as well to be consistent across OSes. | |
350 out = os.path.normpath(path) | |
351 if path.endswith(os.path.sep) and not out.endswith(os.path.sep): | |
352 return out + os.path.sep | |
353 return out | |
354 | |
355 | |
356 if sys.platform != 'win32': # All non-Windows OSes. | |
357 | |
358 | |
359 def safe_join(*args): | |
360 """Joins path elements like os.path.join() but doesn't abort on absolute | |
361 path. | |
362 | |
363 os.path.join('foo', '/bar') == '/bar' | |
364 but safe_join('foo', '/bar') == 'foo/bar'. | |
365 """ | |
366 out = '' | |
367 for element in args: | |
368 if element.startswith(os.path.sep): | |
369 if out.endswith(os.path.sep): | |
370 out += element[1:] | |
371 else: | |
372 out += element | |
373 else: | |
374 if out.endswith(os.path.sep): | |
375 out += element | |
376 else: | |
377 out += os.path.sep + element | |
378 return out | |
379 | |
380 | |
381 def split_at_symlink(base_dir, relfile): | |
382 """Scans each component of relfile and cut the string at the symlink if | |
383 there is any. | |
384 | |
385 Returns a tuple (base_path, symlink, rest), with symlink == rest == None if | |
386 not symlink was found. | |
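| |
Example (illustrative): with base_dir='/usr' and relfile='lib/python2.7', | |
where '/usr/lib' is a symlink, this returns ('', 'lib', '/python2.7'). | |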
387 """ | |
388 if base_dir: | |
389 assert relfile | |
390 assert os.path.isabs(base_dir) | |
391 index = 0 | |
392 else: | |
393 assert os.path.isabs(relfile) | |
394 index = 1 | |
395 | |
396 def at_root(rest): | |
397 if base_dir: | |
398 return safe_join(base_dir, rest) | |
399 return rest | |
400 | |
401 while True: | |
402 try: | |
403 index = relfile.index(os.path.sep, index) | |
404 except ValueError: | |
405 index = len(relfile) | |
406 full = at_root(relfile[:index]) | |
407 if os.path.islink(full): | |
408 # A symlink! | |
409 base = os.path.dirname(relfile[:index]) | |
410 symlink = os.path.basename(relfile[:index]) | |
411 rest = relfile[index:] | |
412 logging.debug( | |
413 'split_at_symlink(%s, %s) -> (%s, %s, %s)' % | |
414 (base_dir, relfile, base, symlink, rest)) | |
415 return base, symlink, rest | |
416 if index == len(relfile): | |
417 break | |
418 index += 1 | |
419 return relfile, None, None | |
420 | |
421 | |
422 def fix_python_path(cmd): | |
423 """Returns the fixed command line to call the right python executable.""" | |
424 out = cmd[:] | |
425 if out[0] == 'python': | |
426 out[0] = sys.executable | |
427 elif out[0].endswith('.py'): | |
428 out.insert(0, sys.executable) | |
429 return out | |
430 | |
431 | |
432 def create_thunk(): | |
433 handle, name = tempfile.mkstemp(prefix='trace_inputs_thunk', suffix='.py') | |
434 os.write( | |
435 handle, | |
436 ( | |
437 'import subprocess\n' | |
438 'import sys\n' | |
439 'sys.exit(subprocess.call(sys.argv[2:]))\n' | |
440 )) | |
441 os.close(handle) | |
442 return name | |
443 | |
444 | |
445 def strace_process_quoted_arguments(text): | |
446 """Extracts quoted arguments on a string and return the arguments as a list. | |
447 | |
448 Implemented as an automaton. Supports incomplete strings in the form | |
449 '"foo"...'. | |
450 | |
451 Example: | |
452 With text = '"foo", "bar"', the function will return ['foo', 'bar'] | |
453 | |
454 TODO(maruel): Implement escaping. | |
455 """ | |
456 # All the possible states of the DFA. | |
457 ( NEED_QUOTE, # Beginning of a new argument. | |
458 INSIDE_STRING, # Inside an argument. | |
459 ESCAPED, # Found a '\' inside a quote. Treat the next char as-is. | |
460 NEED_COMMA_OR_DOT, # Right after the closing quote of an argument. Could be | |
461 # a series of 3 dots or a comma. | |
462 NEED_SPACE, # Right after a comma | |
463 NEED_DOT_2, # Found a dot, need a second one. | |
464 NEED_DOT_3, # Found second dot, need a third one. | |
465 NEED_COMMA, # Found third dot, need a comma. | |
466 ) = range(8) | |
467 | |
468 state = NEED_QUOTE | |
469 out = [] | |
470 for index, char in enumerate(text): | |
471 if char == '"': | |
472 if state == NEED_QUOTE: | |
473 state = INSIDE_STRING | |
474 # A new argument was found. | |
475 out.append('') | |
476 elif state == INSIDE_STRING: | |
477 # The argument is now closed. | |
478 state = NEED_COMMA_OR_DOT | |
479 elif state == ESCAPED: | |
480 out[-1] += char | |
481 state = INSIDE_STRING | |
482 else: | |
483 raise ValueError( | |
484 'Can\'t process char at column %d for: %r' % (index, text), | |
485 index, | |
486 text) | |
487 elif char == ',': | |
488 if state in (NEED_COMMA_OR_DOT, NEED_COMMA): | |
489 state = NEED_SPACE | |
490 elif state == INSIDE_STRING: | |
491 out[-1] += char | |
492 elif state == ESCAPED: | |
493 out[-1] += char | |
494 state = INSIDE_STRING | |
495 else: | |
496 raise ValueError( | |
497 'Can\'t process char at column %d for: %r' % (index, text), | |
498 index, | |
499 text) | |
500 elif char == ' ': | |
501 if state == NEED_SPACE: | |
502 state = NEED_QUOTE | |
503 elif state == INSIDE_STRING: | |
504 out[-1] += char | |
505 elif state == ESCAPED: | |
506 out[-1] += char | |
507 state = INSIDE_STRING | |
508 else: | |
509 raise ValueError( | |
510 'Can\'t process char at column %d for: %r' % (index, text), | |
511 index, | |
512 text) | |
513 elif char == '.': | |
514 if state == NEED_COMMA_OR_DOT: | |
515 # The string is incomplete; this means the strace -s flag should be | |
516 # increased. | |
517 state = NEED_DOT_2 | |
518 elif state == NEED_DOT_2: | |
519 state = NEED_DOT_3 | |
520 elif state == NEED_DOT_3: | |
521 state = NEED_COMMA | |
522 elif state == INSIDE_STRING: | |
523 out[-1] += char | |
524 elif state == ESCAPED: | |
525 out[-1] += char | |
526 state = INSIDE_STRING | |
527 else: | |
528 raise ValueError( | |
529 'Can\'t process char at column %d for: %r' % (index, text), | |
530 index, | |
531 text) | |
532 elif char == '\\': | |
533 if state == ESCAPED: | |
534 out[-1] += char | |
535 state = INSIDE_STRING | |
536 elif state == INSIDE_STRING: | |
537 state = ESCAPED | |
538 else: | |
539 raise ValueError( | |
540 'Can\'t process char at column %d for: %r' % (index, text), | |
541 index, | |
542 text) | |
543 else: | |
544 if state == INSIDE_STRING: | |
545 out[-1] += char | |
546 else: | |
547 raise ValueError( | |
548 'Can\'t process char at column %d for: %r' % (index, text), | |
549 index, | |
550 text) | |
551 if state not in (NEED_COMMA, NEED_COMMA_OR_DOT): | |
552 raise ValueError( | |
553 'String is incorrectly terminated: %r' % text, | |
554 text) | |
555 return out | |
556 | |
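# Illustrative only: strace_process_quoted_arguments('"/bin/ls", "-l", "/tm"...') | |
# returns ['/bin/ls', '-l', '/tm']; the trailing '...' marks a string that was | |
# truncated because of the strace -s limit. | |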
557 | |
558 def read_json(filepath): | |
559 with open(filepath, 'r') as f: | |
560 return json.load(f) | |
561 | |
562 | |
563 def write_json(filepath_or_handle, data, dense): | |
564 """Writes data into filepath or file handle encoded as json. | |
565 | |
566 If dense is True, the json is packed. Otherwise, it is human readable. | |
567 """ | |
568 if hasattr(filepath_or_handle, 'write'): | |
569 if dense: | |
570 filepath_or_handle.write( | |
571 json.dumps(data, sort_keys=True, separators=(',',':'))) | |
572 else: | |
573 filepath_or_handle.write(json.dumps(data, sort_keys=True, indent=2)) | |
574 else: | |
575 with open(filepath_or_handle, 'wb') as f: | |
576 if dense: | |
577 json.dump(data, f, sort_keys=True, separators=(',',':')) | |
578 else: | |
579 json.dump(data, f, sort_keys=True, indent=2) | |
580 | |
581 | |
582 class Results(object): | |
583 """Results of a trace session.""" | |
584 | |
585 class _TouchedObject(object): | |
586 """Something, a file or a directory, that was accessed.""" | |
587 def __init__(self, root, path, tainted, size, nb_files): | |
588 logging.debug( | |
589 '%s(%s, %s, %s, %s, %s)' % | |
590 (self.__class__.__name__, root, path, tainted, size, nb_files)) | |
591 self.root = root | |
592 self.path = path | |
593 self.tainted = tainted | |
594 self.nb_files = nb_files | |
595 # Can be used as a cache or a default value, depending on context. | |
596 self._size = size | |
597 # These are cache only. | |
598 self._real_path = None | |
599 | |
600 # Check internal consistency. | |
601 assert path, path | |
602 assert tainted or bool(root) != bool(isabs(path)), (root, path) | |
603 assert tainted or ( | |
604 not os.path.exists(self.full_path) or | |
605 (self.full_path == get_native_path_case(self.full_path))), ( | |
606 tainted, self.full_path, get_native_path_case(self.full_path)) | |
607 | |
608 @property | |
609 def existent(self): | |
610 return self.size != -1 | |
611 | |
612 @property | |
613 def full_path(self): | |
614 if self.root: | |
615 return os.path.join(self.root, self.path) | |
616 return self.path | |
617 | |
618 @property | |
619 def real_path(self): | |
620 """Returns the path with symlinks resolved.""" | |
621 if not self._real_path: | |
622 self._real_path = os.path.realpath(self.full_path) | |
623 return self._real_path | |
624 | |
625 @property | |
626 def size(self): | |
627 """File's size. -1 is not existent.""" | |
628 if self._size is None and not self.tainted: | |
629 try: | |
630 self._size = os.stat(self.full_path).st_size | |
631 except OSError: | |
632 self._size = -1 | |
633 return self._size | |
634 | |
635 def flatten(self): | |
636 """Returns a dict representing this object. | |
637 | |
638 A 'size' of 0 means the file was only touched and not read. | |
639 """ | |
640 return { | |
641 'path': self.path, | |
642 'size': self.size, | |
643 } | |
644 | |
645 def replace_variables(self, variables): | |
646 """Replaces the root of this File with one of the variables if it matches. | |
647 | |
648 If a variable replacement occurs, the cloned object becomes tainted. | |
649 """ | |
650 for variable, root_path in variables.iteritems(): | |
651 if self.path.startswith(root_path): | |
652 return self._clone( | |
653 self.root, variable + self.path[len(root_path):], True) | |
654 # No need to clone, returns ourself. | |
655 return self | |
656 | |
657 def strip_root(self, root): | |
658 """Returns a clone of itself with 'root' stripped off.""" | |
659 # Check internal consistency. | |
660 assert self.tainted or (isabs(root) and root.endswith(os.path.sep)), root | |
661 if not self.full_path.startswith(root): | |
662 # Now try to resolve the symlinks to see if it can be reached this way. | |
663 # Only try *after* trying without resolving symlink. | |
664 if not self.real_path.startswith(root): | |
665 return None | |
666 path = self.real_path | |
667 else: | |
668 path = self.full_path | |
669 return self._clone(root, path[len(root):], self.tainted) | |
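# Illustrative only: a File whose full_path is '/src/out/foo.txt', after | |
# strip_root('/src/'), becomes a clone with root '/src/' and path 'out/foo.txt'. | |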
670 | |
671 def _clone(self, new_root, new_path, tainted): | |
672 raise NotImplementedError(self.__class__.__name__) | |
673 | |
674 class File(_TouchedObject): | |
675 """A file that was accessed. May not be present anymore. | |
676 | |
677 If tainted is true, it means it is not a real path anymore because a variable | |
678 replacement occurred. | |
679 | |
680 If touched_only is True, the file was only probed for existence and found to | |
681 exist, but it was never _opened_. In other words, the file must have | |
682 existed. | |
683 """ | |
684 def __init__(self, root, path, tainted, size): | |
685 super(Results.File, self).__init__(root, path, tainted, size, 1) | |
686 | |
687 def _clone(self, new_root, new_path, tainted): | |
688 """Clones itself keeping meta-data.""" | |
689 # Keep the self.size and self._real_path caches for performance reasons. It | |
690 # is also important when the file becomes tainted (with a variable instead | |
691 # of the real path) since self.path is not an on-disk path anymore so | |
692 # out._size cannot be updated. | |
693 out = self.__class__(new_root, new_path, tainted, self.size) | |
694 out._real_path = self._real_path | |
695 return out | |
696 | |
697 class Directory(_TouchedObject): | |
698 """A directory of files. Must exist.""" | |
699 def __init__(self, root, path, tainted, size, nb_files): | |
700 """path='.' is a valid value and must be handled appropriately.""" | |
701 assert not path.endswith(os.path.sep), path | |
702 super(Results.Directory, self).__init__( | |
703 root, path + os.path.sep, tainted, size, nb_files) | |
704 # For a Directory instance, self.size is not a cache, it's an actual value | |
705 # that is never modified and represents the total size of the files | |
706 # contained in this directory. It is possible that the directory is empty | |
707 # so that size == 0; this happens if there's only an invalid symlink in | |
708 # it. | |
709 | |
710 def flatten(self): | |
711 out = super(Results.Directory, self).flatten() | |
712 out['nb_files'] = self.nb_files | |
713 return out | |
714 | |
715 def _clone(self, new_root, new_path, tainted): | |
716 """Clones itself keeping meta-data.""" | |
717 out = self.__class__( | |
718 new_root, | |
719 new_path.rstrip(os.path.sep), | |
720 tainted, | |
721 self.size, | |
722 self.nb_files) | |
723 out._real_path = self._real_path | |
724 return out | |
725 | |
726 class Process(object): | |
727 """A process that was traced. | |
728 | |
729 Contains references to the files accessed by this process and its children. | |
730 """ | |
731 def __init__(self, pid, files, executable, command, initial_cwd, children): | |
732 logging.debug('Process(%s, %d, ...)' % (pid, len(files))) | |
733 self.pid = pid | |
734 self.files = sorted(files, key=lambda x: x.path) | |
735 self.children = children | |
736 self.executable = executable | |
737 self.command = command | |
738 self.initial_cwd = initial_cwd | |
739 | |
740 # Check internal consistency. | |
741 assert len(set(f.path for f in self.files)) == len(self.files), sorted( | |
742 f.path for f in self.files) | |
743 assert isinstance(self.children, list) | |
744 assert isinstance(self.files, list) | |
745 | |
746 @property | |
747 def all(self): | |
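# Depth-first, post-order traversal: children are yielded before self. | |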
748 for child in self.children: | |
749 for i in child.all: | |
750 yield i | |
751 yield self | |
752 | |
753 def flatten(self): | |
754 return { | |
755 'children': [c.flatten() for c in self.children], | |
756 'command': self.command, | |
757 'executable': self.executable, | |
758 'files': [f.flatten() for f in self.files], | |
759 'initial_cwd': self.initial_cwd, | |
760 'pid': self.pid, | |
761 } | |
762 | |
763 def strip_root(self, root): | |
764 assert isabs(root) and root.endswith(os.path.sep), root | |
765 # Loads the files after since they are constructed as objects. | |
766 out = self.__class__( | |
767 self.pid, | |
768 filter(None, (f.strip_root(root) for f in self.files)), | |
769 self.executable, | |
770 self.command, | |
771 self.initial_cwd, | |
772 [c.strip_root(root) for c in self.children]) | |
773 logging.debug( | |
774 'strip_root(%s) %d -> %d' % (root, len(self.files), len(out.files))) | |
775 return out | |
776 | |
777 def __init__(self, process): | |
778 self.process = process | |
779 # Cache. | |
780 self._files = None | |
781 | |
782 def flatten(self): | |
783 return { | |
784 'root': self.process.flatten(), | |
785 } | |
786 | |
787 @property | |
788 def files(self): | |
789 if self._files is None: | |
790 self._files = sorted( | |
791 sum((p.files for p in self.process.all), []), | |
792 key=lambda x: x.path) | |
793 return self._files | |
794 | |
795 @property | |
796 def existent(self): | |
797 return [f for f in self.files if f.existent] | |
798 | |
799 @property | |
800 def non_existent(self): | |
801 return [f for f in self.files if not f.existent] | |
802 | |
803 def strip_root(self, root): | |
804 """Returns a clone with all the files outside the directory |root| removed | |
805 and converts all the path to be relative paths. | |
806 """ | |
807 # Resolve any symlink | |
808 root = os.path.realpath(root) | |
809 root = get_native_path_case(root).rstrip(os.path.sep) + os.path.sep | |
810 logging.debug('strip_root(%s)' % root) | |
811 return Results(self.process.strip_root(root)) | |
812 | |
813 | |
814 class ApiBase(object): | |
815 """OS-agnostic API to trace a process and its children.""" | |
816 class Context(object): | |
817 """Processes one log line at a time and keeps the list of traced processes. | |
818 | |
819 The parsing is complicated by the fact that logs are traced out of order for | |
820 strace but in order for dtrace and logman. In addition, on Windows it is | |
821 very frequent that process IDs are reused, so a flat list cannot be used. At | |
822 the same time, it is impossible to faithfully construct a graph when the | |
823 logs are processed out of order. So both a tree and a flat mapping are used: | |
824 the tree is the real process tree, while the flat mapping stores the last | |
825 valid process for the corresponding process ID. For the strace case, the | |
826 tree's head is guessed at the last moment. | |
827 """ | |
828 class Process(object): | |
829 """Keeps context for one traced child process. | |
830 | |
831 Logs all the files this process touched. Ignores directories. | |
832 """ | |
833 def __init__(self, blacklist, pid, initial_cwd): | |
834 # Check internal consistency. | |
835 assert isinstance(pid, int), repr(pid) | |
836 self.pid = pid | |
837 # children are Process instances. | |
838 self.children = [] | |
839 self.initial_cwd = initial_cwd | |
840 self.cwd = None | |
841 self.files = set() | |
842 self.only_touched = set() | |
843 self.executable = None | |
844 self.command = None | |
845 self._blacklist = blacklist | |
846 | |
847 def to_results_process(self): | |
848 """Resolves file case sensitivity and or late-bound strings.""" | |
849 # When resolving files, it's normal to get dupe because a file could be | |
850 # opened multiple times with different case. Resolve the deduplication | |
851 # here. | |
852 def fix_path(x): | |
853 """Returns the native file path case. | |
854 | |
855 Converts late-bound strings. | |
856 """ | |
857 if not x: | |
858 # Do not convert None instance to 'None'. | |
859 return x | |
860 # TODO(maruel): Do not upconvert to unicode here, on linux we don't | |
861 # know the file path encoding so they must be treated as bytes. | |
862 x = unicode(x) | |
863 if os.path.isabs(x): | |
864 # If the path is not absolute, which tends to happen occasionally on | |
865 # Windows, it is not possible to get the native path case so ignore | |
866 # that trace. It mostly happens for 'executable' value. | |
867 x = get_native_path_case(x) | |
868 return x | |
869 | |
870 def fix_and_blacklist_path(x): | |
871 x = fix_path(x) | |
872 if not x: | |
873 return | |
874 # The blacklist needs to be reapplied, since path casing could | |
875 # influence blacklisting. | |
876 if self._blacklist(x): | |
877 return | |
878 return x | |
879 | |
880 # Filters out directories. Some may have passed through. | |
881 files = set(f for f in map(fix_and_blacklist_path, self.files) if f) | |
882 only_touched = set( | |
883 f for f in map(fix_and_blacklist_path, self.only_touched) if f) | |
884 only_touched -= files | |
885 | |
886 files = [ | |
887 Results.File(None, f, False, None) for f in files | |
888 if not os.path.isdir(f) | |
889 ] | |
890 # Using 0 as size means the file's content is ignored since the file was | |
891 # never opened for I/O. | |
892 files.extend( | |
893 Results.File(None, f, False, 0) for f in only_touched | |
894 if not os.path.isdir(f) | |
895 ) | |
896 return Results.Process( | |
897 self.pid, | |
898 files, | |
899 fix_path(self.executable), | |
900 self.command, | |
901 fix_path(self.initial_cwd), | |
902 [c.to_results_process() for c in self.children]) | |
903 | |
904 def add_file(self, filepath, touch_only): | |
905 """Adds a file if it passes the blacklist.""" | |
906 if self._blacklist(unicode(filepath)): | |
907 return | |
908 logging.debug('add_file(%d, %s, %s)' % (self.pid, filepath, touch_only)) | |
909 # Note that filepath and not unicode(filepath) is added. It is because | |
910 # filepath could be something other than a string, like a RelativePath | |
911 # instance for dtrace logs. | |
912 if touch_only: | |
913 self.only_touched.add(filepath) | |
914 else: | |
915 self.files.add(filepath) | |
916 | |
917 def __init__(self, blacklist): | |
918 self.blacklist = blacklist | |
919 # Initial process. | |
920 self.root_process = None | |
921 # dict to accelerate process lookup, to avoid having to look up the whole | |
922 # graph each time. | |
923 self._process_lookup = {} | |
924 | |
925 class Tracer(object): | |
926 """During it's lifetime, the tracing subsystem is enabled.""" | |
927 def __init__(self, logname): | |
928 self._logname = logname | |
929 self._lock = threading.Lock() | |
930 self._traces = [] | |
931 self._initialized = True | |
932 | |
933 def trace(self, cmd, cwd, tracename, output): | |
934 """Runs the OS-specific trace program on an executable. | |
935 | |
936 Arguments: | |
937 - cmd: The command (a list) to run. | |
938 - cwd: Current directory to start the child process in. | |
939 - tracename: Name of the trace in the logname file. | |
940 - output: If False, redirects output to PIPEs. | |
941 | |
942 Returns a tuple (resultcode, output) and updates the internal trace | |
943 entries. | |
944 """ | |
945 # The implementation adds an item to self._traces. | |
946 raise NotImplementedError(self.__class__.__name__) | |
947 | |
948 def close(self, _timeout=None): | |
949 """Saves the meta-data in the logname file. | |
950 | |
951 For kernel-based tracing, stops the tracing subsystem. | |
952 | |
953 Must not be used manually when using 'with' construct. | |
954 """ | |
955 with self._lock: | |
956 assert self._initialized | |
957 try: | |
958 data = { | |
959 'traces': self._traces, | |
960 } | |
961 write_json(self._logname, data, False) | |
962 finally: | |
963 self._initialized = False | |
964 | |
965 def post_process_log(self): | |
966 """Post-processes the log so it becomes faster to load afterward. | |
967 | |
968 Must not be used manually when using 'with' construct. | |
969 """ | |
970 assert not self._initialized, 'Must stop tracing first.' | |
971 | |
972 def __enter__(self): | |
973 """Enables 'with' statement.""" | |
974 return self | |
975 | |
976 def __exit__(self, exc_type, exc_value, traceback): | |
977 """Enables 'with' statement.""" | |
978 self.close() | |
979 # If an exception was thrown, do not process logs. | |
980 if not exc_type: | |
981 self.post_process_log() | |
982 | |
983 def get_tracer(self, logname): | |
984 """Returns an ApiBase.Tracer instance. | |
985 | |
986 Initializes the tracing subsystem, which is a requirement for kernel-based | |
987 tracers. Only one tracer instance should be live at a time! | |
988 | |
989 logname is the filepath to the json file that will contain the meta-data | |
990 about the logs. | |
991 """ | |
992 return self.Tracer(logname) | |
993 | |
994 @staticmethod | |
995 def clean_trace(logname): | |
996 """Deletes an old log.""" | |
997 raise NotImplementedError() | |
998 | |
999 @classmethod | |
1000 def parse_log(cls, logname, blacklist): | |
1001 """Processes trace logs and returns the files opened and the files that do | |
1002 not exist. | |
1003 | |
1004 It does not track directories. | |
1005 | |
1006 Most of the time, files that do not exist are temporary test files that | |
1007 should be put in /tmp instead. See http://crbug.com/116251. | |
1008 | |
1009 Returns a list of dict with keys: | |
1010 - results: A Results instance. | |
1011 - trace: The corresponding tracename parameter provided to | |
1012 get_tracer().trace(). | |
1013 - output: Output gathered during execution, if get_tracer().trace(..., | |
1014 output=False) was used. | |
1015 """ | |
1016 raise NotImplementedError(cls.__name__) | |
1017 | |
1018 | |
1019 class Strace(ApiBase): | |
1020 """strace implies linux.""" | |
1021 class Context(ApiBase.Context): | |
1022 """Processes a strace log line and keeps the list of existent and non | |
1023 existent files accessed. | |
1024 | |
1025 Ignores directories. | |
1026 | |
1027 Uses late binding to process the cwd of each process. The problem is that | |
1028 strace generates one log file per process it traced but doesn't give any | |
1029 information about which process was started when and by whom, so we don't | |
1030 even know which process is the initial one. The logs are therefore processed | |
1031 out of order and late binding with RelativePath is used to deduce the | |
1032 initial directory of each process once all the logs are parsed. | |
1033 """ | |
1034 class Process(ApiBase.Context.Process): | |
1035 """Represents the state of a process. | |
1036 | |
1037 Contains all the information retrieved from the pid-specific log. | |
1038 """ | |
1039 # Function names are using ([a-z_0-9]+) | |
1040 # This is the most common format. function(args) = result | |
1041 RE_HEADER = re.compile(r'^([a-z_0-9]+)\((.+?)\)\s+= (.+)$') | |
1042 # An interrupted function call, only grab the minimal header. | |
1043 RE_UNFINISHED = re.compile(r'^([^\(]+)(.*) \<unfinished \.\.\.\>$') | |
1044 # A resumed function call. | |
1045 RE_RESUMED = re.compile(r'^<\.\.\. ([^ ]+) resumed> (.+)$') | |
1046 # A process received a signal. | |
1047 RE_SIGNAL = re.compile(r'^--- SIG[A-Z]+ .+ ---') | |
1048 # A process didn't handle a signal. Ignore any junk appearing before, | |
1049 # because the process was forcibly killed so it won't open any new file. | |
1050 RE_KILLED = re.compile( | |
1051 r'^.*\+\+\+ killed by ([A-Z]+)( \(core dumped\))? \+\+\+$') | |
1052 # The process has exited. | |
1053 RE_PROCESS_EXITED = re.compile(r'^\+\+\+ exited with (\d+) \+\+\+') | |
1054 # A call was canceled. Ignore any prefix. | |
1055 RE_UNAVAILABLE = re.compile(r'^.*\)\s*= \? <unavailable>$') | |
1056 # Happens when strace fails to even get the function name. | |
1057 UNNAMED_FUNCTION = '????' | |
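# Typical strace lines the above patterns are meant to match (illustrative): | |
#   open("/etc/ld.so.cache", O_RDONLY) = 3 | |
#   <... open resumed> )               = 3 | |
#   +++ exited with 0 +++ | |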
1058 | |
1059 # Corner-case in Python: a decorator applied to class member functions must | |
1060 # not itself be a @staticmethod. | |
1061 def parse_args(regexp, expect_zero): # pylint: disable=E0213 | |
1062 """Automatically convert the str 'args' into a list of processed | |
1063 arguments. | |
1064 | |
1065 Arguments: | |
1066 - regexp is used to parse args. | |
1067 - expect_zero: one of True, False or None. | |
1068 - True: will check for result.startswith('0') first and will ignore | |
1069 the trace line completely otherwise. This is important because for | |
1070 many functions, the regexp will not match if the call failed. | |
1071 - False: will check for not result.startswith(('?', '-1')) for the | |
1072 same reason as with True. | |
1073 - None: ignore result. | |
1074 """ | |
1075 def meta_hook(function): | |
1076 assert function.__name__.startswith('handle_') | |
1077 def hook(self, args, result): | |
1078 if expect_zero is True and not result.startswith('0'): | |
1079 return | |
1080 if expect_zero is False and result.startswith(('?', '-1')): | |
1081 return | |
1082 match = re.match(regexp, args) | |
1083 if not match: | |
1084 raise TracingFailure( | |
1085 'Failed to parse %s(%s) = %s' % | |
1086 (function.__name__[len('handle_'):], args, result), | |
1087 None, None, None) | |
1088 return function(self, match.groups(), result) | |
1089 return hook | |
1090 return meta_hook | |
1091 | |
1092 class RelativePath(object): | |
1093 """A late-bound relative path.""" | |
1094 def __init__(self, parent, value): | |
1095 self.parent = parent | |
1096 self.value = value | |
1097 | |
1098 def render(self): | |
1099 """Returns the current directory this instance is representing. | |
1100 | |
1101 This function is used to return the late-bound value. | |
1102 """ | |
1103 if self.value and self.value.startswith(u'/'): | |
1104 # An absolute path. | |
1105 return self.value | |
1106 parent = self.parent.render() if self.parent else u'<None>' | |
1107 if self.value: | |
1108 return os.path.normpath(os.path.join(parent, self.value)) | |
1109 return parent | |
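# Illustrative only: RelativePath(RelativePath(None, u'/home/user'), u'../tmp') | |
# renders as u'/home/tmp', while RelativePath(None, u'/abs/path') renders | |
# unchanged. | |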
1110 | |
1111 def __unicode__(self): | |
1112 """Acts as a string whenever needed.""" | |
1113 return unicode(self.render()) | |
1114 | |
1115 def __str__(self): | |
1116 """Acts as a string whenever needed.""" | |
1117 return str(self.render()) | |
1118 | |
1119 def __init__(self, root, pid): | |
1120 """Keeps enough information to be able to guess the original process | |
1121 root. | |
1122 | |
1123 strace doesn't store which process was the initial process. So more | |
1124 information needs to be kept so the graph can be reconstructed from the | |
1125 flat map. | |
1126 """ | |
1127 logging.info('%s(%d)' % (self.__class__.__name__, pid)) | |
1128 super(Strace.Context.Process, self).__init__(root.blacklist, pid, None) | |
1129 assert isinstance(root, ApiBase.Context) | |
1130 self._root = weakref.ref(root) | |
1131 # The dict key is the function name of the pending call, like 'open' | |
1132 # or 'execve'. | |
1133 self._pending_calls = {} | |
1134 self._line_number = 0 | |
1135 # Current directory when the process started. | |
1136 self.initial_cwd = self.RelativePath(self._root(), None) | |
1137 self.parentid = None | |
1138 | |
1139 def get_cwd(self): | |
1140 """Returns the best known value of cwd.""" | |
1141 return self.cwd or self.initial_cwd | |
1142 | |
1143 def render(self): | |
1144 """Returns the string value of the RelativePath() object. | |
1145 | |
1146 Used by RelativePath. Returns the initial directory and not the current | |
1147 one, since the validity of the current directory 'cwd' is time-limited. | |
1148 | |
1149 The validity is only guaranteed once all the logs are processed. | |
1150 """ | |
1151 return self.initial_cwd.render() | |
1152 | |
1153 def on_line(self, line): | |
1154 self._line_number += 1 | |
1155 if self.RE_SIGNAL.match(line): | |
1156 # Ignore signals. | |
1157 return | |
1158 | |
1159 try: | |
1160 match = self.RE_KILLED.match(line) | |
1161 if match: | |
1162 # Converts a '+++ killed by Foo +++' trace into an exit_group(). | |
1163 self.handle_exit_group(match.group(1), None) | |
1164 return | |
1165 | |
1166 match = self.RE_PROCESS_EXITED.match(line) | |
1167 if match: | |
1168 # Converts a '+++ exited with 1 +++' trace into an exit_group() | |
1169 self.handle_exit_group(match.group(1), None) | |
1170 return | |
1171 | |
1172 match = self.RE_UNFINISHED.match(line) | |
1173 if match: | |
1174 if match.group(1) in self._pending_calls: | |
1175 raise TracingFailure( | |
1176 'Found two unfinished calls for the same function', | |
1177 None, None, None, | |
1178 self._pending_calls) | |
1179 self._pending_calls[match.group(1)] = ( | |
1180 match.group(1) + match.group(2)) | |
1181 return | |
1182 | |
1183 match = self.RE_UNAVAILABLE.match(line) | |
1184 if match: | |
1185 # This usually means a process was killed and a pending call was | |
1186 # canceled. | |
1187 # TODO(maruel): Look up the last exit_group() trace just above and | |
1188 # make sure any self._pending_calls[anything] is properly flushed. | |
1189 return | |
1190 | |
1191 match = self.RE_RESUMED.match(line) | |
1192 if match: | |
1193 if match.group(1) not in self._pending_calls: | |
1194 raise TracingFailure( | |
1195 'Found a resumed call that was not logged as unfinished', | |
1196 None, None, None, | |
1197 self._pending_calls) | |
1198 pending = self._pending_calls.pop(match.group(1)) | |
1199 # Reconstruct the line. | |
1200 line = pending + match.group(2) | |
1201 | |
1202 match = self.RE_HEADER.match(line) | |
1203 if not match: | |
1204 raise TracingFailure( | |
1205 'Found an invalid line: %s' % line, | |
1206 None, None, None) | |
1207 if match.group(1) == self.UNNAMED_FUNCTION: | |
1208 return | |
1209 | |
1210 # It's a valid line, handle it. | |
1211 handler = getattr(self, 'handle_%s' % match.group(1), None) | |
1212 if not handler: | |
1213 self._handle_unknown(match.group(1), match.group(2), match.group(3)) | |
1214 return handler(match.group(2), match.group(3)) | |
1215 except TracingFailure, e: | |
1216 # Hack in the values since the handler could be a static function. | |
1217 e.pid = self.pid | |
1218 e.line = line | |
1219 e.line_number = self._line_number | |
1220 # Re-raise the modified exception. | |
1221 raise | |
1222 except (KeyError, NotImplementedError, ValueError), e: | |
1223 raise TracingFailure( | |
1224 'Trace generated a %s exception: %s' % ( | |
1225 e.__class__.__name__, str(e)), | |
1226 self.pid, | |
1227 self._line_number, | |
1228 line, | |
1229 e) | |
1230 | |
1231 @parse_args(r'^\"(.+?)\", [FKORWX_|]+$', True) | |
1232 def handle_access(self, args, _result): | |
1233 self._handle_file(args[0], True) | |
1234 | |
1235 @parse_args(r'^\"(.+?)\"$', True) | |
1236 def handle_chdir(self, args, _result): | |
1237 """Updates cwd.""" | |
1238 self.cwd = self.RelativePath(self, args[0]) | |
1239 logging.debug('handle_chdir(%d, %s)' % (self.pid, self.cwd)) | |
1240 | |
1241 def handle_clone(self, _args, result): | |
1242 """Transfers cwd.""" | |
1243 if result.startswith(('?', '-1')): | |
1244 # The call failed. | |
1245 return | |
1246 # Update the other process right away. | |
1247 childpid = int(result) | |
1248 child = self._root().get_or_set_proc(childpid) | |
1249 if child.parentid is not None or childpid in self.children: | |
1250 raise TracingFailure( | |
1251 'Found internal inconsistency in process lifetime detection ' | |
1252 'during a clone() call', | |
1253 None, None, None) | |
1254 | |
1255 # Copy the cwd object. | |
1256 child.initial_cwd = self.get_cwd() | |
1257 child.parentid = self.pid | |
1258 # It is necessary because the logs are processed out of order. | |
1259 self.children.append(child) | |
1260 | |
1261 def handle_close(self, _args, _result): | |
1262 pass | |
1263 | |
1264 def handle_chmod(self, _args, _result): | |
1265 pass | |
1266 | |
1267 def handle_creat(self, _args, _result): | |
1268 # Ignore files created, since they didn't need to exist. | |
1269 pass | |
1270 | |
1271 @parse_args(r'^\"(.+?)\", \[(.+)\], \[\/\* \d+ vars? \*\/\]$', True) | |
1272 def handle_execve(self, args, _result): | |
1273 # Even if in practice execve() doesn't return when it succeeds, strace | |
1274 # still prints '0' as the result. | |
1275 filepath = args[0] | |
1276 self._handle_file(filepath, False) | |
1277 self.executable = self.RelativePath(self.get_cwd(), filepath) | |
1278 self.command = strace_process_quoted_arguments(args[1]) | |
1279 | |
1280 def handle_exit_group(self, _args, _result): | |
1281 """Removes cwd.""" | |
1282 self.cwd = None | |
1283 | |
1284 def handle_fork(self, args, result): | |
1285 self._handle_unknown('fork', args, result) | |
1286 | |
1287 def handle_getcwd(self, _args, _result): | |
1288 pass | |
1289 | |
1290 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
1291 def handle_link(self, args, _result): | |
1292 self._handle_file(args[0], False) | |
1293 self._handle_file(args[1], False) | |
1294 | |
1295 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
1296 def handle_lstat(self, args, _result): | |
1297 self._handle_file(args[0], True) | |
1298 | |
1299 def handle_mkdir(self, _args, _result): | |
1300 pass | |
1301 | |
1302 @parse_args(r'^\"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
1303 def handle_open(self, args, _result): | |
1304 if 'O_DIRECTORY' in args[1]: | |
1305 return | |
1306 self._handle_file(args[0], False) | |
1307 | |
1308 @parse_args(r'^(\d+|AT_FDCWD), \"(.*?)\", ([A-Z\_\|]+)(|, \d+)$', False) | |
1309 def handle_openat(self, args, _result): | |
1310 if 'O_DIRECTORY' in args[2]: | |
1311 return | |
1312 if args[0] == 'AT_FDCWD': | |
1313 self._handle_file(args[1], False) | |
1314 else: | |
1315 # TODO(maruel): Implement relative open if necessary instead of the | |
1316 # AT_FDCWD flag, let's hope not since this means tracking all active | |
1317 # directory handles. | |
1318 raise Exception('Relative open via openat not implemented.') | |
1319 | |
1320 @parse_args(r'^\"(.+?)\", \".+?\"(\.\.\.)?, \d+$', False) | |
1321 def handle_readlink(self, args, _result): | |
1322 self._handle_file(args[0], False) | |
1323 | |
1324 @parse_args(r'^\"(.+?)\", \"(.+?)\"$', True) | |
1325 def handle_rename(self, args, _result): | |
1326 self._handle_file(args[0], False) | |
1327 self._handle_file(args[1], False) | |
1328 | |
1329 def handle_rmdir(self, _args, _result): | |
1330 pass | |
1331 | |
1332 def handle_setxattr(self, _args, _result): | |
1333 pass | |
1334 | |
1335 @parse_args(r'\"(.+?)\", \{.+?, \.\.\.\}', True) | |
1336 def handle_stat(self, args, _result): | |
1337 self._handle_file(args[0], True) | |
1338 | |
1339 def handle_symlink(self, _args, _result): | |
1340 pass | |
1341 | |
1342 def handle_unlink(self, _args, _result): | |
1343 # In theory, the file had to be created anyway. | |
1344 pass | |
1345 | |
1346 def handle_statfs(self, _args, _result): | |
1347 pass | |
1348 | |
1349 def handle_vfork(self, args, result): | |
1350 self._handle_unknown('vfork', args, result) | |
1351 | |
1352 @staticmethod | |
1353 def _handle_unknown(function, args, result): | |
1354 raise TracingFailure( | |
1355 'Unexpected/unimplemented trace %s(%s)= %s' % | |
1356 (function, args, result), | |
1357 None, None, None) | |
1358 | |
1359 def _handle_file(self, filepath, touch_only): | |
1360 filepath = self.RelativePath(self.get_cwd(), filepath) | |
1361 #assert not touch_only, unicode(filepath) | |
1362 self.add_file(filepath, touch_only) | |
1363 | |
1364 def __init__(self, blacklist, initial_cwd): | |
1365 super(Strace.Context, self).__init__(blacklist) | |
1366 self.initial_cwd = initial_cwd | |
1367 | |
1368 def render(self): | |
1369 """Returns the string value of the initial cwd of the root process. | |
1370 | |
1371 Used by RelativePath. | |
1372 """ | |
1373 return self.initial_cwd | |
1374 | |
1375 def on_line(self, pid, line): | |
1376 """Transfers control into the Process.on_line() function.""" | |
1377 self.get_or_set_proc(pid).on_line(line.strip()) | |
1378 | |
1379 def to_results(self): | |
1380 """Finds back the root process and verify consistency.""" | |
1381 # TODO(maruel): Absolutely unecessary, fix me. | |
1382 root = [p for p in self._process_lookup.itervalues() if not p.parentid] | |
1383 if len(root) != 1: | |
1384 raise TracingFailure( | |
1385 'Found internal inconsistency in process lifetime detection ' | |
1386 'while finding the root process', | |
1387 None, | |
1388 None, | |
1389 None, | |
1390 sorted(p.pid for p in root)) | |
1391 self.root_process = root[0] | |
1392 process = self.root_process.to_results_process() | |
1393 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
1394 raise TracingFailure( | |
1395 'Found internal inconsistency in process lifetime detection ' | |
1396 'while looking for len(tree) == len(list)', | |
1397 None, | |
1398 None, | |
1399 None, | |
1400 sorted(self._process_lookup), | |
1401 sorted(p.pid for p in process.all)) | |
1402 return Results(process) | |
1403 | |
1404 def get_or_set_proc(self, pid): | |
1405 """Returns the Context.Process instance for this pid or creates a new one. | |
1406 """ | |
1407 if not pid or not isinstance(pid, int): | |
1408 raise TracingFailure( | |
1409 'Unexpected value for pid: %r' % pid, | |
1410 pid, | |
1411 None, | |
1412 None, | |
1413 pid) | |
1414 if pid not in self._process_lookup: | |
1415 self._process_lookup[pid] = self.Process(self, pid) | |
1416 return self._process_lookup[pid] | |
1417 | |
1418 @classmethod | |
1419 def traces(cls): | |
1420 """Returns the list of all handled traces to pass this as an argument to | |
1421 strace. | |
1422 """ | |
1423 prefix = 'handle_' | |
1424 return [i[len(prefix):] for i in dir(cls.Process) if i.startswith(prefix)] | |
1425 | |
1426 class Tracer(ApiBase.Tracer): | |
1427 MAX_LEN = 256 | |
1428 | |
1429 def trace(self, cmd, cwd, tracename, output): | |
1430 """Runs strace on an executable.""" | |
1431 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
1432 assert os.path.isabs(cmd[0]), cmd[0] | |
1433 assert os.path.isabs(cwd), cwd | |
1434 assert os.path.normpath(cwd) == cwd, cwd | |
1435 with self._lock: | |
1436 if not self._initialized: | |
1437 raise TracingFailure( | |
1438 'Called Tracer.trace() on an uninitialized object', | |
1439 None, None, None, tracename) | |
1440 assert tracename not in (i['trace'] for i in self._traces) | |
1441 stdout = stderr = None | |
1442 if output: | |
1443 stdout = subprocess.PIPE | |
1444 stderr = subprocess.STDOUT | |
1445 # Ensure all file related APIs are hooked. | |
1446 traces = ','.join(Strace.Context.traces() + ['file']) | |
1447 trace_cmd = [ | |
1448 'strace', | |
1449 '-ff', | |
1450 '-s', '%d' % self.MAX_LEN, | |
1451 '-e', 'trace=%s' % traces, | |
1452 '-o', self._logname + '.' + tracename, | |
1453 ] | |
1454 child = subprocess.Popen( | |
1455 trace_cmd + cmd, | |
1456 cwd=cwd, | |
1457 stdin=subprocess.PIPE, | |
1458 stdout=stdout, | |
1459 stderr=stderr) | |
1460 out = child.communicate()[0] | |
1461 # TODO(maruel): Walking the logs to figure out the root process would | |
1462 # simplify parsing the logs a *lot*. | |
1463 with self._lock: | |
1464 assert tracename not in (i['trace'] for i in self._traces) | |
1465 self._traces.append( | |
1466 { | |
1467 'cmd': cmd, | |
1468 'cwd': cwd, | |
1469 # The pid of strace process, not very useful. | |
1470 'pid': child.pid, | |
1471 'trace': tracename, | |
1472 'output': out, | |
1473 }) | |
1474 return child.returncode, out | |
1475 | |
1476 @staticmethod | |
1477 def clean_trace(logname): | |
1478 if os.path.isfile(logname): | |
1479 os.remove(logname) | |
1480 # Also delete any pid specific file from previous traces. | |
1481 for i in glob.iglob(logname + '.*'): | |
1482 if i.rsplit('.', 1)[1].isdigit(): | |
1483 os.remove(i) | |
1484 | |
1485 @classmethod | |
1486 def parse_log(cls, logname, blacklist): | |
1487 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | |
1488 data = read_json(logname) | |
1489 out = [] | |
1490 for item in data['traces']: | |
1491 result = { | |
1492 'trace': item['trace'], | |
1493 'output': item['output'], | |
1494 } | |
1495 try: | |
1496 context = cls.Context(blacklist, item['cwd']) | |
1497 for pidfile in glob.iglob('%s.%s.*' % (logname, item['trace'])): | |
1498 pid = pidfile.rsplit('.', 1)[1] | |
1499 if pid.isdigit(): | |
1500 pid = int(pid) | |
1501 # TODO(maruel): Load as utf-8 | |
1502 for line in open(pidfile, 'rb'): | |
1503 context.on_line(pid, line) | |
1504 result['results'] = context.to_results() | |
1505 except TracingFailure, e: | |
1506 result['exception'] = e | |
1507 out.append(result) | |
1508 return out | |
1509 | |
1510 | |
1511 class Dtrace(ApiBase): | |
1512 """Uses DTrace framework through dtrace. Requires root access. | |
1513 | |
1514 Implies Mac OSX. | |
1515 | |
1516 dtruss can't be used because it has compatibility issues with python. | |
1517 | |
1518 Also, the pid->cwd handling needs to be done manually since OSX has no way to | |
1519 get the absolute path of the 'cwd' dtrace variable from the probe. | |
1520 | |
1521 Also, OSX doesn't populate curpsinfo->pr_psargs properly, see | |
1522 https://discussions.apple.com/thread/1980539. So resort to handling execve() | |
1523 manually. | |
1524 | |
1525 errno is not printed in the log since this implementation currently only cares | |
1526 about files that were successfully opened. | |
1527 """ | |
1528 class Context(ApiBase.Context): | |
1529 # Format: index pid function(args) | |
1530 RE_HEADER = re.compile(r'^\d+ (\d+) ([a-zA-Z_\-]+)\((.*?)\)$') | |
1531 | |
1532 # Arguments parsing. | |
1533 RE_DTRACE_BEGIN = re.compile(r'^\"(.+?)\"$') | |
1534 RE_CHDIR = re.compile(r'^\"(.+?)\"$') | |
1535 RE_EXECVE = re.compile(r'^\"(.+?)\", \[(\d+), (.+)\]$') | |
1536 RE_OPEN = re.compile(r'^\"(.+?)\", (0x[0-9a-z]+), (0x[0-9a-z]+)$') | |
1537 RE_PROC_START = re.compile(r'^(\d+), \"(.+?)\", (\d+)$') | |
1538 RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$') | |
1539 | |
1540 O_DIRECTORY = 0x100000 | |
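# Illustrative sample of a dtrace log line parsed by RE_HEADER (and then | |
# RE_OPEN for the arguments): | |
#   0 1234 open("/etc/hosts", 0x0, 0x1b6) | |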
1541 | |
1542 class Process(ApiBase.Context.Process): | |
1543 def __init__(self, *args): | |
1544 super(Dtrace.Context.Process, self).__init__(*args) | |
1545 self.cwd = self.initial_cwd | |
1546 | |
1547 def __init__(self, blacklist, tracer_pid, initial_cwd): | |
1548 logging.info( | |
1549 '%s(%d, %s)' % (self.__class__.__name__, tracer_pid, initial_cwd)) | |
1550 super(Dtrace.Context, self).__init__(blacklist) | |
1551 # Process ID of the temporary script created by create_thunk(). | |
1552 self._tracer_pid = tracer_pid | |
1553 self._initial_cwd = initial_cwd | |
1554 self._line_number = 0 | |
1555 | |
1556 def on_line(self, line): | |
1557 self._line_number += 1 | |
1558 match = self.RE_HEADER.match(line) | |
1559 if not match: | |
1560 raise TracingFailure( | |
1561 'Found malformed line: %s' % line, | |
1562 None, | |
1563 self._line_number, | |
1564 line) | |
1565 fn = getattr( | |
1566 self, | |
1567 'handle_%s' % match.group(2).replace('-', '_'), | |
1568 self._handle_ignored) | |
1569 # It is guaranteed to succeed because of the regexp. Or at least I thought | |
1570 # it would. | |
1571 pid = int(match.group(1)) | |
1572 try: | |
1573 return fn(pid, match.group(3)) | |
1574 except TracingFailure, e: | |
1575 # Hack in the values since the handler could be a static function. | |
1576 e.pid = pid | |
1577 e.line = line | |
1578 e.line_number = self._line_number | |
1579 # Re-raise the modified exception. | |
1580 raise | |
1581 except (KeyError, NotImplementedError, ValueError), e: | |
1582 raise TracingFailure( | |
1583 'Trace generated a %s exception: %s' % ( | |
1584 e.__class__.__name__, str(e)), | |
1585 pid, | |
1586 self._line_number, | |
1587 line, | |
1588 e) | |
1589 | |
1590 def to_results(self): | |
1591 process = self.root_process.to_results_process() | |
1592 # Internal consistency check. | |
1593 if sorted(self._process_lookup) != sorted(p.pid for p in process.all): | |
1594 raise TracingFailure( | |
1595 'Found internal inconsistency in process lifetime detection ' | |
1596 'while looking for len(tree) == len(list)', | |
1597 None, | |
1598 None, | |
1599 None, | |
1600 sorted(self._process_lookup), | |
1601 sorted(p.pid for p in process.all)) | |
1602 return Results(process) | |
1603 | |
1604 def handle_dtrace_BEGIN(self, _pid, args): | |
1605 if not self.RE_DTRACE_BEGIN.match(args): | |
1606 raise TracingFailure( | |
1607 'Found internal inconsistency in dtrace_BEGIN log line', | |
1608 None, None, None) | |
1609 | |
1610 def handle_proc_start(self, pid, args): | |
1611 """Transfers cwd. | |
1612 | |
1613 The dtrace script already takes care of only tracing the processes that | |
1614 are children of the traced processes so there is no need to verify the | |
1615 process hierarchy. | |
1616 """ | |
1617 if pid in self._process_lookup: | |
1618 raise TracingFailure( | |
1619 'Found internal inconsistency in proc_start: %d started twice' % | |
1620 pid, | |
1621 None, None, None) | |
1622 match = self.RE_PROC_START.match(args) | |
1623 if not match: | |
1624 raise TracingFailure( | |
1625 'Failed to parse arguments: %s' % args, | |
1626 None, None, None) | |
1627 ppid = int(match.group(1)) | |
1628 if ppid == self._tracer_pid and not self.root_process: | |
1629 proc = self.root_process = self.Process( | |
1630 self.blacklist, pid, self._initial_cwd) | |
1631 elif ppid in self._process_lookup: | |
1632 proc = self.Process(self.blacklist, pid, self._process_lookup[ppid].cwd) | |
1633 self._process_lookup[ppid].children.append(proc) | |
1634 else: | |
1635 # Another process tree, ignore. | |
1636 return | |
1637 self._process_lookup[pid] = proc | |
1638 logging.debug( | |
1639 'New child: %s -> %d cwd:%s' % | |
1640 (ppid, pid, unicode(proc.initial_cwd))) | |
1641 | |
1642 def handle_proc_exit(self, pid, _args): | |
1643 """Removes cwd.""" | |
1644 if pid in self._process_lookup: | |
1645 # self._tracer_pid is not traced itself and neither are other trace runs. | |
1646 self._process_lookup[pid].cwd = None | |
1647 | |
1648 def handle_execve(self, pid, args): | |
1649 """Sets the process' executable. | |
1650 | |
1651 TODO(maruel): Read command line arguments. See | |
1652 https://discussions.apple.com/thread/1980539 for an example. | |
1653 https://gist.github.com/1242279 | |
1654 | |
1655 Will have to put the answer at http://stackoverflow.com/questions/7556249. | |
1656 :) | |
1657 """ | |
1658 if pid not in self._process_lookup: | |
1659 # Another process tree, ignore. | |
1660 return | |
1661 match = self.RE_EXECVE.match(args) | |
1662 if not match: | |
1663 raise TracingFailure( | |
1664 'Failed to parse arguments: %r' % args, | |
1665 None, None, None) | |
1666 proc = self._process_lookup[pid] | |
1667 proc.executable = match.group(1) | |
1668 proc.command = self.process_escaped_arguments(match.group(3)) | |
1669 if int(match.group(2)) != len(proc.command): | |
1670 raise TracingFailure( | |
1671 'Failed to parse execve() arguments: %s' % args, | |
1672 None, None, None) | |
1673 | |
1674 def handle_chdir(self, pid, args): | |
1675 """Updates cwd.""" | |
1676 if pid not in self._process_lookup: | |
1677 # Another process tree, ignore. | |
1678 return | |
1679 cwd = self.RE_CHDIR.match(args).group(1) | |
1680 if not cwd.startswith('/'): | |
1681 cwd2 = os.path.join(self._process_lookup[pid].cwd, cwd) | |
1682 logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2)) | |
1683 else: | |
1684 logging.debug('handle_chdir(%d, %s)' % (pid, cwd)) | |
1685 cwd2 = cwd | |
1686 self._process_lookup[pid].cwd = cwd2 | |
1687 | |
1688 def handle_open_nocancel(self, pid, args): | |
1689 """Redirects to handle_open().""" | |
1690 return self.handle_open(pid, args) | |
1691 | |
1692 def handle_open(self, pid, args): | |
1693 if pid not in self._process_lookup: | |
1694 # Another process tree, ignore. | |
1695 return | |
1696 match = self.RE_OPEN.match(args) | |
1697 if not match: | |
1698 raise TracingFailure( | |
1699 'Failed to parse arguments: %s' % args, | |
1700 None, None, None) | |
1701 flag = int(match.group(2), 16) | |
1702 if self.O_DIRECTORY & flag == self.O_DIRECTORY: | |
1703 # Ignore directories. | |
1704 return | |
1705 self._handle_file(pid, match.group(1)) | |
1706 | |
1707 def handle_rename(self, pid, args): | |
1708 if pid not in self._process_lookup: | |
1709 # Another process tree, ignore. | |
1710 return | |
1711 match = self.RE_RENAME.match(args) | |
1712 if not match: | |
1713 raise TracingFailure( | |
1714 'Failed to parse arguments: %s' % args, | |
1715 None, None, None) | |
1716 self._handle_file(pid, match.group(1)) | |
1717 self._handle_file(pid, match.group(2)) | |
1718 | |
1719 def _handle_file(self, pid, filepath): | |
1720 if not filepath.startswith('/'): | |
1721 filepath = os.path.join(self._process_lookup[pid].cwd, filepath) | |
1722 # We can get '..' in the path. | |
1723 filepath = os.path.normpath(filepath) | |
1724 # Sadly, still need to filter out directories here; | |
1725 # saw open_nocancel(".", 0, 0) = 0 lines. | |
1726 if os.path.isdir(filepath): | |
1727 return | |
1728 self._process_lookup[pid].add_file(filepath, False) | |
1729 | |
1730 def handle_ftruncate(self, pid, args): | |
1731 """Just used as a signal to kill dtrace, ignoring.""" | |
1732 pass | |
1733 | |
1734 @staticmethod | |
1735 def _handle_ignored(pid, args): | |
1736 """Is called for all the event traces that are not handled.""" | |
1737 raise NotImplementedError('Please implement me') | |
1738 | |
1739 @staticmethod | |
1740 def process_escaped_arguments(text): | |
1741 """Extracts escaped arguments on a string and return the arguments as a | |
1742 list. | |
1743 | |
1744 Implemented as an automaton. | |
1745 | |
1746 Example: | |
1747 With text = '\\001python2.7\\001-c\\001print(\\"hi\\")\\0', the | |
1748 function will return ['python2.7', '-c', 'print("hi")']. | |
1749 """ | |
1750 if not text.endswith('\\0'): | |
1751 raise ValueError('String is not null terminated: %r' % text, text) | |
1752 text = text[:-2] | |
1753 | |
1754 def unescape(x): | |
1755 """Replaces '\\' with '\' and '\?' (where ? is anything) with ?.""" | |
1756 out = [] | |
1757 escaped = False | |
1758 for i in x: | |
1759 if i == '\\' and not escaped: | |
1760 escaped = True | |
1761 continue | |
1762 escaped = False | |
1763 out.append(i) | |
1764 return ''.join(out) | |
1765 | |
1766 return [unescape(i) for i in text.split('\\001')] | |
1767 | |
1768 class Tracer(ApiBase.Tracer): | |
1769 # pylint: disable=C0301 | |
1770 # | |
1771 # To understand the following code, you'll want to take a look at: | |
1772 # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html | |
1773 # https://wikis.oracle.com/display/DTrace/Variables | |
1774 # http://docs.oracle.com/cd/E19205-01/820-4221/ | |
1775 # | |
1776 # 0. Dump all the valid probes into a text file. It is important to | |
1777 # redirect into a file so you don't have to constantly 'sudo'. | |
1778 # $ sudo dtrace -l > probes.txt | |
1779 # | |
1780 # 1. Count the number of probes: | |
1781 # $ wc -l probes.txt | |
1782 # 81823 # On OSX 10.7, including 1 header line. | |
1783 # | |
1784 # 2. List providers, intentionally skipping all the 'syspolicy10925' and the | |
1785 # like, and skipping the header with NR>1: | |
1786 # $ awk 'NR>1 { print $2 }' probes.txt | sort | uniq | grep -v '[[:digit:]]' | |
1787 # dtrace | |
1788 # fbt | |
1789 # io | |
1790 # ip | |
1791 # lockstat | |
1792 # mach_trap | |
1793 # proc | |
1794 # profile | |
1795 # sched | |
1796 # syscall | |
1797 # tcp | |
1798 # vminfo | |
1799 # | |
1800 # 3. List of valid probes: | |
1801 # $ grep syscall probes.txt | less | |
1802 # or use dtrace directly: | |
1803 # $ sudo dtrace -l -P syscall | less | |
1804 # | |
1805 # trackedpid is an associative array where its value can be 0, 1 or 2. | |
1806 # 0 is for untracked processes and is the default value for items not | |
1807 # in the associative array. | |
1808 # 1 is for tracked processes. | |
1809 # 2 is for the script created by create_thunk() only. It is not tracked | |
1810 # itself but all its descendants are. | |
1811 # | |
1812 # The script will kill itself only once waiting_to_die == 1 and | |
1813 # current_processes == 0, i.e. once getlogin() has been called and | |
1814 # all traced processes have exited. | |
1815 # | |
1816 # TODO(maruel): Use cacheable predicates. See | |
1817 # https://wikis.oracle.com/display/DTrace/Performance+Considerations | |
1818 D_CODE = """ | |
1819 dtrace:::BEGIN { | |
1820 waiting_to_die = 0; | |
1821 current_processes = 0; | |
1822 logindex = 0; | |
1823 printf("%d %d %s_%s(\\"%s\\")\\n", | |
1824 logindex, PID, probeprov, probename, SCRIPT); | |
1825 logindex++; | |
1826 } | |
1827 | |
1828 proc:::start /trackedpid[ppid]/ { | |
1829 trackedpid[pid] = 1; | |
1830 current_processes += 1; | |
1831 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
1832 logindex, pid, probeprov, probename, | |
1833 ppid, | |
1834 execname, | |
1835 current_processes); | |
1836 logindex++; | |
1837 } | |
1838 /* Should use SCRIPT but there is no access to this variable at that | |
1839 * point. */ | |
1840 proc:::start /ppid == PID && execname == "Python"/ { | |
1841 trackedpid[pid] = 2; | |
1842 current_processes += 1; | |
1843 printf("%d %d %s_%s(%d, \\"%s\\", %d)\\n", | |
1844 logindex, pid, probeprov, probename, | |
1845 ppid, | |
1846 execname, | |
1847 current_processes); | |
1848 logindex++; | |
1849 } | |
1850 proc:::exit /trackedpid[pid] && | |
1851 current_processes == 1 && | |
1852 waiting_to_die == 1/ { | |
1853 trackedpid[pid] = 0; | |
1854 current_processes -= 1; | |
1855 printf("%d %d %s_%s(%d)\\n", | |
1856 logindex, pid, probeprov, probename, | |
1857 current_processes); | |
1858 logindex++; | |
1859 exit(0); | |
1860 } | |
1861 proc:::exit /trackedpid[pid]/ { | |
1862 trackedpid[pid] = 0; | |
1863 current_processes -= 1; | |
1864 printf("%d %d %s_%s(%d)\\n", | |
1865 logindex, pid, probeprov, probename, | |
1866 current_processes); | |
1867 logindex++; | |
1868 } | |
1869 | |
1870 /* Use an arcane function to detect when we need to die */ | |
1871 syscall::ftruncate:entry /pid == PID && arg0 == FILE_ID/ { | |
1872 waiting_to_die = 1; | |
1873 printf("%d %d %s()\\n", logindex, pid, probefunc); | |
1874 logindex++; | |
1875 } | |
1876 syscall::ftruncate:entry / | |
1877 pid == PID && arg0 == FILE_ID && current_processes == 0/ { | |
1878 exit(0); | |
1879 } | |
1880 | |
1881 syscall::open*:entry /trackedpid[pid] == 1/ { | |
1882 self->open_arg0 = arg0; | |
1883 self->open_arg1 = arg1; | |
1884 self->open_arg2 = arg2; | |
1885 } | |
1886 syscall::open*:return /trackedpid[pid] == 1 && errno == 0/ { | |
1887 this->open_arg0 = copyinstr(self->open_arg0); | |
1888 printf("%d %d %s(\\"%s\\", 0x%x, 0x%x)\\n", | |
1889 logindex, pid, probefunc, | |
1890 this->open_arg0, | |
1891 self->open_arg1, | |
1892 self->open_arg2); | |
1893 logindex++; | |
1894 this->open_arg0 = 0; | |
1895 } | |
1896 syscall::open*:return /trackedpid[pid] == 1/ { | |
1897 self->open_arg0 = 0; | |
1898 self->open_arg1 = 0; | |
1899 self->open_arg2 = 0; | |
1900 } | |
1901 | |
1902 syscall::rename:entry /trackedpid[pid] == 1/ { | |
1903 self->rename_arg0 = arg0; | |
1904 self->rename_arg1 = arg1; | |
1905 } | |
1906 syscall::rename:return /trackedpid[pid] == 1 && errno == 0/ { | |
1907 this->rename_arg0 = copyinstr(self->rename_arg0); | |
1908 this->rename_arg1 = copyinstr(self->rename_arg1); | |
1909 printf("%d %d %s(\\"%s\\", \\"%s\\")\\n", | |
1910 logindex, pid, probefunc, | |
1911 this->rename_arg0, | |
1912 this->rename_arg1); | |
1913 logindex++; | |
1914 this->rename_arg0 = 0; | |
1915 this->rename_arg1 = 0; | |
1916 } | |
1917 syscall::rename:return /trackedpid[pid] == 1/ { | |
1918 self->rename_arg0 = 0; | |
1919 self->rename_arg1 = 0; | |
1920 } | |
1921 | |
1922 /* Track chdir, it's painful because it is only receiving relative path. | |
1923 */ | |
1924 syscall::chdir:entry /trackedpid[pid] == 1/ { | |
1925 self->chdir_arg0 = arg0; | |
1926 } | |
1927 syscall::chdir:return /trackedpid[pid] == 1 && errno == 0/ { | |
1928 this->chdir_arg0 = copyinstr(self->chdir_arg0); | |
1929 printf("%d %d %s(\\"%s\\")\\n", | |
1930 logindex, pid, probefunc, | |
1931 this->chdir_arg0); | |
1932 logindex++; | |
1933 this->chdir_arg0 = 0; | |
1934 } | |
1935 syscall::chdir:return /trackedpid[pid] == 1/ { | |
1936 self->chdir_arg0 = 0; | |
1937 } | |
1938 """ | |
1939 | |
1940 # execve-specific code, tends to throw a lot of exceptions. | |
1941 D_CODE_EXECVE = """ | |
1942 /* Finally what we care about! */ | |
1943 syscall::exec*:entry /trackedpid[pid]/ { | |
1944 self->exec_arg0 = copyinstr(arg0); | |
1945 /* Incrementally probe for a NULL in the argv parameter of execve() to | |
1946 * figure out argc. */ | |
1947 /* TODO(maruel): Skip the remaining copyin() when a NULL pointer was | |
1948 * found. */ | |
1949 self->exec_argc = 0; | |
1950 /* Probe for argc==1 */ | |
1951 this->exec_argv = (user_addr_t*)copyin( | |
1952 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1953 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1954 (self->exec_argc + 1) : self->exec_argc; | |
1955 | |
1956 /* Probe for argc==2 */ | |
1957 this->exec_argv = (user_addr_t*)copyin( | |
1958 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1959 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1960 (self->exec_argc + 1) : self->exec_argc; | |
1961 | |
1962 /* Probe for argc==3 */ | |
1963 this->exec_argv = (user_addr_t*)copyin( | |
1964 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1965 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1966 (self->exec_argc + 1) : self->exec_argc; | |
1967 | |
1968 /* Probe for argc==4 */ | |
1969 this->exec_argv = (user_addr_t*)copyin( | |
1970 arg1, sizeof(user_addr_t) * (self->exec_argc + 1)); | |
1971 self->exec_argc = this->exec_argv[self->exec_argc] ? | |
1972 (self->exec_argc + 1) : self->exec_argc; | |
1973 | |
1974 /* Copy the inputs strings since there is no guarantee they'll be | |
1975 * present after the call completed. */ | |
1976 self->exec_argv0 = (self->exec_argc > 0) ? | |
1977 copyinstr(this->exec_argv[0]) : ""; | |
1978 self->exec_argv1 = (self->exec_argc > 1) ? | |
1979 copyinstr(this->exec_argv[1]) : ""; | |
1980 self->exec_argv2 = (self->exec_argc > 2) ? | |
1981 copyinstr(this->exec_argv[2]) : ""; | |
1982 self->exec_argv3 = (self->exec_argc > 3) ? | |
1983 copyinstr(this->exec_argv[3]) : ""; | |
1984 this->exec_argv = 0; | |
1985 } | |
1986 syscall::exec*:return /trackedpid[pid] && errno == 0/ { | |
1987 /* We need to join strings here, as using multiple printf() would | |
1988 * cause tearing when multiple threads/processes are traced. | |
1989 * Since it is impossible to escape a string and join it to another one, | |
1990 * like sprintf("%s%S", previous, more), use hackery. | |
1991 * The elements are separated with a \\1. \\0 cannot be used because | |
1992 * it is simply ignored. This will conflict with any program putting a | |
1993 * \\1 in their execve() string but this should be "rare enough" */ | |
1994 this->args = ""; | |
1995 /* Process exec_argv[0] */ | |
1996 this->args = strjoin( | |
1997 this->args, (self->exec_argc > 0) ? self->exec_argv0 : ""); | |
1998 | |
1999 /* Process exec_argv[1] */ | |
2000 this->args = strjoin( | |
2001 this->args, (self->exec_argc > 1) ? "\\1" : ""); | |
2002 this->args = strjoin( | |
2003 this->args, (self->exec_argc > 1) ? self->exec_argv1 : ""); | |
2004 | |
2005 /* Process exec_argv[2] */ | |
2006 this->args = strjoin( | |
2007 this->args, (self->exec_argc > 2) ? "\\1" : ""); | |
2008 this->args = strjoin( | |
2009 this->args, (self->exec_argc > 2) ? self->exec_argv2 : ""); | |
2010 | |
2011 /* Process exec_argv[3] */ | |
2012 this->args = strjoin( | |
2013 this->args, (self->exec_argc > 3) ? "\\1" : ""); | |
2014 this->args = strjoin( | |
2015 this->args, (self->exec_argc > 3) ? self->exec_argv3 : ""); | |
2016 | |
2017 /* Prints self->exec_argc to permit verifying the internal | |
2018 * consistency since this code is quite fishy. */ | |
2019 printf("%d %d %s(\\"%s\\", [%d, %S])\\n", | |
2020 logindex, pid, probefunc, | |
2021 self->exec_arg0, | |
2022 self->exec_argc, | |
2023 this->args); | |
2024 logindex++; | |
2025 this->args = 0; | |
2026 } | |
2027 syscall::exec*:return /trackedpid[pid]/ { | |
2028 self->exec_arg0 = 0; | |
2029 self->exec_argc = 0; | |
2030 self->exec_argv0 = 0; | |
2031 self->exec_argv1 = 0; | |
2032 self->exec_argv2 = 0; | |
2033 self->exec_argv3 = 0; | |
2034 } | |
2035 """ | |
2036 | |
2037 # Code currently not used. | |
2038 D_EXTRANEOUS = """ | |
2039 /* This is a good learning experience, since it traces a lot of things | |
2040 * related to the process and child processes. | |
2041 * Warning: it generates a gigantic log. For example, tracing | |
2042 * "data/trace_inputs/child1.py --child" generates a 2mb log and takes | |
2043 * several minutes to execute. | |
2044 */ | |
2045 /* | |
2046 mach_trap::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2047 printf("%d %d %s_%s() = %d\\n", | |
2048 logindex, pid, probeprov, probefunc, errno); | |
2049 logindex++; | |
2050 } | |
2051 proc::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2052 printf("%d %d %s_%s() = %d\\n", | |
2053 logindex, pid, probeprov, probefunc, errno); | |
2054 logindex++; | |
2055 } | |
2056 sched::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2057 printf("%d %d %s_%s() = %d\\n", | |
2058 logindex, pid, probeprov, probefunc, errno); | |
2059 logindex++; | |
2060 } | |
2061 syscall::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2062 printf("%d %d %s_%s() = %d\\n", | |
2063 logindex, pid, probeprov, probefunc, errno); | |
2064 logindex++; | |
2065 } | |
2066 vminfo::: /trackedpid[pid] == 1 || trackedpid[ppid]/ { | |
2067 printf("%d %d %s_%s() = %d\\n", | |
2068 logindex, pid, probeprov, probefunc, errno); | |
2069 logindex++; | |
2070 } | |
2071 */ | |
2072 /* TODO(maruel): *stat* functions and friends | |
2073 syscall::access:return, | |
2074 syscall::chdir:return, | |
2075 syscall::chflags:return, | |
2076 syscall::chown:return, | |
2077 syscall::chroot:return, | |
2078 syscall::getattrlist:return, | |
2079 syscall::getxattr:return, | |
2080 syscall::lchown:return, | |
2081 syscall::lstat64:return, | |
2082 syscall::lstat:return, | |
2083 syscall::mkdir:return, | |
2084 syscall::pathconf:return, | |
2085 syscall::readlink:return, | |
2086 syscall::removexattr:return, | |
2087 syscall::setxattr:return, | |
2088 syscall::stat64:return, | |
2089 syscall::stat:return, | |
2090 syscall::truncate:return, | |
2091 syscall::unlink:return, | |
2092 syscall::utimes:return, | |
2093 */ | |
2094 """ | |
2095 | |
2096 def __init__(self, logname): | |
2097 """Starts the log collection with dtrace. | |
2098 | |
2099 Requires root access or chmod 4555 on dtrace. dtrace is asynchronous so | |
2100 this needs to wait for dtrace to be "warmed up". | |
2101 """ | |
2102 super(Dtrace.Tracer, self).__init__(logname) | |
2103 self._script = create_thunk() | |
2104 # This unique dummy temp file is used to signal the dtrace script that it | |
2105 # should stop as soon as all the child processes are done. A bit hackish | |
2106 # but it works well enough. | |
2107 self._dummy_file_id, self._dummy_file_name = tempfile.mkstemp( | |
2108 prefix='trace_signal_file') | |
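# The D code watches for ftruncate(FILE_ID) issued by this process (see the
# syscall::ftruncate probes in D_CODE); close() performs that ftruncate() to
# tell dtrace it can exit once all traced processes are done.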
2109 | |
2110 # Note: do not use the -p flag. It's useless if the initial process quits | |
2111 # too fast, resulting in missing traces from the grand-children. The D | |
2112 # code manages the dtrace lifetime itself. | |
2113 trace_cmd = [ | |
2114 'sudo', | |
2115 'dtrace', | |
2116 # Use a larger buffer if getting 'out of scratch space' errors. | |
2117 # Ref: https://wikis.oracle.com/display/DTrace/Options+and+Tunables | |
2118 '-b', '10m', | |
2119 '-x', 'dynvarsize=10m', | |
2120 #'-x', 'dtrace_global_maxsize=1m', | |
2121 '-x', 'evaltime=exec', | |
2122 '-o', '/dev/stderr', | |
2123 '-q', | |
2124 '-n', self._get_dtrace_code(), | |
2125 ] | |
2126 with open(self._logname + '.log', 'wb') as logfile: | |
2127 self._dtrace = subprocess.Popen( | |
2128 trace_cmd, stdout=logfile, stderr=subprocess.STDOUT) | |
2129 logging.debug('Started dtrace pid: %d' % self._dtrace.pid) | |
2130 | |
2131 # Reads until one line is printed, which signifies dtrace is up and ready. | |
2132 with open(self._logname + '.log', 'rb') as logfile: | |
2133 while 'dtrace_BEGIN' not in logfile.readline(): | |
2134 if self._dtrace.poll() is not None: | |
2135 # Do a busy wait. :/ | |
2136 break | |
2137 logging.debug('dtrace started') | |
2138 | |
2139 def _get_dtrace_code(self): | |
2140 """Setups the D code to implement child process tracking. | |
2141 | |
2142 Injects the cookie in the script so it knows when to stop. | |
2143 | |
2144 The script will detect any instance of the script created with | |
2145 create_thunk() and will start tracing it. | |
2146 """ | |
2147 return ( | |
2148 'inline int PID = %d;\n' | |
2149 'inline string SCRIPT = "%s";\n' | |
2150 'inline int FILE_ID = %d;\n' | |
2151 '\n' | |
2152 '%s') % ( | |
2153 os.getpid(), | |
2154 self._script, | |
2155 self._dummy_file_id, | |
2156 self.D_CODE) + self.D_CODE_EXECVE | |
2157 | |
2158 def trace(self, cmd, cwd, tracename, output): | |
2159 """Runs dtrace on an executable. | |
2160 | |
2161 Since dtruss is broken when it starts the process itself or when tracing | |
2162 child processes, this code starts a wrapper process | |
2163 generated with create_thunk(), which starts the executable to trace. | |
2164 """ | |
2165 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
2166 assert os.path.isabs(cmd[0]), cmd[0] | |
2167 assert os.path.isabs(cwd), cwd | |
2168 assert os.path.normpath(cwd) == cwd, cwd | |
2169 with self._lock: | |
2170 if not self._initialized: | |
2171 raise TracingFailure( | |
2172 'Called Tracer.trace() on an uninitialized object', | |
2173 None, None, None, tracename) | |
2174 assert tracename not in (i['trace'] for i in self._traces) | |
2175 | |
2176 # Starts the script wrapper to start the child process. This signals the | |
2177 # dtrace script that this process is to be traced. | |
2178 stdout = stderr = None | |
2179 if output: | |
2180 stdout = subprocess.PIPE | |
2181 stderr = subprocess.STDOUT | |
2182 child_cmd = [ | |
2183 sys.executable, | |
2184 self._script, | |
2185 tracename, | |
2186 ] | |
2187 # Call a dummy function so that dtrace knows I'm about to launch a process | |
2188 # that needs to be traced. | |
2189 # Yummy. | |
2190 child = subprocess.Popen( | |
2191 child_cmd + fix_python_path(cmd), | |
2192 stdin=subprocess.PIPE, | |
2193 stdout=stdout, | |
2194 stderr=stderr, | |
2195 cwd=cwd) | |
2196 logging.debug('Started child pid: %d' % child.pid) | |
2197 | |
2198 out = child.communicate()[0] | |
2199 # This doesn't mean tracing is done; one of the grand-child processes may | |
2200 # still be alive. It will be tracked with the dtrace script. | |
2201 | |
2202 with self._lock: | |
2203 assert tracename not in (i['trace'] for i in self._traces) | |
2204 self._traces.append( | |
2205 { | |
2206 'cmd': cmd, | |
2207 'cwd': cwd, | |
2208 # The pid of the wrapper script process, not very useful. | |
2209 'pid': child.pid, | |
2210 'trace': tracename, | |
2211 'output': out, | |
2212 }) | |
2213 return child.returncode, out | |
2214 | |
2215 def close(self, timeout=None): | |
2216 """Terminates dtrace.""" | |
2217 logging.debug('close(%s)' % timeout) | |
2218 try: | |
2219 try: | |
2220 super(Dtrace.Tracer, self).close(timeout) | |
2221 # Signal dtrace that it should stop now. | |
2222 os.ftruncate(self._dummy_file_id, 0) | |
2223 if timeout: | |
2224 start = time.time() | |
2225 # Use polling. :/ | |
2226 while (self._dtrace.poll() is None and | |
2227 (time.time() - start) < timeout): | |
2228 time.sleep(0.1) | |
2229 self._dtrace.kill() | |
2230 self._dtrace.wait() | |
2231 finally: | |
2232 # Make sure to kill it in any case. | |
2233 if self._dtrace.poll() is None: | |
2234 try: | |
2235 self._dtrace.kill() | |
2236 self._dtrace.wait() | |
2237 except OSError: | |
2238 pass | |
2239 | |
2240 if self._dtrace.returncode != 0: | |
2241 # Warn about any dtrace failure but basically ignore it. | |
2242 print 'dtrace failure: %s' % self._dtrace.returncode | |
2243 finally: | |
2244 os.close(self._dummy_file_id) | |
2245 os.remove(self._dummy_file_name) | |
2246 os.remove(self._script) | |
2247 | |
2248 def post_process_log(self): | |
2249 """Sorts the log back in order when each call occured. | |
2250 | |
2251 dtrace doesn't save the buffer in strict order since it keeps one buffer | |
2252 per CPU. | |
2253 """ | |
2254 super(Dtrace.Tracer, self).post_process_log() | |
2255 logname = self._logname + '.log' | |
2256 with open(logname, 'rb') as logfile: | |
2257 lines = [l for l in logfile if l.strip()] | |
2258 errors = [l for l in lines if l.startswith('dtrace:')] | |
2259 if errors: | |
2260 raise TracingFailure( | |
2261 'Found errors in the trace: %s' % '\n'.join(errors), | |
2262 None, None, None, logname) | |
2263 try: | |
2264 lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) | |
2265 except ValueError: | |
2266 raise TracingFailure( | |
2267 'Found errors in the trace: %s' % '\n'.join( | |
2268 l for l in lines if l.split(' ', 1)[0].isdigit()), | |
2269 None, None, None, logname) | |
2270 with open(logname, 'wb') as logfile: | |
2271 logfile.write(''.join(lines)) | |
2272 | |
2273 @staticmethod | |
2274 def clean_trace(logname): | |
2275 for ext in ('', '.log'): | |
2276 if os.path.isfile(logname + ext): | |
2277 os.remove(logname + ext) | |
2278 | |
2279 @classmethod | |
2280 def parse_log(cls, logname, blacklist): | |
2281 logging.info('parse_log(%s, ...)' % logname) | |
2282 | |
2283 def blacklist_more(filepath): | |
2284 # All the HFS metadata is in the form /.vol/... | |
2285 return blacklist(filepath) or re.match(r'^\/\.vol\/.+$', filepath) | |
2286 | |
2287 data = read_json(logname) | |
2288 out = [] | |
2289 for item in data['traces']: | |
2290 context = cls.Context(blacklist_more, item['pid'], item['cwd']) | |
2291 for line in open(logname + '.log', 'rb'): | |
2292 context.on_line(line) | |
2293 out.append( | |
2294 { | |
2295 'results': context.to_results(), | |
2296 'trace': item['trace'], | |
2297 'output': item['output'], | |
2298 }) | |
2299 return out | |
2300 | |
2301 | |
2302 class LogmanTrace(ApiBase): | |
2303 """Uses the native Windows ETW based tracing functionality to trace a child | |
2304 process. | |
2305 | |
2306 Caveat: this implementation doesn't track cwd or initial_cwd. This is because | |
2307 the Windows Kernel doesn't have a concept of 'current working directory' at | |
2308 all. A Win32 process has a map of current directories, one per drive letter | |
2309 and it is managed by the user mode kernel32.dll. In kernel, a file is always | |
2310 opened relative to another file_object or as an absolute path. All the current | |
2311 working directory logic is done in user mode. | |
2312 """ | |
2313 class Context(ApiBase.Context): | |
2314 """Processes a ETW log line and keeps the list of existent and non | |
2315 existent files accessed. | |
2316 | |
2317 Ignores directories. | |
2318 """ | |
2319 # These indexes are for the stripped version in json. | |
2320 EVENT_NAME = 0 | |
2321 TYPE = 1 | |
2322 PID = 2 | |
2323 TID = 3 | |
2324 PROCESSOR_ID = 4 | |
2325 TIMESTAMP = 5 | |
2326 USER_DATA = 6 | |
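# After Tracer.post_process_log() trims the CSV, each json line is
# [event name, type, pid (as int), tid, processor id, timestamp] followed by
# the event-specific 'User Data' columns, hence the indexes above.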
2327 | |
2328 class Process(ApiBase.Context.Process): | |
2329 def __init__(self, *args): | |
2330 super(LogmanTrace.Context.Process, self).__init__(*args) | |
2331 # Handle file objects that succeeded. | |
2332 self.file_objects = {} | |
2333 | |
2334 def __init__(self, blacklist, tracer_pid): | |
2335 logging.info('%s(%d)' % (self.__class__.__name__, tracer_pid)) | |
2336 super(LogmanTrace.Context, self).__init__(blacklist) | |
2337 self._drive_map = DosDriveMap() | |
2338 # Mapping of thread ids to their owning process id. | |
2339 self._threads_active = {} | |
2340 # Process ID of the tracer, i.e. the wrapper script created by create_thunk() | |
2341 self._tracer_pid = tracer_pid | |
2342 self._line_number = 0 | |
2343 | |
2344 def on_line(self, line): | |
2345 """Processes a json Event line.""" | |
2346 self._line_number += 1 | |
2347 try: | |
2348 # By Opcode | |
2349 handler = getattr( | |
2350 self, | |
2351 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]), | |
2352 None) | |
2353 if not handler: | |
2354 raise TracingFailure( | |
2355 'Unexpected event %s_%s' % ( | |
2356 line[self.EVENT_NAME], line[self.TYPE]), | |
2357 None, None, None) | |
2358 handler(line) | |
2359 except TracingFailure, e: | |
2360 # Hack in the values since the handler could be a static function. | |
2361 e.pid = line[self.PID] | |
2362 e.line = line | |
2363 e.line_number = self._line_number | |
2364 # Re-raise the modified exception. | |
2365 raise | |
2366 except (KeyError, NotImplementedError, ValueError), e: | |
2367 raise TracingFailure( | |
2368 'Trace generated a %s exception: %s' % ( | |
2369 e.__class__.__name__, str(e)), | |
2370 line[self.PID], | |
2371 self._line_number, | |
2372 line, | |
2373 e) | |
2374 | |
2375 def to_results(self): | |
2376 if not self.root_process: | |
2377 raise TracingFailure( | |
2378 'Failed to detect the initial process', | |
2379 None, None, None) | |
2380 process = self.root_process.to_results_process() | |
2381 return Results(process) | |
2382 | |
2383 def _thread_to_process(self, tid): | |
2384 """Finds the process from the thread id.""" | |
2385 tid = int(tid, 16) | |
2386 pid = self._threads_active.get(tid) | |
2387 if not pid or not self._process_lookup.get(pid): | |
2388 return | |
2389 return self._process_lookup[pid] | |
2390 | |
2391 @classmethod | |
2392 def handle_EventTrace_Header(cls, line): | |
2393 """Verifies no event was dropped, e.g. no buffer overrun occured.""" | |
2394 BUFFER_SIZE = cls.USER_DATA | |
2395 #VERSION = cls.USER_DATA + 1 | |
2396 #PROVIDER_VERSION = cls.USER_DATA + 2 | |
2397 #NUMBER_OF_PROCESSORS = cls.USER_DATA + 3 | |
2398 #END_TIME = cls.USER_DATA + 4 | |
2399 #TIMER_RESOLUTION = cls.USER_DATA + 5 | |
2400 #MAX_FILE_SIZE = cls.USER_DATA + 6 | |
2401 #LOG_FILE_MODE = cls.USER_DATA + 7 | |
2402 #BUFFERS_WRITTEN = cls.USER_DATA + 8 | |
2403 #START_BUFFERS = cls.USER_DATA + 9 | |
2404 #POINTER_SIZE = cls.USER_DATA + 10 | |
2405 EVENTS_LOST = cls.USER_DATA + 11 | |
2406 #CPU_SPEED = cls.USER_DATA + 12 | |
2407 #LOGGER_NAME = cls.USER_DATA + 13 | |
2408 #LOG_FILE_NAME = cls.USER_DATA + 14 | |
2409 #BOOT_TIME = cls.USER_DATA + 15 | |
2410 #PERF_FREQ = cls.USER_DATA + 16 | |
2411 #START_TIME = cls.USER_DATA + 17 | |
2412 #RESERVED_FLAGS = cls.USER_DATA + 18 | |
2413 #BUFFERS_LOST = cls.USER_DATA + 19 | |
2414 #SESSION_NAME_STRING = cls.USER_DATA + 20 | |
2415 #LOG_FILE_NAME_STRING = cls.USER_DATA + 21 | |
2416 if line[EVENTS_LOST] != '0': | |
2417 raise TracingFailure( | |
2418 ( '%s events were lost during trace, please increase the buffer ' | |
2419 'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]), | |
2420 None, None, None) | |
2421 | |
2422 def handle_FileIo_Cleanup(self, line): | |
2423 """General wisdom: if a file is closed, it's because it was opened. | |
2424 | |
2425 Note that FileIo_Close is not used since if a file was opened properly but | |
2426 not closed before the process exits, only Cleanup will be logged. | |
2427 """ | |
2428 #IRP = self.USER_DATA | |
2429 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
2430 FILE_OBJECT = self.USER_DATA + 2 | |
2431 #FILE_KEY = self.USER_DATA + 3 | |
2432 proc = self._thread_to_process(line[TTID]) | |
2433 if not proc: | |
2434 # Not a process we care about. | |
2435 return | |
2436 file_object = line[FILE_OBJECT] | |
2437 if file_object in proc.file_objects: | |
2438 proc.add_file(proc.file_objects.pop(file_object), False) | |
2439 | |
2440 def handle_FileIo_Create(self, line): | |
2441 """Handles a file open. | |
2442 | |
2443 All FileIo events are described at | |
2444 http://msdn.microsoft.com/library/windows/desktop/aa363884.aspx | |
2445 for some value of 'description'. | |
2446 | |
2447 " (..) process and thread id values of the IO events (..) are not valid " | |
2448 http://msdn.microsoft.com/magazine/ee358703.aspx | |
2449 | |
2450 The FileIo.Create event doesn't indicate whether the CreateFile() call | |
2451 succeeded, so keep track of the file_object and check that it is | |
2452 eventually closed with FileIo_Cleanup. | |
2453 """ | |
2454 #IRP = self.USER_DATA | |
2455 TTID = self.USER_DATA + 1 # Thread ID, that's what we want. | |
2456 FILE_OBJECT = self.USER_DATA + 2 | |
2457 #CREATE_OPTIONS = self.USER_DATA + 3 | |
2458 #FILE_ATTRIBUTES = self.USER_DATA + 4 | |
2459 #SHARE_ACCESS = self.USER_DATA + 5 | |
2460 OPEN_PATH = self.USER_DATA + 6 | |
2461 | |
2462 proc = self._thread_to_process(line[TTID]) | |
2463 if not proc: | |
2464 # Not a process we care about. | |
2465 return | |
2466 | |
2467 match = re.match(r'^\"(.+)\"$', line[OPEN_PATH]) | |
2468 raw_path = match.group(1) | |
2469 # Ignore directories and bare drive right away. | |
2470 if raw_path.endswith(os.path.sep): | |
2471 return | |
2472 filepath = self._drive_map.to_win32(raw_path) | |
2473 # Ignore bare drive right away. Some may still fall through with format | |
2474 # like '\\?\X:' | |
2475 if len(filepath) == 2: | |
2476 return | |
2477 file_object = line[FILE_OBJECT] | |
2478 if os.path.isdir(filepath): | |
2479 # There is no O_DIRECTORY equivalent on Windows. The closest is | |
2480 # FILE_FLAG_BACKUP_SEMANTICS but it's not exactly right either. So | |
2481 # simply discard directories as they are found. | |
2482 return | |
2483 # Override any stale file object | |
2484 proc.file_objects[file_object] = filepath | |
2485 | |
2486 def handle_FileIo_Rename(self, line): | |
2487 # TODO(maruel): Handle? | |
2488 pass | |
2489 | |
2490 def handle_Process_End(self, line): | |
2491 pid = line[self.PID] | |
2492 if self._process_lookup.get(pid): | |
2493 logging.info('Terminated: %d' % pid) | |
2494 self._process_lookup[pid] = None | |
2495 else: | |
2496 logging.debug('Terminated: %d' % pid) | |
2497 | |
2498 def handle_Process_Start(self, line): | |
2499 """Handles a new child process started by PID.""" | |
2500 #UNIQUE_PROCESS_KEY = self.USER_DATA | |
2501 PROCESS_ID = self.USER_DATA + 1 | |
2502 #PARENT_PID = self.USER_DATA + 2 | |
2503 #SESSION_ID = self.USER_DATA + 3 | |
2504 #EXIT_STATUS = self.USER_DATA + 4 | |
2505 #DIRECTORY_TABLE_BASE = self.USER_DATA + 5 | |
2506 #USER_SID = self.USER_DATA + 6 | |
2507 IMAGE_FILE_NAME = self.USER_DATA + 7 | |
2508 COMMAND_LINE = self.USER_DATA + 8 | |
2509 | |
2510 ppid = line[self.PID] | |
2511 pid = int(line[PROCESS_ID], 16) | |
2512 logging.debug( | |
2513 'New process %d->%d (%s) %s' % | |
2514 (ppid, pid, line[IMAGE_FILE_NAME], line[COMMAND_LINE])) | |
2515 | |
2516 if ppid == self._tracer_pid: | |
2517 # Need to ignore processes we don't know about because the log is | |
2518 # system-wide. self._tracer_pid shall start only one process. | |
2519 if self.root_process: | |
2520 raise TracingFailure( | |
2521 ( 'Parent process is _tracer_pid(%d) but root_process(%d) is ' | |
2522 'already set') % (self._tracer_pid, self.root_process.pid), | |
2523 None, None, None) | |
2524 proc = self.Process(self.blacklist, pid, None) | |
2525 self.root_process = proc | |
2526 ppid = None | |
2527 elif self._process_lookup.get(ppid): | |
2528 proc = self.Process(self.blacklist, pid, None) | |
2529 self._process_lookup[ppid].children.append(proc) | |
2530 else: | |
2531 # Ignore | |
2532 return | |
2533 self._process_lookup[pid] = proc | |
2534 | |
2535 if (not line[IMAGE_FILE_NAME].startswith('"') or | |
2536 not line[IMAGE_FILE_NAME].endswith('"')): | |
2537 raise TracingFailure( | |
2538 'Command line is not properly quoted: %s' % line[IMAGE_FILE_NAME], | |
2539 None, None, None) | |
2540 | |
2541 # TODO(maruel): Process escapes. | |
2542 if (not line[COMMAND_LINE].startswith('"') or | |
2543 not line[COMMAND_LINE].endswith('"')): | |
2544 raise TracingFailure( | |
2545 'Command line is not properly quoted: %s' % line[COMMAND_LINE], | |
2546 None, None, None) | |
2547 proc.command = CommandLineToArgvW(line[COMMAND_LINE][1:-1]) | |
2548 proc.executable = line[IMAGE_FILE_NAME][1:-1] | |
2549 # proc.command[0] may be the absolute path of 'executable' but it may be | |
2550 # anything else too. If it happens that command[0] ends with executable, | |
2551 # use it, otherwise default to the base name. | |
2552 cmd0 = proc.command[0].lower() | |
2553 if not cmd0.endswith('.exe'): | |
2554 # TODO(maruel): That's not strictly true either. | |
2555 cmd0 += '.exe' | |
2556 if cmd0.endswith(proc.executable) and os.path.isfile(cmd0): | |
2557 # Fix the path. | |
2558 cmd0 = cmd0.replace('/', os.path.sep) | |
2559 cmd0 = os.path.normpath(cmd0) | |
2560 proc.executable = get_native_path_case(cmd0) | |
2561 logging.info( | |
2562 'New child: %s -> %d %s' % (ppid, pid, proc.executable)) | |
2563 | |
2564 def handle_Thread_End(self, line): | |
2565 """Has the same parameters as Thread_Start.""" | |
2566 tid = int(line[self.TID], 16) | |
2567 self._threads_active.pop(tid, None) | |
2568 | |
2569 def handle_Thread_Start(self, line): | |
2570 """Handles a new thread created. | |
2571 | |
2572 Do not use self.PID here since a process' initial thread is created by | |
2573 the parent process. | |
2574 """ | |
2575 PROCESS_ID = self.USER_DATA | |
2576 TTHREAD_ID = self.USER_DATA + 1 | |
2577 #STACK_BASE = self.USER_DATA + 2 | |
2578 #STACK_LIMIT = self.USER_DATA + 3 | |
2579 #USER_STACK_BASE = self.USER_DATA + 4 | |
2580 #USER_STACK_LIMIT = self.USER_DATA + 5 | |
2581 #AFFINITY = self.USER_DATA + 6 | |
2582 #WIN32_START_ADDR = self.USER_DATA + 7 | |
2583 #TEB_BASE = self.USER_DATA + 8 | |
2584 #SUB_PROCESS_TAG = self.USER_DATA + 9 | |
2585 #BASE_PRIORITY = self.USER_DATA + 10 | |
2586 #PAGE_PRIORITY = self.USER_DATA + 11 | |
2587 #IO_PRIORITY = self.USER_DATA + 12 | |
2588 #THREAD_FLAGS = self.USER_DATA + 13 | |
2589 # Do not use self.PID here since a process' initial thread is created by | |
2590 # the parent process. | |
2591 pid = int(line[PROCESS_ID], 16) | |
2592 tid = int(line[TTHREAD_ID], 16) | |
2593 logging.debug('New thread pid:%d, tid:%d' % (pid, tid)) | |
2594 self._threads_active[tid] = pid | |
2595 | |
2596 @classmethod | |
2597 def supported_events(cls): | |
2598 """Returns all the procesed events.""" | |
2599 out = [] | |
2600 for member in dir(cls): | |
2601 match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member) | |
2602 if match: | |
2603 out.append(match.groups()) | |
2604 return out | |
2605 | |
2606 class Tracer(ApiBase.Tracer): | |
2607 # The basic headers. | |
2608 EXPECTED_HEADER = [ | |
2609 u'Event Name', | |
2610 u'Type', | |
2611 u'Event ID', | |
2612 u'Version', | |
2613 u'Channel', | |
2614 u'Level', # 5 | |
2615 u'Opcode', | |
2616 u'Task', | |
2617 u'Keyword', | |
2618 u'PID', | |
2619 u'TID', # 10 | |
2620 u'Processor Number', | |
2621 u'Instance ID', | |
2622 u'Parent Instance ID', | |
2623 u'Activity ID', | |
2624 u'Related Activity ID', # 15 | |
2625 u'Clock-Time', | |
2626 u'Kernel(ms)', # Both have a resolution of ~15ms which makes them | |
2627 u'User(ms)', # pretty much useless. | |
2628 u'User Data', # Extra arguments that are event-specific. | |
2629 ] | |
2630 # Only the useful headers common to all entries are listed there. Any column | |
2631 # at 19 or higher is dependent on the specific event. | |
2632 EVENT_NAME = 0 | |
2633 TYPE = 1 | |
2634 PID = 9 | |
2635 TID = 10 | |
2636 PROCESSOR_ID = 11 | |
2637 TIMESTAMP = 16 | |
2638 NULL_GUID = '{00000000-0000-0000-0000-000000000000}' | |
2639 USER_DATA = 19 | |
2640 | |
2641 def __init__(self, logname): | |
2642 """Starts the log collection. | |
2643 | |
2644 Requires administrative access. logman.exe is synchronous so no need for a | |
2645 "warmup" call. 'Windows Kernel Trace' is *localized* so use its GUID | |
2646 instead. The GUID constant name is SystemTraceControlGuid. Lovely. | |
2647 | |
2648 One can get the list of potentially interesting providers with: | |
2649 "logman query providers | findstr /i file" | |
2650 """ | |
2651 super(LogmanTrace.Tracer, self).__init__(logname) | |
2652 self._script = create_thunk() | |
2653 cmd_start = [ | |
2654 'logman.exe', | |
2655 'start', | |
2656 'NT Kernel Logger', | |
2657 '-p', '{9e814aad-3204-11d2-9a82-006008a86939}', | |
2658 # splitio,fileiocompletion,syscall,file,cswitch,img | |
2659 '(process,fileio,thread)', | |
2660 '-o', self._logname + '.etl', | |
2661 '-ets', # Send directly to kernel | |
2662 # Values extracted out of thin air. | |
2663 # Event Trace Session buffer size in kb. | |
2664 '-bs', '10240', | |
2665 # Number of Event Trace Session buffers. | |
2666 '-nb', '16', '256', | |
2667 ] | |
2668 logging.debug('Running: %s' % cmd_start) | |
2669 try: | |
2670 subprocess.check_call( | |
2671 cmd_start, | |
2672 stdin=subprocess.PIPE, | |
2673 stdout=subprocess.PIPE, | |
2674 stderr=subprocess.STDOUT) | |
2675 except subprocess.CalledProcessError, e: | |
2676 if e.returncode == -2147024891: | |
2677 print >> sys.stderr, 'Please restart with an elevated admin prompt' | |
2678 elif e.returncode == -2144337737: | |
2679 print >> sys.stderr, ( | |
2680 'A kernel trace was already running, stop it and try again') | |
2681 raise | |
2682 | |
2683 def trace(self, cmd, cwd, tracename, output): | |
2684 logging.info('trace(%s, %s, %s, %s)' % (cmd, cwd, tracename, output)) | |
2685 assert os.path.isabs(cmd[0]), cmd[0] | |
2686 assert os.path.isabs(cwd), cwd | |
2687 assert os.path.normpath(cwd) == cwd, cwd | |
2688 with self._lock: | |
2689 if not self._initialized: | |
2690 raise TracingFailure( | |
2691 'Called Tracer.trace() on an uninitialized object', | |
2692 None, None, None, tracename) | |
2693 assert tracename not in (i['trace'] for i in self._traces) | |
2694 | |
2695 # Use "logman -?" for help. | |
2696 | |
2697 stdout = stderr = None | |
2698 if output: | |
2699 stdout = subprocess.PIPE | |
2700 stderr = subprocess.STDOUT | |
2701 | |
2702 # Run the child process. | |
2703 logging.debug('Running: %s' % cmd) | |
2704 # Use the temporary script generated with create_thunk() so we have a | |
2705 # clear pid owner. Since trace_inputs.py can be used as a library and | |
2706 # could trace multiple processes simultaneously, it would be more complex | |
2707 # if the executable to be traced were executed directly here. It also solves | |
2708 # issues related to logman.exe, which needs to be executed to control the | |
2709 # kernel trace. | |
2710 child_cmd = [ | |
2711 sys.executable, | |
2712 self._script, | |
2713 tracename, | |
2714 ] | |
2715 child = subprocess.Popen( | |
2716 child_cmd + fix_python_path(cmd), | |
2717 cwd=cwd, | |
2718 stdin=subprocess.PIPE, | |
2719 stdout=stdout, | |
2720 stderr=stderr) | |
2721 logging.debug('Started child pid: %d' % child.pid) | |
2722 out = child.communicate()[0] | |
2723 # This doesn't mean all the grand-children are done. Sadly, we don't have | |
2724 # a good way to determine that. | |
2725 | |
2726 with self._lock: | |
2727 assert tracename not in (i['trace'] for i in self._traces) | |
2728 self._traces.append({ | |
2729 'command': cmd, | |
2730 'cwd': cwd, | |
2731 'pid': child.pid, | |
2732 'trace': tracename, | |
2733 'output': out, | |
2734 }) | |
2735 | |
2736 return child.returncode, out | |
2737 | |
2738 def close(self, _timeout=None): | |
2739 """Stops the kernel log collection and converts the traces to text | |
2740 representation. | |
2741 """ | |
2742 with self._lock: | |
2743 if not self._initialized: | |
2744 raise TracingFailure( | |
2745 'Called Tracer.close() on an uninitialized object', | |
2746 None, None, None) | |
2747 os.remove(self._script) | |
2748 # Save metadata, add 'format' key. | |
2749 data = { | |
2750 'format': 'csv', | |
2751 'traces': self._traces, | |
2752 } | |
2753 write_json(self._logname, data, False) | |
2754 | |
2755 cmd_stop = [ | |
2756 'logman.exe', | |
2757 'stop', | |
2758 'NT Kernel Logger', | |
2759 '-ets', # Sends the command directly to the kernel. | |
2760 ] | |
2761 logging.debug('Running: %s' % cmd_stop) | |
2762 subprocess.check_call( | |
2763 cmd_stop, | |
2764 stdin=subprocess.PIPE, | |
2765 stdout=subprocess.PIPE, | |
2766 stderr=subprocess.STDOUT) | |
2767 self._initialized = False | |
2768 | |
2769 def post_process_log(self): | |
2770 """Converts the .etl file into .csv then into .json.""" | |
2771 super(LogmanTrace.Tracer, self).post_process_log() | |
2772 logformat = 'csv' | |
2773 self._convert_log(logformat) | |
2774 | |
2775 if logformat == 'csv_utf16': | |
2776 def load_file(): | |
2777 def utf_8_encoder(unicode_csv_data): | |
2778 """Encodes the unicode object as utf-8 encoded str instance""" | |
2779 for line in unicode_csv_data: | |
2780 yield line.encode('utf-8') | |
2781 | |
2782 def unicode_csv_reader(unicode_csv_data, **kwargs): | |
2783 """Encodes temporarily as UTF-8 since csv module doesn't do unicode. | |
2784 """ | |
2785 csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs) | |
2786 for row in csv_reader: | |
2787 # Decode str utf-8 instances back to unicode instances, cell by | |
2788 # cell: | |
2789 yield [cell.decode('utf-8') for cell in row] | |
2790 | |
2791 # The CSV file is UTF-16 so use codecs.open() to load the file into | |
2792 # python unicode instances. Then explicitly | |
2793 # re-encode them as utf-8 str instances so csv can parse them. Then | |
2794 # decode the utf-8 str back into python unicode instances. This | |
2795 # sounds about right. | |
2796 for line in unicode_csv_reader( | |
2797 codecs.open(self._logname + '.' + logformat, 'r', 'utf-16')): | |
2798 # line is a list of unicode objects | |
2799 # So much white space! | |
2800 yield [i.strip() for i in line] | |
2801 | |
2802 elif logformat == 'csv': | |
2803 def load_file(): | |
2804 def ansi_csv_reader(ansi_csv_data, **kwargs): | |
2805 """Loads an 'ANSI' code page and returns unicode() objects.""" | |
2806 assert sys.getfilesystemencoding() == 'mbcs' | |
2807 encoding = get_current_encoding() | |
2808 for row in csv.reader(ansi_csv_data, **kwargs): | |
2809 # Decode str 'ansi' instances to unicode instances, cell by cell: | |
2810 yield [cell.decode(encoding) for cell in row] | |
2811 | |
2812 # The fastest and smallest format but only supports 'ANSI' file paths. | |
2813 # I.e. the filenames are encoded in the 'current' encoding. | |
2814 for line in ansi_csv_reader(open(self._logname + '.' + logformat)): | |
2815 # line is a list of unicode objects. | |
2816 yield [i.strip() for i in line] | |
2817 | |
2818 supported_events = LogmanTrace.Context.supported_events() | |
2819 | |
2820 def trim(generator): | |
2821 for index, line in enumerate(generator): | |
2822 if not index: | |
2823 if line != self.EXPECTED_HEADER: | |
2824 raise TracingFailure( | |
2825 'Found malformed header: %s' % ' '.join(line), | |
2826 None, None, None) | |
2827 continue | |
2828 # As you can see, the CSV is full of useful non-redundant information: | |
2829 if (line[2] != '0' or # Event ID | |
2830 line[3] not in ('2', '3') or # Version | |
2831 line[4] != '0' or # Channel | |
2832 line[5] != '0' or # Level | |
2833 line[7] != '0' or # Task | |
2834 line[8] != '0x0000000000000000' or # Keyword | |
2835 line[12] != '' or # Instance ID | |
2836 line[13] != '' or # Parent Instance ID | |
2837 line[14] != self.NULL_GUID or # Activity ID | |
2838 line[15] != ''): # Related Activity ID | |
2839 raise TracingFailure( | |
2840 'Found unexpected values in line: %s' % ' '.join(line), | |
2841 None, None, None) | |
2842 | |
2843 if (line[self.EVENT_NAME], line[self.TYPE]) not in supported_events: | |
2844 continue | |
2845 | |
2846 # Convert the PID in-place from hex. | |
2847 line[self.PID] = int(line[self.PID], 16) | |
2848 | |
2849 yield [ | |
2850 line[self.EVENT_NAME], | |
2851 line[self.TYPE], | |
2852 line[self.PID], | |
2853 line[self.TID], | |
2854 line[self.PROCESSOR_ID], | |
2855 line[self.TIMESTAMP], | |
2856 ] + line[self.USER_DATA:] | |
2857 | |
2858 write_json('%s.json' % self._logname, list(trim(load_file())), True) | |
2859 | |
2860 def _convert_log(self, logformat): | |
2861 """Converts the ETL trace to text representation. | |
2862 | |
2863 Normally, 'csv' is sufficient. If complex scripts are used (like eastern | |
2864 languages), use 'csv_utf16'. If localization gets in the way, use 'xml'. | |
2865 | |
2866 Arguments: | |
2867 - logformat: Text format to be generated, csv, csv_utf16 or xml. | |
2868 | |
2869 Use "tracerpt -?" for help. | |
2870 """ | |
2871 LOCALE_INVARIANT = 0x7F | |
2872 windll.kernel32.SetThreadLocale(LOCALE_INVARIANT) | |
2873 cmd_convert = [ | |
2874 'tracerpt.exe', | |
2875 '-l', self._logname + '.etl', | |
2876 '-o', self._logname + '.' + logformat, | |
2877 '-gmt', # Use UTC | |
2878 '-y', # No prompt | |
2879 # Use -of XML to get the headers of each item after column 19, e.g. all | |
2880 # the actual headers of 'User Data'. | |
2881 ] | |
2882 | |
2883 if logformat == 'csv': | |
2884 # tracerpt localizes the 'Type' column, for major brainfuck | |
2885 # entertainment. I can't imagine any sane reason to do that. | |
2886 cmd_convert.extend(['-of', 'CSV']) | |
2887 elif logformat == 'csv_utf16': | |
2888 # This causes it to use UTF-16, which doubles the log size but ensures | |
2889 # the log is readable for non-ASCII characters. | |
2890 cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode']) | |
2891 elif logformat == 'xml': | |
2892 cmd_convert.extend(['-of', 'XML']) | |
2893 else: | |
2894 raise ValueError('Unexpected log format \'%s\'' % logformat) | |
2895 logging.debug('Running: %s' % cmd_convert) | |
2896 # This can take tens of minutes for large logs. | |
2897 # Redirects all output to stderr. | |
2898 subprocess.check_call( | |
2899 cmd_convert, | |
2900 stdin=subprocess.PIPE, | |
2901 stdout=sys.stderr, | |
2902 stderr=sys.stderr) | |
2903 | |
2904 @staticmethod | |
2905 def clean_trace(logname): | |
2906 for ext in ('', '.csv', '.etl', '.json', '.xml'): | |
2907 if os.path.isfile(logname + ext): | |
2908 os.remove(logname + ext) | |
2909 | |
2910 @classmethod | |
2911 def parse_log(cls, logname, blacklist): | |
2912 logging.info('parse_log(%s, %s)' % (logname, blacklist)) | |
2913 | |
2914 def blacklist_more(filepath): | |
2915 # All the NTFS metadata is in the form x:\$EXTEND or stuff like that. | |
2916 return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath) | |
2917 | |
2918 data = read_json(logname) | |
2919 lines = read_json(logname + '.json') | |
2920 out = [] | |
2921 for item in data['traces']: | |
2922 context = cls.Context(blacklist_more, item['pid']) | |
2923 for line in lines: | |
2924 context.on_line(line) | |
2925 out.append( | |
2926 { | |
2927 'results': context.to_results(), | |
2928 'trace': item['trace'], | |
2929 'output': item['output'], | |
2930 }) | |
2931 return out | |
2932 | |
2933 | |
2934 def get_api(): | |
2935 """Returns the correct implementation for the current OS.""" | |
2936 if sys.platform == 'cygwin': | |
2937 raise NotImplementedError( | |
2938 'Not implemented for cygwin, start the script from Win32 python') | |
2939 flavors = { | |
2940 'win32': LogmanTrace, | |
2941 'darwin': Dtrace, | |
2942 'sunos5': Dtrace, | |
2943 'freebsd7': Dtrace, | |
2944 'freebsd8': Dtrace, | |
2945 } | |
2946 # Defaults to strace. | |
2947 return flavors.get(sys.platform, Strace)() | |
2948 | |
2949 | |
2950 def extract_directories(root_dir, files, blacklist): | |
2951 """Detects if all the files in a directory are in |files| and if so, replace | |
2952 the individual files by a Results.Directory instance. | |
2953 | |
2954 Takes a list of Results.File instances and returns a shorter list of | |
2955 Results.File and Results.Directory instances. | |
2956 | |
2957 Arguments: | |
2958 - root_dir: Optional base directory that shouldn't be searched further. | |
2959 - files: list of Results.File instances. | |
2960 - blacklist: function returning True for files to ignore, e.g. built from r'.+\.pyc'. | |
2961 """ | |
2962 logging.info( | |
2963 'extract_directories(%s, %d files, ...)' % (root_dir, len(files))) | |
2964 assert not (root_dir or '').endswith(os.path.sep), root_dir | |
2965 assert not root_dir or (get_native_path_case(root_dir) == root_dir) | |
2966 assert not any(isinstance(f, Results.Directory) for f in files) | |
2967 # Remove non-existent files. | |
2968 files = [f for f in files if f.existent] | |
2969 if not files: | |
2970 return files | |
2971 # All files must share the same root, which can be None. | |
2972 assert len(set(f.root for f in files)) == 1, set(f.root for f in files) | |
2973 | |
2974 # Creates a {directory: {filename: File}} mapping, up to root. | |
2975 buckets = {} | |
2976 if root_dir: | |
2977 buckets[root_dir] = {} | |
2978 for fileobj in files: | |
2979 path = fileobj.full_path | |
2980 directory = os.path.dirname(path) | |
2981 assert directory | |
2982 # Do not use os.path.basename() so trailing os.path.sep is kept. | |
2983 basename = path[len(directory)+1:] | |
2984 files_in_directory = buckets.setdefault(directory, {}) | |
2985 files_in_directory[basename] = fileobj | |
2986 # Add all the directories recursively up to root. | |
2987 while True: | |
2988 old_d = directory | |
2989 directory = os.path.dirname(directory) | |
2990 if directory + os.path.sep == root_dir or directory == old_d: | |
2991 break | |
2992 buckets.setdefault(directory, {}) | |
2993 | |
2994 root_prefix = len(root_dir) + 1 if root_dir else 0 | |
2995 for directory in sorted(buckets, reverse=True): | |
2996 actual = set(f for f in os.listdir(directory) if not blacklist(f)) | |
2997 expected = set(buckets[directory]) | |
2998 if not (actual - expected): | |
2999 parent = os.path.dirname(directory) | |
3000 buckets[parent][os.path.basename(directory)] = Results.Directory( | |
3001 root_dir, | |
3002 directory[root_prefix:], | |
3003 False, | |
3004 sum(f.size for f in buckets[directory].itervalues()), | |
3005 sum(f.nb_files for f in buckets[directory].itervalues())) | |
3006 # Remove the whole bucket. | |
3007 del buckets[directory] | |
3008 | |
3009 # Reverse the mapping with what remains. The original instances are returned, | |
3010 # so the cached meta data is kept. | |
3011 files = sum((x.values() for x in buckets.itervalues()), []) | |
3012 return sorted(files, key=lambda x: x.path) | |
3013 | |
3014 | |
3015 def trace(logfile, cmd, cwd, api, output): | |
3016 """Traces an executable. Returns (returncode, output) from api. | |
3017 | |
3018 Arguments: | |
3019 - logfile: file to write to. | |
3020 - cmd: command to run. | |
3021 - cwd: current directory to start the process in. | |
3022 - api: a tracing api instance. | |
3023 - output: if True, returns the output, otherwise prints it to the console. | |
3024 """ | |
3025 cmd = fix_python_path(cmd) | |
3026 api.clean_trace(logfile) | |
3027 with api.get_tracer(logfile) as tracer: | |
3028 return tracer.trace(cmd, cwd, 'default', output) | |
3029 | |
3030 | |
3031 def load_trace(logfile, root_dir, api, blacklist): | |
3032 """Loads a trace file and returns the Results instance. | |
3033 | |
3034 Arguments: | |
3035 - logfile: File to load. | |
3036 - root_dir: Root directory to use to determine if a file is relevant to the | |
3037 trace or not. | |
3038 - api: A tracing api instance. | |
3039 - blacklist: Optional blacklist function to filter out unimportant files. | |
3040 """ | |
3041 data = api.parse_log(logfile, (blacklist or (lambda _: False))) | |
3042 assert len(data) == 1, 'More than one trace was detected!' | |
3043 if 'exception' in data[0]: | |
3044 # It got an exception, raise it. | |
3045 raise data[0]['exception'] | |
3046 results = data[0]['results'] | |
3047 if root_dir: | |
3048 results = results.strip_root(root_dir) | |
3049 return results | |
3050 | |
3051 | |
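# A minimal end-to-end sketch (never called anywhere) tying trace() and
# load_trace() together, roughly what CMDtrace and CMDread do below. The
# 'trace_example.log' file name and the 'hello_world.py' script are
# hypothetical placeholders used only for illustration.
def _example_trace_and_load(root_dir):
  api = get_api()
  logfile = os.path.join(root_dir, 'trace_example.log')
  cmd = [sys.executable, os.path.join(root_dir, 'hello_world.py')]
  returncode, _ = trace(logfile, cmd, root_dir, api, True)
  if returncode:
    return None
  # Parse the implementation-specific log into a Results instance, keeping
  # only the files under root_dir.
  return load_trace(logfile, root_dir, api, None)

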
def CMDclean(args):
  """Cleans up traces."""
  parser = OptionParserTraceInputs(command='clean')
  options, args = parser.parse_args(args)
  api = get_api()
  api.clean_trace(options.log)
  return 0


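# Illustrative usage of the 'clean' command above; the log file name is
# hypothetical and this assumes the script is saved as trace_inputs.py:
#   python trace_inputs.py clean -l out.log
# This removes the trace log given with -l so a new trace can start fresh.

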
def CMDtrace(args):
  """Traces an executable."""
  parser = OptionParserTraceInputs(command='trace')
  parser.allow_interspersed_args = False
  parser.add_option(
      '-q', '--quiet', action='store_true',
      help='Redirects traced executable output to /dev/null')
  options, args = parser.parse_args(args)

  if not args:
    parser.error('Please provide a command to run')

  if not os.path.isabs(args[0]) and os.access(args[0], os.X_OK):
    args[0] = os.path.abspath(args[0])

  api = get_api()
  return trace(options.log, args, os.getcwd(), api, options.quiet)[0]


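# Illustrative usage of the 'trace' command above; the file names are
# hypothetical, the script is assumed to be saved as trace_inputs.py and this
# helper is never invoked. It is the programmatic equivalent of:
#   python trace_inputs.py trace -l out.log python hello_world.py
# Note that since allow_interspersed_args is False, options for this script
# must come before the traced command line.
def _example_cmdtrace():
  return CMDtrace(['-l', 'out.log', sys.executable, 'hello_world.py'])

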
def CMDread(args):
  """Reads the logs and prints the result."""
  parser = OptionParserTraceInputs(command='read')
  parser.add_option(
      '-V', '--variable',
      nargs=2,
      action='append',
      dest='variables',
      metavar='VAR_NAME directory',
      default=[],
      help=('Variables to replace relative directories against. Example: '
            '"-V \'$HOME\' \'/home/%s\'" will replace all occurrences of '
            'your home dir with $HOME') % getpass.getuser())
  parser.add_option(
      '--root-dir',
      help='Root directory to base everything off of. Anything outside of '
           'this directory will not be reported.')
  parser.add_option(
      '-j', '--json', action='store_true',
      help='Outputs raw result data as JSON')
  parser.add_option(
      '-b', '--blacklist', action='append', default=[],
      help='List of regexps to use as a blacklist filter')
  options, args = parser.parse_args(args)

  if options.root_dir:
    options.root_dir = os.path.abspath(options.root_dir)

  variables = dict(options.variables)
  api = get_api()
  def blacklist(f):
    return any(re.match(b, f) for b in options.blacklist)
  data = api.parse_log(options.log, blacklist)
  # Process each trace.
  output_as_json = []
  for item in data:
    if 'exception' in item:
      print >> sys.stderr, (
          'Trace %s: Got an exception: %s' % (item['trace'], item['exception']))
      continue
    results = item['results']
    if options.root_dir:
      results = results.strip_root(options.root_dir)

    if options.json:
      output_as_json.append(results.flatten())
    else:
      simplified = extract_directories(
          options.root_dir, results.files, blacklist)
      simplified = [f.replace_variables(variables) for f in simplified]
      if len(data) > 1:
        print('Trace: %s' % item['trace'])
      print('Total: %d' % len(results.files))
      print('Non-existent: %d' % len(results.non_existent))
      for f in results.non_existent:
        print('  %s' % f.path)
      print(
          'Interesting: %d reduced to %d' % (
              len(results.existent), len(simplified)))
      for f in simplified:
        print('  %s' % f.path)

  if options.json:
    write_json(sys.stdout, output_as_json, False)
  return 0


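# Illustrative usage of the 'read' command above; the log file name, root
# directory and home path are hypothetical and this helper is never invoked.
# It is the programmatic equivalent of:
#   python trace_inputs.py read -l out.log --root-dir /path/to/checkout \
#       -V '$HOME' '/home/user' -b '.+\.pyc'
def _example_cmdread():
  return CMDread([
      '-l', 'out.log',
      '--root-dir', '/path/to/checkout',
      '-V', '$HOME', '/home/user',
      '-b', r'.+\.pyc',
  ])

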
class OptionParserWithLogging(optparse.OptionParser):
  """Adds --verbose option."""
  def __init__(self, verbose=0, **kwargs):
    optparse.OptionParser.__init__(self, **kwargs)
    self.add_option(
        '-v', '--verbose',
        action='count',
        default=verbose,
        help='Use multiple times to increase verbosity')

  def parse_args(self, *args, **kwargs):
    options, args = optparse.OptionParser.parse_args(self, *args, **kwargs)
    levels = [logging.ERROR, logging.INFO, logging.DEBUG]
    logging.basicConfig(
        level=levels[min(len(levels)-1, options.verbose)],
        format='%(levelname)5s %(module)15s(%(lineno)3d): %(message)s')
    return options, args


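# A small sketch (never called) of the cumulative -v flag added above:
# parse_args() clamps the count into the 'levels' list, so no -v keeps
# logging.ERROR, one -v selects logging.INFO, and two or more select
# logging.DEBUG. The '%prog [options]' usage string is only illustrative.
def _example_verbosity():
  parser = OptionParserWithLogging(usage='%prog [options]')
  options, _ = parser.parse_args(['-v', '-v'])
  assert options.verbose == 2  # Mapped to logging.DEBUG by parse_args().

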
class OptionParserWithNiceDescription(OptionParserWithLogging):
  """Generates the description with the command's docstring."""
  def __init__(self, **kwargs):
    """Sets 'description' and 'usage' if not already specified."""
    command = kwargs.pop('command', 'help')
    kwargs.setdefault(
        'description',
        re.sub('[\r\n ]{2,}', ' ', get_command_handler(command).__doc__))
    kwargs.setdefault('usage', '%%prog %s [options]' % command)
    OptionParserWithLogging.__init__(self, **kwargs)


class OptionParserTraceInputs(OptionParserWithNiceDescription):
  """Adds automatic --log handling."""
  def __init__(self, **kwargs):
    OptionParserWithNiceDescription.__init__(self, **kwargs)
    self.add_option(
        '-l', '--log', help='Log file to generate or read, required')

  def parse_args(self, *args, **kwargs):
    """Makes sure the paths make sense.

    On Windows, / and \\ are often mixed together in a path.
    """
    options, args = OptionParserWithNiceDescription.parse_args(
        self, *args, **kwargs)
    if not options.log:
      self.error('Must supply a log file with -l')
    options.log = os.path.abspath(options.log)
    return options, args


def extract_documentation():
  """Returns a dict {command: description} for each documented command."""
  commands = (
      fn[3:]
      for fn in dir(sys.modules['__main__'])
      if fn.startswith('CMD') and get_command_handler(fn[3:]).__doc__)
  return dict((fn, get_command_handler(fn).__doc__) for fn in commands)


def CMDhelp(args):
  """Prints list of commands or help for a specific command."""
  doc = extract_documentation()
  # Calculates the optimal offset.
  offset = max(len(cmd) for cmd in doc)
  format_str = '  %-' + str(offset + 2) + 's %s'
  # Generates one-line documentation for each command.
  commands_description = '\n'.join(
      format_str % (cmd, doc[cmd].split('\n')[0]) for cmd in sorted(doc))

  parser = OptionParserWithNiceDescription(
      usage='%prog <command> [options]',
      description='Commands are:\n%s\n' % commands_description)
  parser.format_description = lambda _: parser.description

  # Strip out any -h or --help argument.
  _, args = parser.parse_args([i for i in args if i not in ('-h', '--help')])
  if len(args) == 1:
    if not get_command_handler(args[0]):
      parser.error('Unknown command %s' % args[0])
    # The command was "%prog help command", replace ourselves with
    # "%prog command --help" so help is correctly printed out.
    return main(args + ['--help'])
  elif args:
    parser.error('Unknown argument "%s"' % ' '.join(args))
  parser.print_help()
  return 0


def get_command_handler(name):
  """Returns the command handler for |name|, or None if it doesn't exist."""
  return getattr(sys.modules['__main__'], 'CMD%s' % name, None)


def main_impl(argv):
  command = get_command_handler(argv[0] if argv else 'help')
  if not command:
    return CMDhelp(argv)
  return command(argv[1:])
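

# A small illustrative sketch (never executed) of how the dispatch above
# works, assuming this file is being run as the main module, just like the
# dispatch logic itself does: a known command name such as 'trace' resolves to
# the module-level CMDtrace function, while an unknown name returns None and
# main_impl() falls back to CMDhelp().
def _example_dispatch():
  assert get_command_handler('trace') is CMDtrace
  assert get_command_handler('no_such_command') is None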


def main(argv):
  try:
    return main_impl(argv)
  except TracingFailure, e:
    sys.stderr.write('\nError: ')
    sys.stderr.write(str(e))
    sys.stderr.write('\n')
    return 1


if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))