#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 16 """View and edit HTTP Archives. | |
| 17 | |
| 18 To list all URLs in an archive: | |
| 19 $ ./httparchive.py ls archive.wpr | |
| 20 | |
| 21 To view the content of all URLs from example.com: | |
| 22 $ ./httparchive.py cat --host example.com archive.wpr | |
| 23 | |
| 24 To view the content of a particular URL: | |
| 25 $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr | |
| 26 | |
| 27 To view the content of all URLs: | |
| 28 $ ./httparchive.py cat archive.wpr | |
| 29 | |
| 30 To edit a particular URL: | |
| 31 $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr | |
| 32 | |
| 33 To print statistics of an archive: | |
| 34 $ ./httparchive.py stats archive.wpr | |
| 35 | |
| 36 To print statistics of a set of URLs: | |
| 37 $ ./httparchive.py stats --host www.example.com archive.wpr | |
| 38 | |
| 39 To merge multiple archives | |
| 40 $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ... | |
| 41 """ | |

import calendar
import cPickle
import difflib
import email.utils
import httplib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict

import certutils
import httpzlib


def LogRunTime(fn):
  """Decorator that logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
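
# Hypothetical usage sketch (not from the original source): LogRunTime is
# written for instance methods -- note the explicit 'self' in wrapped():
#
#   class Fetcher(object):
#     @LogRunTime
#     def fetch(self, url):
#       ...
#
# Each call to fetch() then logs, e.g., 'fetch: 12ms' at debug level, even
# when the method raises.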


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no response
          matches the original request with the conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200 (OK), 304 (not modified),
      or 412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

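  # Sketch of the flow above, with hypothetical values: replaying
  #   GET /logo.png with 'if-none-match: "abc123"'
  # first looks up the archived 200 response under the request with its
  # conditional headers stripped, then compares entity tags and dates below;
  # a match yields a synthesized 304 from create_response() instead of the
  # full archived body.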
  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
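
  # For example (hypothetical tag values):
  #   is_etag_match('"a", "b"', '"b"')  -> True   (a listed tag matches)
  #   is_etag_match('*', '"anything"')  -> True   ('*' matches any etag)
  #   is_etag_match('"a"', '"b"')       -> False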

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Untrimmed response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.original_headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not yet part of the accumulated archive are added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note: we have already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    with open(tmp_file.name) as f:
      response.set_response_from_text(f.read())
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
          match. (Note: this refers to the 'path' component within the URL,
          not the 'full path' which includes the query string component.)

          If use_path=True, the candidate will NOT match in the example below,
          e.g. request   = GET www.test.com/a?p=1
               candidate = GET www.test.com/b?p=1

          Even if use_path=False, URLs with the same path are always favored.
          For example, candidate1 is considered a better match than candidate2.
            request    = GET www.test.com/a?p=1&q=2&r=3
            candidate1 = GET www.test.com/a?s=4
            candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with the strictest constraints. This trims the search space
    # considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax the query constraint if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax the full_path constraint if there is still no match and
    # use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

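  # The two methods below cache certificates in the archive itself, keyed by
  # pseudo-requests whose "command" is SERVER_CERT or DUMMY_CERT rather than
  # a real HTTP method.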
  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31 - 1)  # Lock out other threads so nothing can
                                       # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format the request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and headers for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns True iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
          If use_query is True, request matching uses both the hierarchical
          path and the query string component.
          If use_query is False, request matching only uses the hierarchical
          path.

          e.g. req1 = GET www.test.com/index?aaaa
               req2 = GET www.test.com/index?bbbb

          If use_query is True, req1.matches(req2) evaluates to False.
          If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with '*'. During replay, it's 'text/css'.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used
        by Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the
    request is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent',
        'proxy-connection', 'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
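
  # For example (hypothetical input), _TrimHeaders on
  #   {'host': 'www.example.com', 'user-agent': 'Mozilla/5.0',
  #    'accept-encoding': 'gzip,sdch'}
  # drops 'user-agent', strips 'sdch' from 'accept-encoding', and returns the
  # sorted list [('accept-encoding', 'gzip'), ('host', 'www.example.com')].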

  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response.

  Upon creation, the headers are "trimmed" (i.e. edited or dropped).
  The original headers are saved to self.original_headers, while the
  trimmed ones are used to allow responses to match in a wider variety
  of playback situations.

  For pickling, 'original_headers' are stored in the archive. For unpickling,
  the headers are trimmed again. That allows for changes to the trim
  function and can help with debugging.
  """

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
              Each resource has a value because Replay's record mode
              captures it. This includes the time for the SYN and SYN/ACK
              (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
              This typically includes all the server-time to generate a
              response.
          data - If the response is chunked, these are the times for each
              chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.original_headers = headers
    self.headers = self._TrimHeaders(headers)
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - content-security-policy: Causes problems with script injection.
    """
    undesirable_keys = ['content-security-policy']
    return [(k, v) for k, v in headers if k.lower() not in undesirable_keys]

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "original_headers" are the original response headers.
    "headers" are the trimmed headers used for replaying responses.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    state['original_headers'] = state['headers']
    state['headers'] = self._TrimHeaders(state['original_headers'])
    self.__dict__.update(state)
    self.fix_delays()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    state['headers'] = state['original_headers']
    del state['original_headers']
    return state

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, _) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, _) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float, or None if the date cannot be parsed
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: epoch seconds to use as the current time (defaults to time.time()).
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_chunks(self):
    """Return content as a list of strings, each corresponding to a chunk.

    Uncompresses the chunks, if needed. Returns None for non-text content.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      return httpzlib.uncompress_chunks(self.response_data, self.is_gzip())
    else:
      return self.response_data

  def get_data_as_text(self):
    """Return content as a single string, or None if it is not text.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    chunks = self.get_data_as_chunks()
    if chunks is None:
      return None
    return self.CHUNK_EDIT_SEPARATOR.join(chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data cannot be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))
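
  # The editable text produced above looks like (values hypothetical):
  #   {
  #     "connect": 50,
  #     "headers": 150,
  #     "data": [0, 10]
  #   }
  #   [WEB_PAGE_REPLAY_EDIT_ARCHIVE --- Delays are above. Response content is below.]
  #   <chunk 1>[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]<chunk 2>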

  def set_data_from_chunks(self, text_chunks):
    """Inverse of get_data_as_chunks().

    Compresses, if needed.
    """
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks,
                                                    self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Splits on CHUNK_EDIT_SEPARATOR and compresses if needed.
    """
    self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON-encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = '%s %s' % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
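
# For example, create_response(404) yields an ArchivedHttpResponse with
# reason 'Not Found' (looked up in httplib.responses), a text/plain
# content-type header, and the single body chunk '404 Not Found'.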


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
                           action='store',
                           type='string',
                           help='The output file to use when using the merge '
                                'command.')

  options, args = option_parser.parse_args()

  # The merge command accepts an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())