#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
65 """Annotation which logs the run time of the function.""" | |
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped

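# Example (illustrative sketch; the class and method names are hypothetical):
# LogRunTime is written for instance methods, since the wrapper's first
# positional argument is 'self'.
#
#   class Fetcher(object):
#     @LogRunTime
#     def fetch(self, url):
#       ...  # elapsed time is logged at debug level as 'fetch: <n>ms'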

class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must remain
        in sync with the underlying dict of self. It is used as an optimization
        so that get_requests() doesn't have to linearly search all requests in
        the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
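    # Note: the hasattr() guard below matters during unpickling. For dict
    # subclasses, pickle restores the dict items through __setitem__ before
    # __setstate__ has run, so responses_by_host may not exist yet.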
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the ArchivedHttpResponse to return if no archived response
        matches the original request with the conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200 (OK), 304 (not modified),
      or 412 (precondition failed)
152 """ | |
153 response = default | |
154 if request.is_conditional(): | |
155 stripped_request = request.create_request_without_conditions() | |
156 if stripped_request in self: | |
157 response = self[stripped_request] | |
158 if response.status == 200: | |
159 status = self.get_conditional_status(request, response) | |
160 if status != 200: | |
161 response = create_response(status) | |
162 return response | |
163 | |
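  # get_conditional_status evaluates the conditional headers in order:
  # If-Match, then If-None-Match (both compared against the archived ETag),
  # then, for GET/HEAD requests, If-Modified-Since / If-Unmodified-Since
  # (compared against the archived Last-Modified date as shifted by
  # update_date()).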
  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
199 """Determines whether the entity tags of the request/response matches. | |

    Args:
      request_etag: the value string of the "if-(none)-match:"
          portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                            if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
300 """Merge multiple archives into merged_archive by 'chaining' resources, | |
301 only resources that are not part of the accumlated archive are added""" | |
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, closest matching request's path component must match.
        (Note: this refers to the 'path' component within the URL, not the
         'full path' which includes the query string component.)

        If use_path=True, candidate will NOT match in example below
        e.g. request = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, urls with same paths are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop through
    # until we find a candidate whose ratio() is >= the next quick_ratio().
    # This works because quick_ratio() is guaranteed to be an upper bound on
    # ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
432 """Gets certificate from the server and stores it in archive""" | |
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
504 """ | |
505 self.command = command | |
506 self.host = host | |
507 self.full_path = full_path | |
508 parsed_url = urlparse.urlparse(full_path) if full_path else None | |
509 self.path = parsed_url.path if parsed_url else None | |
510 self.request_body = request_body | |
511 self.headers = headers | |
512 self.is_ssl = is_ssl | |
513 self.trimmed_headers = self._TrimHeaders(headers) | |
514 self.formatted_request = self._GetFormattedRequest() | |
515 self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None) | |
516 | |
517 def __str__(self): | |
518 scheme = 'https' if self.is_ssl else 'http' | |
519 return '%s %s://%s%s %s' % ( | |
520 self.command, scheme, self.host, self.full_path, self.trimmed_headers) | |
521 | |
522 def __repr__(self): | |
523 return repr((self.command, self.host, self.full_path, self.request_body, | |
524 self.trimmed_headers, self.is_ssl)) | |
525 | |
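  # __hash__ and __eq__ are derived from __repr__, which uses trimmed_headers
  # rather than the raw headers; requests that differ only in headers dropped
  # by _TrimHeaders() are therefore treated as the same archive key.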
  def __hash__(self):
527 """Return a integer hash to use for hashed collections including dict.""" | |
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and header for difflib to compare.
    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a2&q2=a2
    and header:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical path

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False
        If use_query is False, req1.matches(req2) evaluates to True

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
        Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])

  def is_conditional(self):
701 """Return list of headers that match conditional headers.""" | |
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
715 """All the data needed to recreate all HTTP response.""" | |

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
778 """Return a integer hash to use for hashed collections including dict.""" | |
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: optional time in epoch seconds to use in place of the current time.
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds.
    Response content begins after DELAY_EDIT_SEPARATOR
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
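    # For non-chunked responses, keep the content-length header in sync with
    # the edited body below.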
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])

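# Example (illustrative): create_response(304) produces an ArchivedHttpResponse
# with reason 'Not Modified', a single 'content-type: text/plain' header, and
# one body chunk of '304 Not Modified'.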

def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # Merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())