Chromium Code Reviews

Side by Side Diff: tools/telemetry/third_party/webpagereplay/httparchive.py

Issue 1647513002: Delete tools/telemetry. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 10 months ago
1 #!/usr/bin/env python
2 # Copyright 2010 Google Inc. All Rights Reserved.
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 """View and edit HTTP Archives.
17
18 To list all URLs in an archive:
19 $ ./httparchive.py ls archive.wpr
20
21 To view the content of all URLs from example.com:
22 $ ./httparchive.py cat --host example.com archive.wpr
23
24 To view the content of a particular URL:
25 $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr
26
27 To view the content of all URLs:
28 $ ./httparchive.py cat archive.wpr
29
30 To edit a particular URL:
31 $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr
32
33 To print statistics of an archive:
34 $ ./httparchive.py stats archive.wpr
35
36 To print statistics of a set of URLs:
37 $ ./httparchive.py stats --host www.example.com archive.wpr
38
39 To merge multiple archives:
40 $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
41 """
42
43 import calendar
44 import certutils
45 import cPickle
46 import difflib
47 import email.utils
48 import httplib
49 import httpzlib
50 import json
51 import logging
52 import optparse
53 import os
54 import StringIO
55 import subprocess
56 import sys
57 import tempfile
58 import time
59 import urlparse
60 from collections import defaultdict
61
62
63
64 def LogRunTime(fn):
65 """Decorator that logs the run time of the function."""
66 def wrapped(self, *args, **kwargs):
67 start_time = time.time()
68 try:
69 return fn(self, *args, **kwargs)
70 finally:
71 run_time = (time.time() - start_time) * 1000.0
72 logging.debug('%s: %dms', fn.__name__, run_time)
73 return wrapped
74
75
76 class HttpArchiveException(Exception):
77 """Base class for all exceptions in httparchive."""
78 pass
79
80
81 class HttpArchive(dict):
82 """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.
83
84 Attributes:
85 responses_by_host: dict of {hostname: {request: response}}. This must remain
86 in sync with the underlying dict of self. It is used as an optimization
87 so that get_requests() doesn't have to linearly search all requests in
88 the archive to find potential matches.
89 """
90
91 def __init__(self): # pylint: disable=super-init-not-called
92 self.responses_by_host = defaultdict(dict)
93
94 def __setstate__(self, state):
95 """Influence how to unpickle.
96
97 Args:
98 state: a dictionary for __dict__
99 """
100 self.__dict__.update(state)
101 self.responses_by_host = defaultdict(dict)
102 for request in self:
103 self.responses_by_host[request.host][request] = self[request]
104
105 def __getstate__(self):
106 """Influence how to pickle.
107
108 Returns:
109 a dict to use for pickling
110 """
111 state = self.__dict__.copy()
112 del state['responses_by_host']
113 return state
114
115 def __setitem__(self, key, value):
116 super(HttpArchive, self).__setitem__(key, value)
117 if hasattr(self, 'responses_by_host'):
118 self.responses_by_host[key.host][key] = value
119
120 def __delitem__(self, key):
121 super(HttpArchive, self).__delitem__(key)
122 del self.responses_by_host[key.host][key]
123
124 def get(self, request, default=None):
125 """Return the archived response for a given request.
126
127 Does extra checking for handling some HTTP request headers.
128
129 Args:
130 request: instance of ArchivedHttpRequest
131 default: default value to return if request is not found
132
133 Returns:
134 Instance of ArchivedHttpResponse or default if no matching
135 response is found
136 """
137 if request in self:
138 return self[request]
139 return self.get_conditional_response(request, default)
140
141 def get_conditional_response(self, request, default):
142 """Get the response based on the conditional HTTP request headers.
143
144 Args:
145 request: an ArchivedHttpRequest representing the original request.
146 default: the default ArchivedHttpResponse to return if no conditional
147 match is found.
148
149 Returns:
150 an ArchivedHttpResponse with a status of 200, 304 (not modified), or
151 412 (precondition failed)
152 """
153 response = default
154 if request.is_conditional():
155 stripped_request = request.create_request_without_conditions()
156 if stripped_request in self:
157 response = self[stripped_request]
158 if response.status == 200:
159 status = self.get_conditional_status(request, response)
160 if status != 200:
161 response = create_response(status)
162 return response
163
164 def get_conditional_status(self, request, response):
165 status = 200
166 last_modified = email.utils.parsedate(
167 response.update_date(response.get_header('last-modified')))
168 response_etag = response.get_header('etag')
169 is_get_or_head = request.command.upper() in ('GET', 'HEAD')
170
171 match_value = request.headers.get('if-match', None)
172 if match_value:
173 if self.is_etag_match(match_value, response_etag):
174 status = 200
175 else:
176 status = 412 # precondition failed
177 none_match_value = request.headers.get('if-none-match', None)
178 if none_match_value:
179 if self.is_etag_match(none_match_value, response_etag):
180 status = 304
181 elif is_get_or_head:
182 status = 200
183 else:
184 status = 412
185 if is_get_or_head and last_modified:
186 for header in ('if-modified-since', 'if-unmodified-since'):
187 date = email.utils.parsedate(request.headers.get(header, None))
188 if date:
189 if ((header == 'if-modified-since' and last_modified > date) or
190 (header == 'if-unmodified-since' and last_modified < date)):
191 if status != 412:
192 status = 200
193 else:
194 status = 304 # not modified
195 return status
196
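# A standalone sketch (stdlib only) of the date comparison performed above:
# a GET whose If-Modified-Since is not older than the response's
# Last-Modified is answered with 304. The header values are hypothetical.
import calendar
import email.utils

def _epoch(date_str):
  date_tuple = email.utils.parsedate(date_str)
  return calendar.timegm(date_tuple) if date_tuple else None

last_modified = _epoch('Thu, 01 Dec 1994 16:00:00 GMT')      # response header
if_modified_since = _epoch('Fri, 02 Dec 1994 16:00:00 GMT')  # request header
print 304 if last_modified <= if_modified_since else 200     # prints 304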
197 @staticmethod
198 def is_etag_match(request_etag, response_etag):
199 """Determines whether the entity tags of the request/response match.
200
201 Args:
202 request_etag: the value string of the "if-(none)-match:"
203 portion of the request header
204 response_etag: the etag value of the response
205
206 Returns:
207 True on match, False otherwise
208 """
209 response_etag = response_etag.strip('" ')
210 for etag in request_etag.split(','):
211 etag = etag.strip('" ')
212 if etag in ('*', response_etag):
213 return True
214 return False
215
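# Concrete values for the ETag rule above (hypothetical header values): the
# request tag list matches when any tag, stripped of quotes, equals the
# response ETag or is the wildcard '*'.
request_etags = [t.strip('" ') for t in '"xyzzy", "r2d2xxxx"'.split(',')]
response_etag = '"r2d2xxxx"'.strip('" ')
print any(t in ('*', response_etag) for t in request_etags)  # prints True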
216 def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
217 use_query=True):
218 """Return a list of requests that match the given args."""
219 if host:
220 return [r for r in self.responses_by_host[host]
221 if r.matches(command, None, full_path, is_ssl,
222 use_query=use_query)]
223 else:
224 return [r for r in self
225 if r.matches(command, host, full_path, is_ssl,
226 use_query=use_query)]
227
228 def ls(self, command=None, host=None, full_path=None):
229 """List all URLs that match given params."""
230 return ''.join(sorted(
231 '%s\n' % r for r in self.get_requests(command, host, full_path)))
232
233 def cat(self, command=None, host=None, full_path=None):
234 """Print the contents of all URLs that match given params."""
235 out = StringIO.StringIO()
236 for request in self.get_requests(command, host, full_path):
237 print >>out, str(request)
238 print >>out, 'Untrimmed request headers:'
239 for k in request.headers:
240 print >>out, ' %s: %s' % (k, request.headers[k])
241 if request.request_body:
242 print >>out, request.request_body
243 print >>out, '---- Response Info', '-' * 51
244 response = self[request]
245 chunk_lengths = [len(x) for x in response.response_data]
246 print >>out, ('Status: %s\n'
247 'Reason: %s\n'
248 'Headers delay: %s\n'
249 'Response headers:') % (
250 response.status, response.reason, response.delays['headers'])
251 for k, v in response.headers:
252 print >>out, ' %s: %s' % (k, v)
253 print >>out, ('Chunk count: %s\n'
254 'Chunk lengths: %s\n'
255 'Chunk delays: %s') % (
256 len(chunk_lengths), chunk_lengths, response.delays['data'])
257 body = response.get_data_as_text()
258 print >>out, '---- Response Data', '-' * 51
259 if body:
260 print >>out, body
261 else:
262 print >>out, '[binary data]'
263 print >>out, '=' * 70
264 return out.getvalue()
265
266 def stats(self, command=None, host=None, full_path=None):
267 """Print stats about the archive for all URLs that match given params."""
268 matching_requests = self.get_requests(command, host, full_path)
269 if not matching_requests:
270 print 'Failed to find any requests matching given command, host, path.'
271 return
272
273 out = StringIO.StringIO()
274 stats = {
275 'Total': len(matching_requests),
276 'Domains': defaultdict(int),
277 'HTTP_response_code': defaultdict(int),
278 'content_type': defaultdict(int),
279 'Documents': defaultdict(int),
280 }
281
282 for request in matching_requests:
283 stats['Domains'][request.host] += 1
284 stats['HTTP_response_code'][self[request].status] += 1
285
286 content_type = self[request].get_header('content-type')
287 # Remove content type options for readability and higher level groupings.
288 str_content_type = str(content_type.split(';')[0]
289 if content_type else None)
290 stats['content_type'][str_content_type] += 1
291
292 # Documents are the main URL requested and not a referenced resource.
293 if str_content_type == 'text/html' and not 'referer' in request.headers:
294 stats['Documents'][request.host] += 1
295
296 print >>out, json.dumps(stats, indent=4)
297 return out.getvalue()
298
299 def merge(self, merged_archive=None, other_archives=None):
300 """Merge multiple archives into merged_archive by 'chaining' resources;
301 only resources that are not already in the accumulated archive are added."""
302 if not other_archives:
303 print 'No archives passed to merge'
304 return
305
306 # Note we already loaded 'replay_file'.
307 print 'Loaded %d responses' % len(self)
308
309 for archive in other_archives:
310 if not os.path.exists(archive):
311 print 'Error: Replay file "%s" does not exist' % archive
312 return
313
314 http_archive_other = HttpArchive.Load(archive)
315 print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
316 for r in http_archive_other:
317 # Only resources that are not already part of the current archive
318 # get added.
319 if r not in self:
320 print '\t %s ' % r
321 self[r] = http_archive_other[r]
322 self.Persist('%s' % merged_archive)
323
324 def edit(self, command=None, host=None, full_path=None):
325 """Edits the single request which matches given params."""
326 editor = os.getenv('EDITOR')
327 if not editor:
328 print 'You must set the EDITOR environment variable.'
329 return
330
331 matching_requests = self.get_requests(command, host, full_path)
332 if not matching_requests:
333 print ('Failed to find any requests matching given command, host, '
334 'full_path.')
335 return
336
337 if len(matching_requests) > 1:
338 print 'Found multiple matching requests. Please refine.'
339 print self.ls(command, host, full_path)
340
341 response = self[matching_requests[0]]
342 tmp_file = tempfile.NamedTemporaryFile(delete=False)
343 tmp_file.write(response.get_response_as_text())
344 tmp_file.close()
345 subprocess.check_call([editor, tmp_file.name])
346 response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
347 os.remove(tmp_file.name)
348
349 def find_closest_request(self, request, use_path=False):
350 """Find the closest matching request in the archive to the given request.
351
352 Args:
353 request: an ArchivedHttpRequest
354 use_path: If True, closest matching request's path component must match.
355 (Note: this refers to the 'path' component within the URL, not the
356 'full path' which includes the query string component.)
357
358 If use_path=True, candidate will NOT match in example below
359 e.g. request = GET www.test.com/a?p=1
360 candidate = GET www.test.com/b?p=1
361
362 Even if use_path=False, URLs with the same path are always favored.
363 For example, candidate1 is considered a better match than candidate2.
364 request = GET www.test.com/a?p=1&q=2&r=3
365 candidate1 = GET www.test.com/a?s=4
366 candidate2 = GET www.test.com/b?p=1&q=2&r=3
367
368 Returns:
369 If a close match is found, return the instance of ArchivedHttpRequest.
370 Otherwise, return None.
371 """
372 # Start with strictest constraints. This trims search space considerably.
373 requests = self.get_requests(request.command, request.host,
374 request.full_path, is_ssl=request.is_ssl,
375 use_query=True)
376 # Relax constraint: use_query if there is no match.
377 if not requests:
378 requests = self.get_requests(request.command, request.host,
379 request.full_path, is_ssl=request.is_ssl,
380 use_query=False)
381 # Relax constraint: full_path if there is no match and use_path=False.
382 if not requests and not use_path:
383 requests = self.get_requests(request.command, request.host,
384 None, is_ssl=request.is_ssl,
385 use_query=False)
386
387 if not requests:
388 return None
389
390 if len(requests) == 1:
391 return requests[0]
392
393 matcher = difflib.SequenceMatcher(b=request.cmp_seq)
394
395 # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
396 # quick_ratio() on all requests, sort them descending, and then loop through
397 # until we find a candidate whose ratio() is >= the next quick_ratio().
398 # This works because quick_ratio() is guaranteed to be an upper bound on
399 # ratio().
400 candidates = []
401 for candidate in requests:
402 matcher.set_seq1(candidate.cmp_seq)
403 candidates.append((matcher.quick_ratio(), candidate))
404
405 candidates.sort(reverse=True, key=lambda c: c[0])
406
407 best_match = (0, None)
408 for i in xrange(len(candidates)):
409 matcher.set_seq1(candidates[i][1].cmp_seq)
410 best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
411 if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
412 break
413 return best_match[1]
414
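# A standalone illustration of the pruning trick described above, with
# hypothetical request strings: because quick_ratio() never underestimates
# ratio(), the loop can stop once the best exact ratio() seen so far is at
# least the next candidate's quick_ratio().
import difflib

target = 'GET example.com/index?aabc'
candidates = ['GET example.com/index?aaaa',
              'GET example.com/index?aabb',
              'GET example.com/other']
matcher = difflib.SequenceMatcher(b=target)

scored = []
for candidate in candidates:
  matcher.set_seq1(candidate)
  scored.append((matcher.quick_ratio(), candidate))
scored.sort(reverse=True)

best_match = (0, None)
for i, (quick_score, candidate) in enumerate(scored):
  matcher.set_seq1(candidate)
  best_match = max(best_match, (matcher.ratio(), candidate))
  if i + 1 < len(scored) and best_match[0] >= scored[i + 1][0]:
    break  # no remaining candidate can have a higher ratio()
print best_match[1]  # the candidate closest to the target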
415 def diff(self, request):
416 """Diff the given request to the closest matching request in the archive.
417
418 Args:
419 request: an ArchivedHttpRequest
420 Returns:
421 If a close match is found, return a textual diff between the requests.
422 Otherwise, return None.
423 """
424 request_lines = request.formatted_request.split('\n')
425 closest_request = self.find_closest_request(request)
426 if closest_request:
427 closest_request_lines = closest_request.formatted_request.split('\n')
428 return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
429 return None
430
431 def get_server_cert(self, host):
432 """Gets the certificate from the server and stores it in the archive."""
433 request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
434 if request not in self:
435 self[request] = create_response(200, body=certutils.get_host_cert(host))
436 return self[request].response_data[0]
437
438 def get_certificate(self, host):
439 request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
440 if request not in self:
441 self[request] = create_response(200, body=self._generate_cert(host))
442 return self[request].response_data[0]
443
444 @classmethod
445 def AssertWritable(cls, filename):
446 """Raises an IOError if filename is not writable."""
447 persist_dir = os.path.dirname(os.path.abspath(filename))
448 if not os.path.exists(persist_dir):
449 raise IOError('Directory does not exist: %s' % persist_dir)
450 if os.path.exists(filename):
451 if not os.access(filename, os.W_OK):
452 raise IOError('Need write permission on file: %s' % filename)
453 elif not os.access(persist_dir, os.W_OK):
454 raise IOError('Need write permission on directory: %s' % persist_dir)
455
456 @classmethod
457 def Load(cls, filename):
458 """Load an instance from filename."""
459 return cPickle.load(open(filename, 'rb'))
460
461 def Persist(self, filename):
462 """Persist all state to filename."""
463 try:
464 original_checkinterval = sys.getcheckinterval()
465 sys.setcheckinterval(2**31-1) # Lock out other threads so nothing can
466 # modify |self| during pickling.
467 pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
468 finally:
469 sys.setcheckinterval(original_checkinterval)
470 with open(filename, 'wb') as f:
471 f.write(pickled_self)
472
473
474 class ArchivedHttpRequest(object):
475 """Record all the state that goes into a request.
476
477 ArchivedHttpRequest instances are considered immutable so they can
478 serve as keys for HttpArchive instances.
479 (The immutability is not enforced.)
480
481 Upon creation, the headers are "trimmed" (i.e. edited or dropped)
482 and saved to self.trimmed_headers to allow requests to match in a wider
483 variety of playback situations (e.g. using different user agents).
484
485 For unpickling, 'trimmed_headers' is recreated from 'headers'. That
486 allows for changes to the trim function and can help with debugging.
487 """
488 CONDITIONAL_HEADERS = [
489 'if-none-match', 'if-match',
490 'if-modified-since', 'if-unmodified-since']
491
492 def __init__(self, command, host, full_path, request_body, headers,
493 is_ssl=False):
494 """Initialize an ArchivedHttpRequest.
495
496 Args:
497 command: a string (e.g. 'GET' or 'POST').
498 host: a host name (e.g. 'www.google.com').
499 full_path: a request path. Includes everything after the host & port in
500 the URL (e.g. '/search?q=dogs').
501 request_body: a request body string for a POST or None.
502 headers: {key: value, ...} where key and value are strings.
503 is_ssl: a boolean which is True iff the request is made via SSL.
504 """
505 self.command = command
506 self.host = host
507 self.full_path = full_path
508 parsed_url = urlparse.urlparse(full_path) if full_path else None
509 self.path = parsed_url.path if parsed_url else None
510 self.request_body = request_body
511 self.headers = headers
512 self.is_ssl = is_ssl
513 self.trimmed_headers = self._TrimHeaders(headers)
514 self.formatted_request = self._GetFormattedRequest()
515 self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)
516
517 def __str__(self):
518 scheme = 'https' if self.is_ssl else 'http'
519 return '%s %s://%s%s %s' % (
520 self.command, scheme, self.host, self.full_path, self.trimmed_headers)
521
522 def __repr__(self):
523 return repr((self.command, self.host, self.full_path, self.request_body,
524 self.trimmed_headers, self.is_ssl))
525
526 def __hash__(self):
527 """Return an integer hash to use for hashed collections including dict."""
528 return hash(repr(self))
529
530 def __eq__(self, other):
531 """Define the __eq__ method to match the hash behavior."""
532 return repr(self) == repr(other)
533
534 def __setstate__(self, state):
535 """Influence how to unpickle.
536
537 "headers" are the original request headers.
538 "trimmed_headers" are the trimmed headers used for matching requests
539 during replay.
540
541 Args:
542 state: a dictionary for __dict__
543 """
544 if 'full_headers' in state:
545 # Fix older version of archive.
546 state['headers'] = state['full_headers']
547 del state['full_headers']
548 if 'headers' not in state:
549 raise HttpArchiveException(
550 'Archived HTTP request is missing "headers". The HTTP archive is'
551 ' likely from a previous version and must be re-recorded.')
552 if 'path' in state:
553 # before, 'path' and 'path_without_query' were used and 'path' was
554 # pickled. Now, 'path' has been renamed to 'full_path' and
555 # 'path_without_query' has been renamed to 'path'. 'full_path' is
556 # pickled, but 'path' is not. If we see 'path' here it means we are
557 # dealing with an older archive.
558 state['full_path'] = state['path']
559 del state['path']
560 state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
561 if 'is_ssl' not in state:
562 state['is_ssl'] = False
563 self.__dict__.update(state)
564 parsed_url = urlparse.urlparse(self.full_path)
565 self.path = parsed_url.path
566 self.formatted_request = self._GetFormattedRequest()
567 self.cmp_seq = self._GetCmpSeq(parsed_url.query)
568
569 def __getstate__(self):
570 """Influence how to pickle.
571
572 Returns:
573 a dict to use for pickling
574 """
575 state = self.__dict__.copy()
576 del state['trimmed_headers']
577 del state['path']
578 del state['formatted_request']
579 del state['cmp_seq']
580 return state
581
582 def _GetFormattedRequest(self):
583 """Format request to make diffs easier to read.
584
585 Returns:
586 A string consisting of the request. Example:
587 'GET www.example.com/path\nHeader-Key: header value\n'
588 """
589 parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
590 if self.request_body:
591 parts.append('%s\n' % self.request_body)
592 for k, v in self.trimmed_headers:
593 k = '-'.join(x.capitalize() for x in k.split('-'))
594 parts.append('%s: %s\n' % (k, v))
595 return ''.join(parts)
596
597 def _GetCmpSeq(self, query=None):
598 """Compute a sequence out of query and header for difflib to compare.
599 For example:
600 [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
601 will be returned for a request with URL:
602 http://example.com/index.html?q1=a1&q2=a2
603 and header:
604 k1: v1
605 k2: v2
606
607 Args:
608 query: the query string in the URL.
609
610 Returns:
611 A sequence for difflib to compare.
612 """
613 if not query:
614 return self.trimmed_headers
615 return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers
616
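# A short demonstration of the comparison sequence described above, using the
# stdlib urlparse module; the query string and trimmed headers are hypothetical.
import urlparse

query = 'q1=a1&q2=a2'
trimmed_headers = [('k1', 'v1'), ('k2', 'v2')]
print sorted(urlparse.parse_qsl(query)) + trimmed_headers
# [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]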
617 def matches(self, command=None, host=None, full_path=None, is_ssl=None,
618 use_query=True):
619 """Returns true iff the request matches all parameters.
620
621 Args:
622 command: a string (e.g. 'GET' or 'POST').
623 host: a host name (e.g. 'www.google.com').
624 full_path: a request path with query string (e.g. '/search?q=dogs')
625 is_ssl: whether the request is secure.
626 use_query:
627 If use_query is True, request matching uses both the hierarchical path
628 and query string component.
629 If use_query is False, request matching only uses the hierarchical path
630
631 e.g. req1 = GET www.test.com/index?aaaa
632 req2 = GET www.test.com/index?bbbb
633
634 If use_query is True, req1.matches(req2) evaluates to False
635 If use_query is False, req1.matches(req2) evaluates to True
636
637 Returns:
638 True iff the request matches all parameters
639 """
640 if command is not None and command != self.command:
641 return False
642 if is_ssl is not None and is_ssl != self.is_ssl:
643 return False
644 if host is not None and host != self.host:
645 return False
646 if full_path is None:
647 return True
648 if use_query:
649 return full_path == self.full_path
650 else:
651 return self.path == urlparse.urlparse(full_path).path
652
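# How the use_query distinction above behaves for two hypothetical paths:
# comparing full paths distinguishes the query strings, while comparing only
# the parsed .path component (use_query=False) treats them as equal.
import urlparse

full_path_1 = '/index?aaaa'
full_path_2 = '/index?bbbb'
print full_path_1 == full_path_2                      # False
print (urlparse.urlparse(full_path_1).path ==
       urlparse.urlparse(full_path_2).path)           # True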
653 @classmethod
654 def _TrimHeaders(cls, headers):
655 """Removes headers that are known to cause problems during replay.
656
657 These headers are removed for the following reasons:
658 - accept: Causes problems with www.bing.com. During record, CSS is fetched
659 with *. During replay, it's text/css.
660 - accept-charset, accept-language, referer: vary between clients.
661 - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
662 - connection, method, scheme, url, version: Cause problems with spdy.
663 - cookie: Extremely sensitive to request/response order.
664 - keep-alive: Doesn't affect the content of the request, only some
665 transient state of the transport layer.
666 - user-agent: Changes with every Chrome version.
667 - proxy-connection: Sent for proxy requests.
668 - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
669 Google to collect statistics about Chrome's enabled features.
670
671 Another variant to consider is dropping only the value from the header.
672 However, this is particularly bad for the cookie header, because the
673 presence of the cookie depends on the responses we've seen when the request
674 is made.
675
676 Args:
677 headers: {header_key: header_value, ...}
678
679 Returns:
680 [(header_key, header_value), ...] # (with undesirable headers removed)
681 """
682 # TODO(tonyg): Strip sdch from the request headers because we can't
683 # guarantee that the dictionary will be recorded, so replay may not work.
684 if 'accept-encoding' in headers:
685 accept_encoding = headers['accept-encoding']
686 accept_encoding = accept_encoding.replace('sdch', '')
687 # Strip lzma so Opera's requests match archives recorded using Chrome.
688 accept_encoding = accept_encoding.replace('lzma', '')
689 stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
690 accept_encoding = ','.join(filter(bool, stripped_encodings))
691 headers['accept-encoding'] = accept_encoding
692 undesirable_keys = [
693 'accept', 'accept-charset', 'accept-language', 'cache-control',
694 'connection', 'cookie', 'keep-alive', 'method',
695 'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
696 'x-chrome-variations', 'x-client-data']
697 return sorted([(k, v) for k, v in headers.items()
698 if k.lower() not in undesirable_keys])
699
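# The trimming above applied by hand to a hypothetical header dict: volatile
# headers are dropped, 'sdch' is stripped from accept-encoding, and the
# remainder comes back as a sorted list of (key, value) pairs.
undesirable_keys = ['accept', 'cookie', 'referer', 'user-agent']
headers = {
    'accept-encoding': 'gzip, sdch',
    'user-agent': 'Mozilla/5.0 (hypothetical)',
    'host': 'www.example.com',
}
stripped = [e.strip()
            for e in headers['accept-encoding'].replace('sdch', '').split(',')]
headers['accept-encoding'] = ','.join(filter(bool, stripped))
print sorted((k, v) for k, v in headers.items()
             if k.lower() not in undesirable_keys)
# [('accept-encoding', 'gzip'), ('host', 'www.example.com')]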
700 def is_conditional(self):
701 """Return True if the request contains any conditional headers."""
702 for header in self.CONDITIONAL_HEADERS:
703 if header in self.headers:
704 return True
705 return False
706
707 def create_request_without_conditions(self):
708 stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
709 if k.lower() not in self.CONDITIONAL_HEADERS)
710 return ArchivedHttpRequest(
711 self.command, self.host, self.full_path, self.request_body,
712 stripped_headers, self.is_ssl)
713
714 class ArchivedHttpResponse(object):
715 """All the data needed to recreate an HTTP response."""
716
717 # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
718 # It is not sent in responses. It is added by get_data_as_text()
719 # and removed by set_data().
720 CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'
721
722 # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
723 DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
724 'Delays are above. Response content is below.]\n')
725
726 def __init__(self, version, status, reason, headers, response_data,
727 delays=None):
728 """Initialize an ArchivedHttpResponse.
729
730 Args:
731 version: HTTP protocol version used by server.
732 10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
733 status: Status code returned by server (e.g. 200).
734 reason: Reason phrase returned by server (e.g. "OK").
735 headers: list of (header, value) tuples.
736 response_data: list of content chunks.
737 Concatenating the chunks gives the complete contents
738 (i.e. the chunks do not have any lengths or delimiters).
739 Do not include the final, zero-length chunk that marks the end.
740 delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
741 e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
742 connect - The time to connect to the server.
743 Each resource has a value because Replay's record mode captures it.
744 This includes the time for the SYN and SYN/ACK (1 rtt).
745 headers - The time elapsed between the TCP connect and the headers.
746 This typically includes all the server-time to generate a response.
747 data - If the response is chunked, these are the times for each chunk.
748 """
749 self.version = version
750 self.status = status
751 self.reason = reason
752 self.headers = headers
753 self.response_data = response_data
754 self.delays = delays
755 self.fix_delays()
756
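# The delay bookkeeping that fix_delays() (below) enforces, shown for a
# hypothetical three-chunk response: one 'data' delay (in ms) per chunk.
response_data = ['<html>', '<body>hello</body>', '</html>']
delays = {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
assert len(delays['data']) == len(response_data)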
757 def fix_delays(self):
758 """Initialize delays, or check the number of data delays."""
759 expected_num_delays = len(self.response_data)
760 if not self.delays:
761 self.delays = {
762 'connect': 0,
763 'headers': 0,
764 'data': [0] * expected_num_delays
765 }
766 else:
767 num_delays = len(self.delays['data'])
768 if num_delays != expected_num_delays:
769 raise HttpArchiveException(
770 'Server delay length mismatch: %d (expected %d): %s' %
771 (num_delays, expected_num_delays, self.delays['data']))
772
773 def __repr__(self):
774 return repr((self.version, self.status, self.reason, sorted(self.headers),
775 self.response_data))
776
777 def __hash__(self):
778 """Return an integer hash to use for hashed collections including dict."""
779 return hash(repr(self))
780
781 def __eq__(self, other):
782 """Define the __eq__ method to match the hash behavior."""
783 return repr(self) == repr(other)
784
785 def __setstate__(self, state):
786 """Influence how to unpickle.
787
788 Args:
789 state: a dictionary for __dict__
790 """
791 if 'server_delays' in state:
792 state['delays'] = {
793 'connect': 0,
794 'headers': 0,
795 'data': state['server_delays']
796 }
797 del state['server_delays']
798 elif 'delays' not in state:
799 state['delays'] = None
800 self.__dict__.update(state)
801 self.fix_delays()
802
803 def get_header(self, key, default=None):
804 for k, v in self.headers:
805 if key.lower() == k.lower():
806 return v
807 return default
808
809 def set_header(self, key, value):
810 for i, (k, v) in enumerate(self.headers):
811 if key == k:
812 self.headers[i] = (key, value)
813 return
814 self.headers.append((key, value))
815
816 def remove_header(self, key):
817 for i, (k, v) in enumerate(self.headers):
818 if key.lower() == k.lower():
819 self.headers.pop(i)
820 return
821
822 @staticmethod
823 def _get_epoch_seconds(date_str):
824 """Return the epoch seconds of a date header.
825
826 Args:
827 date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
828 Returns:
829 epoch seconds as a float
830 """
831 date_tuple = email.utils.parsedate(date_str)
832 if date_tuple:
833 return calendar.timegm(date_tuple)
834 return None
835
836 def update_date(self, date_str, now=None):
837 """Return an updated date based on its delta from the "Date" header.
838
839 For example, if |date_str| is one week later than the "Date" header,
840 then the returned date string is one week later than the current date.
841
842 Args:
843 date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
844 Returns:
845 a date string
846 """
847 date_seconds = self._get_epoch_seconds(self.get_header('date'))
848 header_seconds = self._get_epoch_seconds(date_str)
849 if date_seconds and header_seconds:
850 updated_seconds = header_seconds + (now or time.time()) - date_seconds
851 return email.utils.formatdate(updated_seconds, usegmt=True)
852 return date_str
853
854 def is_gzip(self):
855 return self.get_header('content-encoding') == 'gzip'
856
857 def is_compressed(self):
858 return self.get_header('content-encoding') in ('gzip', 'deflate')
859
860 def is_chunked(self):
861 return self.get_header('transfer-encoding') == 'chunked'
862
863 def get_data_as_text(self):
864 """Return content as a single string.
865
866 Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
867 """
868 content_type = self.get_header('content-type')
869 if (not content_type or
870 not (content_type.startswith('text/') or
871 content_type == 'application/x-javascript' or
872 content_type.startswith('application/json'))):
873 return None
874 if self.is_compressed():
875 uncompressed_chunks = httpzlib.uncompress_chunks(
876 self.response_data, self.is_gzip())
877 else:
878 uncompressed_chunks = self.response_data
879 return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)
880
881 def get_delays_as_text(self):
882 """Return delays as editable text."""
883 return json.dumps(self.delays, indent=2)
884
885 def get_response_as_text(self):
886 """Returns response content as a single string.
887
888 Server delays are separated on a per-chunk basis. Delays are in milliseconds.
889 Response content begins after DELAY_EDIT_SEPARATOR.
890 """
891 data = self.get_data_as_text()
892 if data is None:
893 logging.warning('Data can not be represented as text.')
894 data = ''
895 delays = self.get_delays_as_text()
896 return self.DELAY_EDIT_SEPARATOR.join((delays, data))
897
898 def set_data(self, text):
899 """Inverse of get_data_as_text().
900
901 Split on CHUNK_EDIT_SEPARATOR and compress if needed.
902 """
903 text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
904 if self.is_compressed():
905 self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
906 else:
907 self.response_data = text_chunks
908 if not self.is_chunked():
909 content_length = sum(len(c) for c in self.response_data)
910 self.set_header('content-length', str(content_length))
911
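# Round trip of the chunk separator described above, for an uncompressed text
# body with hypothetical chunks: get_data_as_text() joins the chunks with the
# separator and set_data() splits them back apart.
SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'
chunks = ['<html><body>', 'hello', '</body></html>']
text = SEPARATOR.join(chunks)
assert text.split(SEPARATOR) == chunks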
912 def set_delays(self, delays_text):
913 """Inverse of get_delays_as_text().
914
915 Args:
916 delays_text: JSON encoded text such as the following:
917 {
918 "connect": 80,
919 "headers": 80,
920 "data": [6, 55, 0]
921 }
922 Times are in milliseconds.
923 Each data delay corresponds with one response_data value.
924 """
925 try:
926 self.delays = json.loads(delays_text)
927 except (ValueError, KeyError) as e:
928 logging.critical('Unable to parse delays %s: %s', delays_text, e)
929 self.fix_delays()
930
931 def set_response_from_text(self, text):
932 """Inverse of get_response_as_text().
933
934 Modifies the state of the archive according to the textual representation.
935 """
936 try:
937 delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
938 except ValueError:
939 logging.critical(
940 'Error parsing text representation. Skipping edits.')
941 return
942 self.set_delays(delays)
943 self.set_data(data)
944
945
946 def create_response(status, reason=None, headers=None, body=None):
947 """Convenience method for creating simple ArchivedHttpResponse objects."""
948 if reason is None:
949 reason = httplib.responses.get(status, 'Unknown')
950 if headers is None:
951 headers = [('content-type', 'text/plain')]
952 if body is None:
953 body = "%s %s" % (status, reason)
954 return ArchivedHttpResponse(11, status, reason, headers, [body])
955
956
957 def main():
958 class PlainHelpFormatter(optparse.IndentedHelpFormatter):
959 def format_description(self, description):
960 if description:
961 return description + '\n'
962 else:
963 return ''
964
965 option_parser = optparse.OptionParser(
966 usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
967 formatter=PlainHelpFormatter(),
968 description=__doc__,
969 epilog='http://code.google.com/p/web-page-replay/')
970
971 option_parser.add_option('-c', '--command', default=None,
972 action='store',
973 type='string',
974 help='Only show URLs matching this command.')
975 option_parser.add_option('-o', '--host', default=None,
976 action='store',
977 type='string',
978 help='Only show URLs matching this host.')
979 option_parser.add_option('-p', '--full_path', default=None,
980 action='store',
981 type='string',
982 help='Only show URLs matching this full path.')
983 option_parser.add_option('-f', '--merged_file', default=None,
984 action='store',
985 type='string',
986 help='The output file to use when using the merge command.')
987
988 options, args = option_parser.parse_args()
989
990 # Merge command expects an unlimited number of archives.
991 if len(args) < 2:
992 print 'args: %s' % args
993 option_parser.error('Must specify a command and replay_file')
994
995 command = args[0]
996 replay_file = args[1]
997
998 if not os.path.exists(replay_file):
999 option_parser.error('Replay file "%s" does not exist' % replay_file)
1000
1001 http_archive = HttpArchive.Load(replay_file)
1002 if command == 'ls':
1003 print http_archive.ls(options.command, options.host, options.full_path)
1004 elif command == 'cat':
1005 print http_archive.cat(options.command, options.host, options.full_path)
1006 elif command == 'stats':
1007 print http_archive.stats(options.command, options.host, options.full_path)
1008 elif command == 'merge':
1009 if not options.merged_file:
1010 print 'Error: Must specify a merged file name (use --merged_file)'
1011 return
1012 http_archive.merge(options.merged_file, args[2:])
1013 elif command == 'edit':
1014 http_archive.edit(options.command, options.host, options.full_path)
1015 http_archive.Persist(replay_file)
1016 else:
1017 option_parser.error('Unknown command "%s"' % command)
1018 return 0
1019
1020
1021 if __name__ == '__main__':
1022 sys.exit(main())
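# A hedged sketch of driving this module programmatically instead of via the
# command line above. It assumes the sibling webpagereplay modules that
# httparchive imports (certutils, httpzlib, ...) are importable and that an
# 'archive.wpr' file exists.
import httparchive

archive = httparchive.HttpArchive.Load('archive.wpr')
print 'Archived responses: %d' % len(archive)
for request in archive.get_requests(host='www.example.com'):
  response = archive[request]
  print request.command, request.full_path, response.status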
