#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

| 16 """View and edit HTTP Archives. | |
| 17 | |
| 18 To list all URLs in an archive: | |
| 19 $ ./httparchive.py ls archive.wpr | |
| 20 | |
| 21 To view the content of all URLs from example.com: | |
| 22 $ ./httparchive.py cat --host example.com archive.wpr | |
| 23 | |
| 24 To view the content of a particular URL: | |
| 25 $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr | |
| 26 | |
| 27 To view the content of all URLs: | |
| 28 $ ./httparchive.py cat archive.wpr | |
| 29 | |
| 30 To edit a particular URL: | |
| 31 $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr | |
| 32 | |
| 33 To print statistics of an archive: | |
| 34 $ ./httparchive.py stats archive.wpr | |
| 35 | |
| 36 To print statistics of a set of URLs: | |
| 37 $ ./httparchive.py stats --host www.example.com archive.wpr | |
| 38 | |
| 39 To merge multiple archives | |
| 40 $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ... | |
| 41 """ | |

import calendar
import cPickle
import difflib
import email.utils
import httplib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict

import certutils
import httpzlib


def LogRunTime(fn):
  """Decorator that logs the run time of the function."""
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped
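
# Hypothetical usage sketch (not from the original source): LogRunTime is
# written for instance methods -- note the explicit 'self' in wrapped():
#
#   class Fetcher(object):
#     @LogRunTime
#     def fetch(self, url):
#       ...
#
# Each call to fetch() then logs, e.g., 'fetch: 12ms' at debug level, even
# when the method raises.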


class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must
        remain in sync with the underlying dict of self. It is used as an
        optimization so that get_requests() doesn't have to linearly search
        all requests in the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the default ArchivedHttpResponse to return if no response
          matches the original request with the conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200 (OK), 304 (not modified),
      or 412 (precondition failed)
    """
    response = default
    if request.is_conditional():
      stripped_request = request.create_request_without_conditions()
      if stripped_request in self:
        response = self[stripped_request]
        if response.status == 200:
          status = self.get_conditional_status(request, response)
          if status != 200:
            response = create_response(status)
    return response

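  # Sketch of the flow above, with hypothetical values: replaying
  #   GET /logo.png with 'if-none-match: "abc123"'
  # first looks up the archived 200 response under the request with its
  # conditional headers stripped, then compares entity tags and dates below;
  # a match yields a synthesized 304 from create_response() instead of the
  # full archived body.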
  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
    """Determines whether the entity tags of the request/response match.

    Args:
      request_etag: the value string of the "if-(none)-match:"
                    portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False
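
  # For example (hypothetical tag values):
  #   is_etag_match('"a", "b"', '"b"')  -> True   (a listed tag matches)
  #   is_etag_match('*', '"anything"')  -> True   ('*' matches any etag)
  #   is_etag_match('"a"', '"b"')       -> False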

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Untrimmed response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.original_headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                             if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
    """Merge multiple archives into merged_archive by 'chaining' resources.

    Only resources that are not yet part of the accumulated archive are added.
    """
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note: we have already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    with open(tmp_file.name) as f:
      response.set_response_from_text(f.read())
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, the closest matching request's path component must
          match. (Note: this refers to the 'path' component within the URL,
          not the 'full path' which includes the query string component.)

          If use_path=True, the candidate will NOT match in the example below,
          e.g. request   = GET www.test.com/a?p=1
               candidate = GET www.test.com/b?p=1

          Even if use_path=False, URLs with the same path are always favored.
          For example, candidate1 is considered a better match than candidate2.
            request    = GET www.test.com/a?p=1&q=2&r=3
            candidate1 = GET www.test.com/a?s=4
            candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with the strictest constraints. This trims the search space
    # considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax the query constraint if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax the full_path constraint if there is still no match and
    # use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop
    # through until we find a candidate whose ratio() is >= the next
    # quick_ratio(). This works because quick_ratio() is guaranteed to be an
    # upper bound on ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i + 1][0]:
        break
    return best_match[1]

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

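  # The two methods below cache certificates in the archive itself, keyed by
  # pseudo-requests whose "command" is SERVER_CERT or DUMMY_CERT rather than
  # a real HTTP method.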
  def get_server_cert(self, host):
    """Gets the certificate from the server and stores it in the archive."""
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31 - 1)  # Lock out other threads so nothing can
                                       # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
    """
    self.command = command
    self.host = host
    self.full_path = full_path
    parsed_url = urlparse.urlparse(full_path) if full_path else None
    self.path = parsed_url.path if parsed_url else None
    self.request_body = request_body
    self.headers = headers
    self.is_ssl = is_ssl
    self.trimmed_headers = self._TrimHeaders(headers)
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)

  def __str__(self):
    scheme = 'https' if self.is_ssl else 'http'
    return '%s %s://%s%s %s' % (
        self.command, scheme, self.host, self.full_path, self.trimmed_headers)

  def __repr__(self):
    return repr((self.command, self.host, self.full_path, self.request_body,
                 self.trimmed_headers, self.is_ssl))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # Before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format the request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and headers for difflib to compare.

    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a1&q2=a2
    and headers:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns True iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
          If use_query is True, request matching uses both the hierarchical
          path and the query string component.
          If use_query is False, request matching only uses the hierarchical
          path.

          e.g. req1 = GET www.test.com/index?aaaa
               req2 = GET www.test.com/index?bbbb

          If use_query is True, req1.matches(req2) evaluates to False.
          If use_query is False, req1.matches(req2) evaluates to True.

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with '*'. During replay, it's 'text/css'.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used
        by Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the
    request is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent',
        'proxy-connection', 'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])
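
  # For example (hypothetical input), _TrimHeaders on
  #   {'host': 'www.example.com', 'user-agent': 'Mozilla/5.0',
  #    'accept-encoding': 'gzip,sdch'}
  # drops 'user-agent', strips 'sdch' from 'accept-encoding', and returns the
  # sorted list [('accept-encoding', 'gzip'), ('host', 'www.example.com')].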

  def is_conditional(self):
    """Return True if the request has any conditional headers."""
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
  """All the data needed to recreate an HTTP response.

  Upon creation, the headers are "trimmed" (i.e. edited or dropped).
  The original headers are saved to self.original_headers, while the
  trimmed ones are used to allow responses to match in a wider variety
  of playback situations.

  For pickling, 'original_headers' are stored in the archive. For unpickling,
  the headers are trimmed again. That allows for changes to the trim
  function and can help with debugging.
  """

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
              Each resource has a value because Replay's record mode
              captures it. This includes the time for the SYN and SYN/ACK
              (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
              This typically includes all the server-time to generate a
              response.
          data - If the response is chunked, these are the times for each
              chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.original_headers = headers
    self.headers = self._TrimHeaders(headers)
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - content-security-policy: Causes problems with script injection.
    """
    undesirable_keys = ['content-security-policy']
    return [(k, v) for k, v in headers if k.lower() not in undesirable_keys]

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
    """Return an integer hash to use for hashed collections including dict."""
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "original_headers" are the original response headers.
    "headers" are the trimmed headers used for replaying responses.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    state['original_headers'] = state['headers']
    state['headers'] = self._TrimHeaders(state['original_headers'])
    self.__dict__.update(state)
    self.fix_delays()

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    state['headers'] = state['original_headers']
    del state['original_headers']
    return state

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, _) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, _) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float, or None if the date cannot be parsed
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: epoch seconds to use as the current time (defaults to time.time()).
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_chunks(self):
    """Return content as a list of strings, each corresponding to a chunk.

    Uncompresses the chunks, if needed. Returns None for non-text content.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      return httpzlib.uncompress_chunks(self.response_data, self.is_gzip())
    else:
      return self.response_data

  def get_data_as_text(self):
    """Return content as a single string, or None if it is not text.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    chunks = self.get_data_as_chunks()
    if chunks is None:
      return None
    return self.CHUNK_EDIT_SEPARATOR.join(chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data cannot be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))
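
  # The editable text produced above looks like (values hypothetical):
  #   {
  #     "connect": 50,
  #     "headers": 150,
  #     "data": [0, 10]
  #   }
  #   [WEB_PAGE_REPLAY_EDIT_ARCHIVE --- Delays are above. Response content is below.]
  #   <chunk 1>[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]<chunk 2>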

  def set_data_from_chunks(self, text_chunks):
    """Inverse of get_data_as_chunks().

    Compresses, if needed.
    """
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks,
                                                    self.is_gzip())
    else:
      self.response_data = text_chunks
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Splits on CHUNK_EDIT_SEPARATOR and compresses if needed.
    """
    self.set_data_from_chunks(text.split(self.CHUNK_EDIT_SEPARATOR))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON-encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = '%s %s' % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])
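
# For example, create_response(404) yields an ArchivedHttpResponse with
# reason 'Not Found' (looked up in httplib.responses), a text/plain
# content-type header, and the single body chunk '404 Not Found'.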


def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
                           action='store',
                           type='string',
                           help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
                           action='store',
                           type='string',
                           help='The output file to use when using the merge '
                                'command.')

  options, args = option_parser.parse_args()

  # The merge command accepts an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())