| Index: tools/telemetry/third_party/webpagereplay/httparchive.py
|
| diff --git a/tools/telemetry/third_party/webpagereplay/httparchive.py b/tools/telemetry/third_party/webpagereplay/httparchive.py
|
| deleted file mode 100755
|
| index 388fc663cc3d8d3e31340b84b921ad5188145e90..0000000000000000000000000000000000000000
|
| --- a/tools/telemetry/third_party/webpagereplay/httparchive.py
|
| +++ /dev/null
|
| @@ -1,1022 +0,0 @@
|
| -#!/usr/bin/env python
|
| -# Copyright 2010 Google Inc. All Rights Reserved.
|
| -#
|
| -# Licensed under the Apache License, Version 2.0 (the "License");
|
| -# you may not use this file except in compliance with the License.
|
| -# You may obtain a copy of the License at
|
| -#
|
| -# http://www.apache.org/licenses/LICENSE-2.0
|
| -#
|
| -# Unless required by applicable law or agreed to in writing, software
|
| -# distributed under the License is distributed on an "AS IS" BASIS,
|
| -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| -# See the License for the specific language governing permissions and
|
| -# limitations under the License.
|
| -
|
| -"""View and edit HTTP Archives.
|
| -
|
| -To list all URLs in an archive:
|
| - $ ./httparchive.py ls archive.wpr
|
| -
|
| -To view the content of all URLs from example.com:
|
| - $ ./httparchive.py cat --host example.com archive.wpr
|
| -
|
| -To view the content of a particular URL:
|
| - $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr
|
| -
|
| -To view the content of all URLs:
|
| - $ ./httparchive.py cat archive.wpr
|
| -
|
| -To edit a particular URL:
|
| - $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr
|
| -
|
| -To print statistics of an archive:
|
| - $ ./httparchive.py stats archive.wpr
|
| -
|
| -To print statistics of a set of URLs:
|
| - $ ./httparchive.py stats --host www.example.com archive.wpr
|
| -
|
| -To merge multiple archives
|
| - $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
|
| -"""
|
| -
|
| -import calendar
|
| -import certutils
|
| -import cPickle
|
| -import difflib
|
| -import email.utils
|
| -import httplib
|
| -import httpzlib
|
| -import json
|
| -import logging
|
| -import optparse
|
| -import os
|
| -import StringIO
|
| -import subprocess
|
| -import sys
|
| -import tempfile
|
| -import time
|
| -import urlparse
|
| -from collections import defaultdict
|
| -
|
| -
|
| -
|
| -def LogRunTime(fn):
|
| - """Annotation which logs the run time of the function."""
|
| - def wrapped(self, *args, **kwargs):
|
| - start_time = time.time()
|
| - try:
|
| - return fn(self, *args, **kwargs)
|
| - finally:
|
| - run_time = (time.time() - start_time) * 1000.0
|
| - logging.debug('%s: %dms', fn.__name__, run_time)
|
| - return wrapped
|
| -
|
| -
|
| -class HttpArchiveException(Exception):
|
| - """Base class for all exceptions in httparchive."""
|
| - pass
|
| -
|
| -
|
| -class HttpArchive(dict):
|
| - """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.
|
| -
|
| - Attributes:
|
| - responses_by_host: dict of {hostname: {request: response}}. This must remain
|
| - in sync with the underlying dict of self. It is used as an optimization
|
| - so that get_requests() doesn't have to linearly search all requests in
|
| - the archive to find potential matches.
|
| - """
|
| -
|
| - def __init__(self): # pylint: disable=super-init-not-called
|
| - self.responses_by_host = defaultdict(dict)
|
| -
|
| - def __setstate__(self, state):
|
| - """Influence how to unpickle.
|
| -
|
| - Args:
|
| - state: a dictionary for __dict__
|
| - """
|
| - self.__dict__.update(state)
|
| - self.responses_by_host = defaultdict(dict)
|
| - for request in self:
|
| - self.responses_by_host[request.host][request] = self[request]
|
| -
|
| - def __getstate__(self):
|
| - """Influence how to pickle.
|
| -
|
| - Returns:
|
| - a dict to use for pickling
|
| - """
|
| - state = self.__dict__.copy()
|
| - del state['responses_by_host']
|
| - return state
|
| -
|
| - def __setitem__(self, key, value):
|
| - super(HttpArchive, self).__setitem__(key, value)
|
| - if hasattr(self, 'responses_by_host'):
|
| - self.responses_by_host[key.host][key] = value
|
| -
|
| - def __delitem__(self, key):
|
| - super(HttpArchive, self).__delitem__(key)
|
| - del self.responses_by_host[key.host][key]
|
| -
|
| - def get(self, request, default=None):
|
| - """Return the archived response for a given request.
|
| -
|
| - Does extra checking for handling some HTTP request headers.
|
| -
|
| - Args:
|
| - request: instance of ArchivedHttpRequest
|
| - default: default value to return if request is not found
|
| -
|
| - Returns:
|
| - Instance of ArchivedHttpResponse or default if no matching
|
| - response is found
|
| - """
|
| - if request in self:
|
| - return self[request]
|
| - return self.get_conditional_response(request, default)
|
| -
|
| - def get_conditional_response(self, request, default):
|
| - """Get the response based on the conditional HTTP request headers.
|
| -
|
| - Args:
|
| - request: an ArchivedHttpRequest representing the original request.
|
| - default: the default ArchivedHttpResponse, returned when there is no match
|
| - for the original request with its conditional headers removed.
|
| -
|
| - Returns:
|
| - an ArchivedHttpResponse with a status of 200, 304 (not modified), or
|
| - 412 (precondition failed)
|
| - """
|
| - response = default
|
| - if request.is_conditional():
|
| - stripped_request = request.create_request_without_conditions()
|
| - if stripped_request in self:
|
| - response = self[stripped_request]
|
| - if response.status == 200:
|
| - status = self.get_conditional_status(request, response)
|
| - if status != 200:
|
| - response = create_response(status)
|
| - return response
|
| -
|
| - def get_conditional_status(self, request, response):
|
| - status = 200
|
| - last_modified = email.utils.parsedate(
|
| - response.update_date(response.get_header('last-modified')))
|
| - response_etag = response.get_header('etag')
|
| - is_get_or_head = request.command.upper() in ('GET', 'HEAD')
|
| -
|
| - match_value = request.headers.get('if-match', None)
|
| - if match_value:
|
| - if self.is_etag_match(match_value, response_etag):
|
| - status = 200
|
| - else:
|
| - status = 412 # precondition failed
|
| - none_match_value = request.headers.get('if-none-match', None)
|
| - if none_match_value:
|
| - if self.is_etag_match(none_match_value, response_etag):
|
| - status = 304
|
| - elif is_get_or_head:
|
| - status = 200
|
| - else:
|
| - status = 412
|
| - if is_get_or_head and last_modified:
|
| - for header in ('if-modified-since', 'if-unmodified-since'):
|
| - date = email.utils.parsedate(request.headers.get(header, None))
|
| - if date:
|
| - if ((header == 'if-modified-since' and last_modified > date) or
|
| - (header == 'if-unmodified-since' and last_modified < date)):
|
| - if status != 412:
|
| - status = 200
|
| - else:
|
| - status = 304 # not modified
|
| - return status
|
| -
|
| - @staticmethod
|
| - def is_etag_match(request_etag, response_etag):
|
| - """Determines whether the entity tags of the request/response matches.
|
| -
|
| - Args:
|
| - request_etag: the value string of the "if-(none)-match:"
|
| - portion of the request header
|
| - response_etag: the etag value of the response
|
| -
|
| - Returns:
|
| - True on match, False otherwise
|
| - """
|
| - response_etag = response_etag.strip('" ')
|
| - for etag in request_etag.split(','):
|
| - etag = etag.strip('" ')
|
| - if etag in ('*', response_etag):
|
| - return True
|
| - return False
|
| -
|
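
The two methods above implement HTTP conditional request handling against archived responses. A minimal standalone sketch of the core ETag decision (Python 3; etag_match and conditional_status are illustrative helper names, and the Last-Modified date checks and some edge-case interactions handled above are omitted):

    # Sketch of the ETag portion of get_conditional_status(), simplified.
    def etag_match(header_value, stored_etag):
        """True if any ETag listed in the header matches the stored one (or '*')."""
        stored_etag = stored_etag.strip('" ')
        return any(tag.strip('" ') in ('*', stored_etag)
                   for tag in header_value.split(','))

    def conditional_status(request_headers, stored_etag, method='GET'):
        status = 200
        if 'if-match' in request_headers:
            if not etag_match(request_headers['if-match'], stored_etag):
                status = 412  # precondition failed
        if 'if-none-match' in request_headers:
            if etag_match(request_headers['if-none-match'], stored_etag):
                status = 304  # not modified
            elif method not in ('GET', 'HEAD'):
                status = 412
        return status

    assert conditional_status({'if-none-match': '"abc"'}, '"abc"') == 304
    assert conditional_status({'if-match': '"xyz"'}, '"abc"') == 412
    assert conditional_status({}, '"abc"') == 200
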
| - def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
|
| - use_query=True):
|
| - """Return a list of requests that match the given args."""
|
| - if host:
|
| - return [r for r in self.responses_by_host[host]
|
| - if r.matches(command, None, full_path, is_ssl,
|
| - use_query=use_query)]
|
| - else:
|
| - return [r for r in self
|
| - if r.matches(command, host, full_path, is_ssl,
|
| - use_query=use_query)]
|
| -
|
| - def ls(self, command=None, host=None, full_path=None):
|
| - """List all URLs that match given params."""
|
| - return ''.join(sorted(
|
| - '%s\n' % r for r in self.get_requests(command, host, full_path)))
|
| -
|
| - def cat(self, command=None, host=None, full_path=None):
|
| - """Print the contents of all URLs that match given params."""
|
| - out = StringIO.StringIO()
|
| - for request in self.get_requests(command, host, full_path):
|
| - print >>out, str(request)
|
| - print >>out, 'Untrimmed request headers:'
|
| - for k in request.headers:
|
| - print >>out, ' %s: %s' % (k, request.headers[k])
|
| - if request.request_body:
|
| - print >>out, request.request_body
|
| - print >>out, '---- Response Info', '-' * 51
|
| - response = self[request]
|
| - chunk_lengths = [len(x) for x in response.response_data]
|
| - print >>out, ('Status: %s\n'
|
| - 'Reason: %s\n'
|
| - 'Headers delay: %s\n'
|
| - 'Response headers:') % (
|
| - response.status, response.reason, response.delays['headers'])
|
| - for k, v in response.headers:
|
| - print >>out, ' %s: %s' % (k, v)
|
| - print >>out, ('Chunk count: %s\n'
|
| - 'Chunk lengths: %s\n'
|
| - 'Chunk delays: %s') % (
|
| - len(chunk_lengths), chunk_lengths, response.delays['data'])
|
| - body = response.get_data_as_text()
|
| - print >>out, '---- Response Data', '-' * 51
|
| - if body:
|
| - print >>out, body
|
| - else:
|
| - print >>out, '[binary data]'
|
| - print >>out, '=' * 70
|
| - return out.getvalue()
|
| -
|
| - def stats(self, command=None, host=None, full_path=None):
|
| - """Print stats about the archive for all URLs that match given params."""
|
| - matching_requests = self.get_requests(command, host, full_path)
|
| - if not matching_requests:
|
| - print 'Failed to find any requests matching given command, host, path.'
|
| - return
|
| -
|
| - out = StringIO.StringIO()
|
| - stats = {
|
| - 'Total': len(matching_requests),
|
| - 'Domains': defaultdict(int),
|
| - 'HTTP_response_code': defaultdict(int),
|
| - 'content_type': defaultdict(int),
|
| - 'Documents': defaultdict(int),
|
| - }
|
| -
|
| - for request in matching_requests:
|
| - stats['Domains'][request.host] += 1
|
| - stats['HTTP_response_code'][self[request].status] += 1
|
| -
|
| - content_type = self[request].get_header('content-type')
|
| - # Remove content type options for readability and higher level groupings.
|
| - str_content_type = str(content_type.split(';')[0]
|
| - if content_type else None)
|
| - stats['content_type'][str_content_type] += 1
|
| -
|
| - # Documents are the main URL requested and not a referenced resource.
|
| - if str_content_type == 'text/html' and not 'referer' in request.headers:
|
| - stats['Documents'][request.host] += 1
|
| -
|
| - print >>out, json.dumps(stats, indent=4)
|
| - return out.getvalue()
|
| -
|
| - def merge(self, merged_archive=None, other_archives=None):
|
| - """Merge multiple archives into merged_archive by 'chaining' resources,
|
| - only resources that are not part of the accumlated archive are added"""
|
| - if not other_archives:
|
| - print 'No archives passed to merge'
|
| - return
|
| -
|
| - # Note we already loaded 'replay_file'.
|
| - print 'Loaded %d responses' % len(self)
|
| -
|
| - for archive in other_archives:
|
| - if not os.path.exists(archive):
|
| - print 'Error: Replay file "%s" does not exist' % archive
|
| - return
|
| -
|
| - http_archive_other = HttpArchive.Load(archive)
|
| - print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
|
| - for r in http_archive_other:
|
| - # Only resources that are not already part of the current archive
|
| - # get added.
|
| - if r not in self:
|
| - print '\t %s ' % r
|
| - self[r] = http_archive_other[r]
|
| - self.Persist('%s' % merged_archive)
|
| -
|
| - def edit(self, command=None, host=None, full_path=None):
|
| - """Edits the single request which matches given params."""
|
| - editor = os.getenv('EDITOR')
|
| - if not editor:
|
| - print 'You must set the EDITOR environmental variable.'
|
| - return
|
| -
|
| - matching_requests = self.get_requests(command, host, full_path)
|
| - if not matching_requests:
|
| - print ('Failed to find any requests matching given command, host, '
|
| - 'full_path.')
|
| - return
|
| -
|
| - if len(matching_requests) > 1:
|
| - print 'Found multiple matching requests. Please refine.'
|
| - print self.ls(command, host, full_path)
|
| -
|
| - response = self[matching_requests[0]]
|
| - tmp_file = tempfile.NamedTemporaryFile(delete=False)
|
| - tmp_file.write(response.get_response_as_text())
|
| - tmp_file.close()
|
| - subprocess.check_call([editor, tmp_file.name])
|
| - response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
|
| - os.remove(tmp_file.name)
|
| -
|
| - def find_closest_request(self, request, use_path=False):
|
| - """Find the closest matching request in the archive to the given request.
|
| -
|
| - Args:
|
| - request: an ArchivedHttpRequest
|
| - use_path: If True, closest matching request's path component must match.
|
| - (Note: this refers to the 'path' component within the URL, not the
|
| - 'full path' which includes the query string component.)
|
| -
|
| - If use_path=True, candidate will NOT match in example below
|
| - e.g. request = GET www.test.com/a?p=1
|
| - candidate = GET www.test.com/b?p=1
|
| -
|
| - Even if use_path=False, urls with same paths are always favored.
|
| - For example, candidate1 is considered a better match than candidate2.
|
| - request = GET www.test.com/a?p=1&q=2&r=3
|
| - candidate1 = GET www.test.com/a?s=4
|
| - candidate2 = GET www.test.com/b?p=1&q=2&r=3
|
| -
|
| - Returns:
|
| - If a close match is found, return the instance of ArchivedHttpRequest.
|
| - Otherwise, return None.
|
| - """
|
| - # Start with strictest constraints. This trims search space considerably.
|
| - requests = self.get_requests(request.command, request.host,
|
| - request.full_path, is_ssl=request.is_ssl,
|
| - use_query=True)
|
| - # Relax constraint: use_query if there is no match.
|
| - if not requests:
|
| - requests = self.get_requests(request.command, request.host,
|
| - request.full_path, is_ssl=request.is_ssl,
|
| - use_query=False)
|
| - # Relax constraint: full_path if there is no match and use_path=False.
|
| - if not requests and not use_path:
|
| - requests = self.get_requests(request.command, request.host,
|
| - None, is_ssl=request.is_ssl,
|
| - use_query=False)
|
| -
|
| - if not requests:
|
| - return None
|
| -
|
| - if len(requests) == 1:
|
| - return requests[0]
|
| -
|
| - matcher = difflib.SequenceMatcher(b=request.cmp_seq)
|
| -
|
| - # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
|
| - # quick_ratio() on all requests, sort them descending, and then loop through
|
| - # until we find a candidate whose ratio() is >= the next quick_ratio().
|
| - # This works because quick_ratio() is guaranteed to be an upper bound on
|
| - # ratio().
|
| - candidates = []
|
| - for candidate in requests:
|
| - matcher.set_seq1(candidate.cmp_seq)
|
| - candidates.append((matcher.quick_ratio(), candidate))
|
| -
|
| - candidates.sort(reverse=True, key=lambda c: c[0])
|
| -
|
| - best_match = (0, None)
|
| - for i in xrange(len(candidates)):
|
| - matcher.set_seq1(candidates[i][1].cmp_seq)
|
| - best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
|
| - if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
|
| - break
|
| - return best_match[1]
|
| -
|
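
The comment inside find_closest_request() above relies on a standard difflib pruning trick: SequenceMatcher.quick_ratio() is an upper bound on ratio(), so candidates can be sorted by the cheap score and the expensive score computed only until it can no longer be beaten. A self-contained sketch of the same idea using plain strings rather than ArchivedHttpRequest.cmp_seq (the function name and sample values are illustrative only):

    import difflib

    def closest(target, candidates):
        """Return the candidate most similar to target, computing ratio() lazily."""
        matcher = difflib.SequenceMatcher(b=target)
        # quick_ratio() upper-bounds ratio(), so sort by it, descending...
        scored = []
        for cand in candidates:
            matcher.set_seq1(cand)
            scored.append((matcher.quick_ratio(), cand))
        scored.sort(reverse=True, key=lambda pair: pair[0])
        best_score, best_cand = -1.0, None
        for i, (quick, cand) in enumerate(scored):
            matcher.set_seq1(cand)
            score = matcher.ratio()
            if score > best_score:
                best_score, best_cand = score, cand
            # ...and stop once the best exact ratio beats the next upper bound.
            if i + 1 < len(scored) and best_score >= scored[i + 1][0]:
                break
        return best_cand

    print(closest('GET /search?q=dogs', ['GET /search?q=cats', 'POST /upload']))
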
| - def diff(self, request):
|
| - """Diff the given request to the closest matching request in the archive.
|
| -
|
| - Args:
|
| - request: an ArchivedHttpRequest
|
| - Returns:
|
| - If a close match is found, return a textual diff between the requests.
|
| - Otherwise, return None.
|
| - """
|
| - request_lines = request.formatted_request.split('\n')
|
| - closest_request = self.find_closest_request(request)
|
| - if closest_request:
|
| - closest_request_lines = closest_request.formatted_request.split('\n')
|
| - return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
|
| - return None
|
| -
|
| - def get_server_cert(self, host):
|
| - """Gets certificate from the server and stores it in archive"""
|
| - request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
|
| - if request not in self:
|
| - self[request] = create_response(200, body=certutils.get_host_cert(host))
|
| - return self[request].response_data[0]
|
| -
|
| - def get_certificate(self, host):
|
| - request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
|
| - if request not in self:
|
| - self[request] = create_response(200, body=self._generate_cert(host))
|
| - return self[request].response_data[0]
|
| -
|
| - @classmethod
|
| - def AssertWritable(cls, filename):
|
| - """Raises an IOError if filename is not writable."""
|
| - persist_dir = os.path.dirname(os.path.abspath(filename))
|
| - if not os.path.exists(persist_dir):
|
| - raise IOError('Directory does not exist: %s' % persist_dir)
|
| - if os.path.exists(filename):
|
| - if not os.access(filename, os.W_OK):
|
| - raise IOError('Need write permission on file: %s' % filename)
|
| - elif not os.access(persist_dir, os.W_OK):
|
| - raise IOError('Need write permission on directory: %s' % persist_dir)
|
| -
|
| - @classmethod
|
| - def Load(cls, filename):
|
| - """Load an instance from filename."""
|
| - return cPickle.load(open(filename, 'rb'))
|
| -
|
| - def Persist(self, filename):
|
| - """Persist all state to filename."""
|
| - try:
|
| - original_checkinterval = sys.getcheckinterval()
|
| - sys.setcheckinterval(2**31-1) # Lock out other threads so nothing can
|
| - # modify |self| during pickling.
|
| - pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
|
| - finally:
|
| - sys.setcheckinterval(original_checkinterval)
|
| - with open(filename, 'wb') as f:
|
| - f.write(pickled_self)
|
| -
|
| -
|
| -class ArchivedHttpRequest(object):
|
| - """Record all the state that goes into a request.
|
| -
|
| - ArchivedHttpRequest instances are considered immutable so they can
|
| - serve as keys for HttpArchive instances.
|
| - (The immutability is not enforced.)
|
| -
|
| - Upon creation, the headers are "trimmed" (i.e. edited or dropped)
|
| - and saved to self.trimmed_headers to allow requests to match in a wider
|
| - variety of playback situations (e.g. using different user agents).
|
| -
|
| - For unpickling, 'trimmed_headers' is recreated from 'headers'. That
|
| - allows for changes to the trim function and can help with debugging.
|
| - """
|
| - CONDITIONAL_HEADERS = [
|
| - 'if-none-match', 'if-match',
|
| - 'if-modified-since', 'if-unmodified-since']
|
| -
|
| - def __init__(self, command, host, full_path, request_body, headers,
|
| - is_ssl=False):
|
| - """Initialize an ArchivedHttpRequest.
|
| -
|
| - Args:
|
| - command: a string (e.g. 'GET' or 'POST').
|
| - host: a host name (e.g. 'www.google.com').
|
| - full_path: a request path. Includes everything after the host & port in
|
| - the URL (e.g. '/search?q=dogs').
|
| - request_body: a request body string for a POST or None.
|
| - headers: {key: value, ...} where key and value are strings.
|
| - is_ssl: a boolean which is True iff the request is made via SSL.
|
| - """
|
| - self.command = command
|
| - self.host = host
|
| - self.full_path = full_path
|
| - parsed_url = urlparse.urlparse(full_path) if full_path else None
|
| - self.path = parsed_url.path if parsed_url else None
|
| - self.request_body = request_body
|
| - self.headers = headers
|
| - self.is_ssl = is_ssl
|
| - self.trimmed_headers = self._TrimHeaders(headers)
|
| - self.formatted_request = self._GetFormattedRequest()
|
| - self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)
|
| -
|
| - def __str__(self):
|
| - scheme = 'https' if self.is_ssl else 'http'
|
| - return '%s %s://%s%s %s' % (
|
| - self.command, scheme, self.host, self.full_path, self.trimmed_headers)
|
| -
|
| - def __repr__(self):
|
| - return repr((self.command, self.host, self.full_path, self.request_body,
|
| - self.trimmed_headers, self.is_ssl))
|
| -
|
| - def __hash__(self):
|
| - """Return a integer hash to use for hashed collections including dict."""
|
| - return hash(repr(self))
|
| -
|
| - def __eq__(self, other):
|
| - """Define the __eq__ method to match the hash behavior."""
|
| - return repr(self) == repr(other)
|
| -
|
| - def __setstate__(self, state):
|
| - """Influence how to unpickle.
|
| -
|
| - "headers" are the original request headers.
|
| - "trimmed_headers" are the trimmed headers used for matching requests
|
| - during replay.
|
| -
|
| - Args:
|
| - state: a dictionary for __dict__
|
| - """
|
| - if 'full_headers' in state:
|
| - # Fix older version of archive.
|
| - state['headers'] = state['full_headers']
|
| - del state['full_headers']
|
| - if 'headers' not in state:
|
| - raise HttpArchiveException(
|
| - 'Archived HTTP request is missing "headers". The HTTP archive is'
|
| - ' likely from a previous version and must be re-recorded.')
|
| - if 'path' in state:
|
| - # before, 'path' and 'path_without_query' were used and 'path' was
|
| - # pickled. Now, 'path' has been renamed to 'full_path' and
|
| - # 'path_without_query' has been renamed to 'path'. 'full_path' is
|
| - # pickled, but 'path' is not. If we see 'path' here it means we are
|
| - # dealing with an older archive.
|
| - state['full_path'] = state['path']
|
| - del state['path']
|
| - state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
|
| - if 'is_ssl' not in state:
|
| - state['is_ssl'] = False
|
| - self.__dict__.update(state)
|
| - parsed_url = urlparse.urlparse(self.full_path)
|
| - self.path = parsed_url.path
|
| - self.formatted_request = self._GetFormattedRequest()
|
| - self.cmp_seq = self._GetCmpSeq(parsed_url.query)
|
| -
|
| - def __getstate__(self):
|
| - """Influence how to pickle.
|
| -
|
| - Returns:
|
| - a dict to use for pickling
|
| - """
|
| - state = self.__dict__.copy()
|
| - del state['trimmed_headers']
|
| - del state['path']
|
| - del state['formatted_request']
|
| - del state['cmp_seq']
|
| - return state
|
| -
|
| - def _GetFormattedRequest(self):
|
| - """Format request to make diffs easier to read.
|
| -
|
| - Returns:
|
| - A string consisting of the request. Example:
|
| - 'GET www.example.com/path\nHeader-Key: header value\n'
|
| - """
|
| - parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
|
| - if self.request_body:
|
| - parts.append('%s\n' % self.request_body)
|
| - for k, v in self.trimmed_headers:
|
| - k = '-'.join(x.capitalize() for x in k.split('-'))
|
| - parts.append('%s: %s\n' % (k, v))
|
| - return ''.join(parts)
|
| -
|
| - def _GetCmpSeq(self, query=None):
|
| - """Compute a sequence out of query and header for difflib to compare.
|
| - For example:
|
| - [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
|
| - will be returned for a request with URL:
|
| - http://example.com/index.html?q1=a1&q2=a2
|
| - and header:
|
| - k1: v1
|
| - k2: v2
|
| -
|
| - Args:
|
| - query: the query string in the URL.
|
| -
|
| - Returns:
|
| - A sequence for difflib to compare.
|
| - """
|
| - if not query:
|
| - return self.trimmed_headers
|
| - return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers
|
| -
|
| - def matches(self, command=None, host=None, full_path=None, is_ssl=None,
|
| - use_query=True):
|
| - """Returns true iff the request matches all parameters.
|
| -
|
| - Args:
|
| - command: a string (e.g. 'GET' or 'POST').
|
| - host: a host name (e.g. 'www.google.com').
|
| - full_path: a request path with query string (e.g. '/search?q=dogs')
|
| - is_ssl: whether the request is secure.
|
| - use_query:
|
| - If use_query is True, request matching uses both the hierarchical path
|
| - and query string component.
|
| - If use_query is False, request matching only uses the hierarchical path
|
| -
|
| - e.g. req1 = GET www.test.com/index?aaaa
|
| - req2 = GET www.test.com/index?bbbb
|
| -
|
| - If use_query is True, req1.matches(req2) evaluates to False
|
| - If use_query is False, req1.matches(req2) evaluates to True
|
| -
|
| - Returns:
|
| - True iff the request matches all parameters
|
| - """
|
| - if command is not None and command != self.command:
|
| - return False
|
| - if is_ssl is not None and is_ssl != self.is_ssl:
|
| - return False
|
| - if host is not None and host != self.host:
|
| - return False
|
| - if full_path is None:
|
| - return True
|
| - if use_query:
|
| - return full_path == self.full_path
|
| - else:
|
| - return self.path == urlparse.urlparse(full_path).path
|
| -
|
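
A small sketch of the use_query distinction described in the docstring above, using urllib.parse from the Python 3 standard library (the file itself uses the Python 2 urlparse module); paths_match is a hypothetical helper, not part of the archive:

    from urllib.parse import urlparse

    def paths_match(full_path_a, full_path_b, use_query=True):
        """Compare two request paths, optionally ignoring the query string."""
        if use_query:
            return full_path_a == full_path_b
        return urlparse(full_path_a).path == urlparse(full_path_b).path

    assert not paths_match('/index?aaaa', '/index?bbbb', use_query=True)
    assert paths_match('/index?aaaa', '/index?bbbb', use_query=False)
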
| - @classmethod
|
| - def _TrimHeaders(cls, headers):
|
| - """Removes headers that are known to cause problems during replay.
|
| -
|
| - These headers are removed for the following reasons:
|
| - - accept: Causes problems with www.bing.com. During record, CSS is fetched
|
| - with *. During replay, it's text/css.
|
| - - accept-charset, accept-language, referer: vary between clients.
|
| - - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
|
| - - connection, method, scheme, url, version: Cause problems with spdy.
|
| - - cookie: Extremely sensitive to request/response order.
|
| - - keep-alive: Doesn't affect the content of the request, only some
|
| - transient state of the transport layer.
|
| - - user-agent: Changes with every Chrome version.
|
| - - proxy-connection: Sent for proxy requests.
|
| - - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
|
| - Google to collect statistics about Chrome's enabled features.
|
| -
|
| - Another variant to consider is dropping only the value from the header.
|
| - However, this is particularly bad for the cookie header, because the
|
| - presence of the cookie depends on the responses we've seen when the request
|
| - is made.
|
| -
|
| - Args:
|
| - headers: {header_key: header_value, ...}
|
| -
|
| - Returns:
|
| - [(header_key, header_value), ...] # (with undesirable headers removed)
|
| - """
|
| - # TODO(tonyg): Strip sdch from the request headers because we can't
|
| - # guarantee that the dictionary will be recorded, so replay may not work.
|
| - if 'accept-encoding' in headers:
|
| - accept_encoding = headers['accept-encoding']
|
| - accept_encoding = accept_encoding.replace('sdch', '')
|
| - # Strip lzma so Opera's requests match archives recorded using Chrome.
|
| - accept_encoding = accept_encoding.replace('lzma', '')
|
| - stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
|
| - accept_encoding = ','.join(filter(bool, stripped_encodings))
|
| - headers['accept-encoding'] = accept_encoding
|
| - undesirable_keys = [
|
| - 'accept', 'accept-charset', 'accept-language', 'cache-control',
|
| - 'connection', 'cookie', 'keep-alive', 'method',
|
| - 'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
|
| - 'x-chrome-variations', 'x-client-data']
|
| - return sorted([(k, v) for k, v in headers.items()
|
| - if k.lower() not in undesirable_keys])
|
| -
|
| - def is_conditional(self):
|
| - """Return list of headers that match conditional headers."""
|
| - for header in self.CONDITIONAL_HEADERS:
|
| - if header in self.headers:
|
| - return True
|
| - return False
|
| -
|
| - def create_request_without_conditions(self):
|
| - stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
|
| - if k.lower() not in self.CONDITIONAL_HEADERS)
|
| - return ArchivedHttpRequest(
|
| - self.command, self.host, self.full_path, self.request_body,
|
| - stripped_headers, self.is_ssl)
|
| -
|
| -class ArchivedHttpResponse(object):
|
| - """All the data needed to recreate all HTTP response."""
|
| -
|
| - # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
|
| - # It is not sent in responses. It is added by get_data_as_text()
|
| - # and removed by set_data().
|
| - CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'
|
| -
|
| - # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
|
| - DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
|
| - 'Delays are above. Response content is below.]\n')
|
| -
|
| - def __init__(self, version, status, reason, headers, response_data,
|
| - delays=None):
|
| - """Initialize an ArchivedHttpResponse.
|
| -
|
| - Args:
|
| - version: HTTP protocol version used by server.
|
| - 10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
|
| - status: Status code returned by server (e.g. 200).
|
| - reason: Reason phrase returned by server (e.g. "OK").
|
| - headers: list of (header, value) tuples.
|
| - response_data: list of content chunks.
|
| - Concatenating the chunks gives the complete contents
|
| - (i.e. the chunks do not have any lengths or delimiters).
|
| - Do not include the final, zero-length chunk that marks the end.
|
| - delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
|
| - e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
|
| - connect - The time to connect to the server.
|
| - Each resource has a value because Replay's record mode captures it.
|
| - This includes the time for the SYN and SYN/ACK (1 rtt).
|
| - headers - The time elapsed between the TCP connect and the headers.
|
| - This typically includes all the server-time to generate a response.
|
| - data - If the response is chunked, these are the times for each chunk.
|
| - """
|
| - self.version = version
|
| - self.status = status
|
| - self.reason = reason
|
| - self.headers = headers
|
| - self.response_data = response_data
|
| - self.delays = delays
|
| - self.fix_delays()
|
| -
|
| - def fix_delays(self):
|
| - """Initialize delays, or check the number of data delays."""
|
| - expected_num_delays = len(self.response_data)
|
| - if not self.delays:
|
| - self.delays = {
|
| - 'connect': 0,
|
| - 'headers': 0,
|
| - 'data': [0] * expected_num_delays
|
| - }
|
| - else:
|
| - num_delays = len(self.delays['data'])
|
| - if num_delays != expected_num_delays:
|
| - raise HttpArchiveException(
|
| - 'Server delay length mismatch: %d (expected %d): %s',
|
| - num_delays, expected_num_delays, self.delays['data'])
|
| -
|
| - def __repr__(self):
|
| - return repr((self.version, self.status, self.reason, sorted(self.headers),
|
| - self.response_data))
|
| -
|
| - def __hash__(self):
|
| - """Return a integer hash to use for hashed collections including dict."""
|
| - return hash(repr(self))
|
| -
|
| - def __eq__(self, other):
|
| - """Define the __eq__ method to match the hash behavior."""
|
| - return repr(self) == repr(other)
|
| -
|
| - def __setstate__(self, state):
|
| - """Influence how to unpickle.
|
| -
|
| - Args:
|
| - state: a dictionary for __dict__
|
| - """
|
| - if 'server_delays' in state:
|
| - state['delays'] = {
|
| - 'connect': 0,
|
| - 'headers': 0,
|
| - 'data': state['server_delays']
|
| - }
|
| - del state['server_delays']
|
| - elif 'delays' not in state:
|
| - state['delays'] = None
|
| - self.__dict__.update(state)
|
| - self.fix_delays()
|
| -
|
| - def get_header(self, key, default=None):
|
| - for k, v in self.headers:
|
| - if key.lower() == k.lower():
|
| - return v
|
| - return default
|
| -
|
| - def set_header(self, key, value):
|
| - for i, (k, v) in enumerate(self.headers):
|
| - if key == k:
|
| - self.headers[i] = (key, value)
|
| - return
|
| - self.headers.append((key, value))
|
| -
|
| - def remove_header(self, key):
|
| - for i, (k, v) in enumerate(self.headers):
|
| - if key.lower() == k.lower():
|
| - self.headers.pop(i)
|
| - return
|
| -
|
| - @staticmethod
|
| - def _get_epoch_seconds(date_str):
|
| - """Return the epoch seconds of a date header.
|
| -
|
| - Args:
|
| - date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
|
| - Returns:
|
| - epoch seconds as a float
|
| - """
|
| - date_tuple = email.utils.parsedate(date_str)
|
| - if date_tuple:
|
| - return calendar.timegm(date_tuple)
|
| - return None
|
| -
|
| - def update_date(self, date_str, now=None):
|
| - """Return an updated date based on its delta from the "Date" header.
|
| -
|
| - For example, if |date_str| is one week later than the "Date" header,
|
| - then the returned date string is one week later than the current date.
|
| -
|
| - Args:
|
| - date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
|
| - now: optional; epoch seconds to use as the current time (defaults to time.time()).
|
| - Returns:
|
| - a date string
|
| - """
|
| - date_seconds = self._get_epoch_seconds(self.get_header('date'))
|
| - header_seconds = self._get_epoch_seconds(date_str)
|
| - if date_seconds and header_seconds:
|
| - updated_seconds = header_seconds + (now or time.time()) - date_seconds
|
| - return email.utils.formatdate(updated_seconds, usegmt=True)
|
| - return date_str
|
| -
|
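
The date arithmetic in update_date() above shifts an archived header so that it keeps the same offset from "now" that it originally had from the response's Date header. A standalone sketch with the standard library (shift_date is an illustrative name, not the archive's API):

    import calendar
    import email.utils
    import time

    def shift_date(header_date, response_date, now=None):
        """Re-date header_date, preserving its offset from the archived Date header."""
        header_secs = calendar.timegm(email.utils.parsedate(header_date))
        response_secs = calendar.timegm(email.utils.parsedate(response_date))
        shifted = header_secs + (now or time.time()) - response_secs
        return email.utils.formatdate(shifted, usegmt=True)

    # An Expires header recorded one hour after the Date header is replayed
    # as one hour after the current time.
    print(shift_date('Thu, 01 Dec 1994 17:00:00 GMT',
                     'Thu, 01 Dec 1994 16:00:00 GMT'))
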
| - def is_gzip(self):
|
| - return self.get_header('content-encoding') == 'gzip'
|
| -
|
| - def is_compressed(self):
|
| - return self.get_header('content-encoding') in ('gzip', 'deflate')
|
| -
|
| - def is_chunked(self):
|
| - return self.get_header('transfer-encoding') == 'chunked'
|
| -
|
| - def get_data_as_text(self):
|
| - """Return content as a single string.
|
| -
|
| - Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
|
| - """
|
| - content_type = self.get_header('content-type')
|
| - if (not content_type or
|
| - not (content_type.startswith('text/') or
|
| - content_type == 'application/x-javascript' or
|
| - content_type.startswith('application/json'))):
|
| - return None
|
| - if self.is_compressed():
|
| - uncompressed_chunks = httpzlib.uncompress_chunks(
|
| - self.response_data, self.is_gzip())
|
| - else:
|
| - uncompressed_chunks = self.response_data
|
| - return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)
|
| -
|
| - def get_delays_as_text(self):
|
| - """Return delays as editable text."""
|
| - return json.dumps(self.delays, indent=2)
|
| -
|
| - def get_response_as_text(self):
|
| - """Returns response content as a single string.
|
| -
|
| - Server delays are separated on a per-chunk basis. Delays are in milliseconds.
|
| - Response content begins after DELAY_EDIT_SEPARATOR
|
| - """
|
| - data = self.get_data_as_text()
|
| - if data is None:
|
| - logging.warning('Data can not be represented as text.')
|
| - data = ''
|
| - delays = self.get_delays_as_text()
|
| - return self.DELAY_EDIT_SEPARATOR.join((delays, data))
|
| -
|
| - def set_data(self, text):
|
| - """Inverse of get_data_as_text().
|
| -
|
| - Split on CHUNK_EDIT_SEPARATOR and compress if needed.
|
| - """
|
| - text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
|
| - if self.is_compressed():
|
| - self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
|
| - else:
|
| - self.response_data = text_chunks
|
| - if not self.is_chunked():
|
| - content_length = sum(len(c) for c in self.response_data)
|
| - self.set_header('content-length', str(content_length))
|
| -
|
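
get_data_as_text() and set_data() above round-trip chunked, possibly compressed bodies through a single editable string. A rough standalone equivalent using the standard gzip module (the archive itself uses its own httpzlib helper, which also supports deflate; the function names here are illustrative):

    import gzip

    SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

    def chunks_to_text(chunks, compressed):
        """Join response chunks into one editable string."""
        if compressed:
            chunks = [gzip.decompress(c) for c in chunks]
        return SEPARATOR.join(c.decode('utf-8') for c in chunks)

    def text_to_chunks(text, compressed):
        """Inverse: split edited text back into (optionally gzipped) chunks."""
        chunks = [c.encode('utf-8') for c in text.split(SEPARATOR)]
        if compressed:
            chunks = [gzip.compress(c) for c in chunks]
        return chunks

    original = [gzip.compress(b'<html>'), gzip.compress(b'</html>')]
    text = chunks_to_text(original, compressed=True)
    assert len(text_to_chunks(text, compressed=True)) == len(original)
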
| - def set_delays(self, delays_text):
|
| - """Inverse of get_delays_as_text().
|
| -
|
| - Args:
|
| - delays_text: JSON encoded text such as the following:
|
| - {
|
| - connect: 80,
|
| - headers: 80,
|
| - data: [6, 55, 0]
|
| - }
|
| - Times are in milliseconds.
|
| - Each data delay corresponds with one response_data value.
|
| - """
|
| - try:
|
| - self.delays = json.loads(delays_text)
|
| - except (ValueError, KeyError) as e:
|
| - logging.critical('Unable to parse delays %s: %s', delays_text, e)
|
| - self.fix_delays()
|
| -
|
| - def set_response_from_text(self, text):
|
| - """Inverse of get_response_as_text().
|
| -
|
| - Modifies the state of the archive according to the textual representation.
|
| - """
|
| - try:
|
| - delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
|
| - except ValueError:
|
| - logging.critical(
|
| - 'Error parsing text representation. Skipping edits.')
|
| - return
|
| - self.set_delays(delays)
|
| - self.set_data(data)
|
| -
|
| -
|
| -def create_response(status, reason=None, headers=None, body=None):
|
| - """Convenience method for creating simple ArchivedHttpResponse objects."""
|
| - if reason is None:
|
| - reason = httplib.responses.get(status, 'Unknown')
|
| - if headers is None:
|
| - headers = [('content-type', 'text/plain')]
|
| - if body is None:
|
| - body = "%s %s" % (status, reason)
|
| - return ArchivedHttpResponse(11, status, reason, headers, [body])
|
| -
|
| -
|
| -def main():
|
| - class PlainHelpFormatter(optparse.IndentedHelpFormatter):
|
| - def format_description(self, description):
|
| - if description:
|
| - return description + '\n'
|
| - else:
|
| - return ''
|
| -
|
| - option_parser = optparse.OptionParser(
|
| - usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
|
| - formatter=PlainHelpFormatter(),
|
| - description=__doc__,
|
| - epilog='http://code.google.com/p/web-page-replay/')
|
| -
|
| - option_parser.add_option('-c', '--command', default=None,
|
| - action='store',
|
| - type='string',
|
| - help='Only show URLs matching this command.')
|
| - option_parser.add_option('-o', '--host', default=None,
|
| - action='store',
|
| - type='string',
|
| - help='Only show URLs matching this host.')
|
| - option_parser.add_option('-p', '--full_path', default=None,
|
| - action='store',
|
| - type='string',
|
| - help='Only show URLs matching this full path.')
|
| - option_parser.add_option('-f', '--merged_file', default=None,
|
| - action='store',
|
| - type='string',
|
| - help='The output file to use when using the merge command.')
|
| -
|
| - options, args = option_parser.parse_args()
|
| -
|
| - # Merge command expects an unlimited number of archives.
|
| - if len(args) < 2:
|
| - print 'args: %s' % args
|
| - option_parser.error('Must specify a command and replay_file')
|
| -
|
| - command = args[0]
|
| - replay_file = args[1]
|
| -
|
| - if not os.path.exists(replay_file):
|
| - option_parser.error('Replay file "%s" does not exist' % replay_file)
|
| -
|
| - http_archive = HttpArchive.Load(replay_file)
|
| - if command == 'ls':
|
| - print http_archive.ls(options.command, options.host, options.full_path)
|
| - elif command == 'cat':
|
| - print http_archive.cat(options.command, options.host, options.full_path)
|
| - elif command == 'stats':
|
| - print http_archive.stats(options.command, options.host, options.full_path)
|
| - elif command == 'merge':
|
| - if not options.merged_file:
|
| - print 'Error: Must specify a merged file name (use --merged_file)'
|
| - return
|
| - http_archive.merge(options.merged_file, args[2:])
|
| - elif command == 'edit':
|
| - http_archive.edit(options.command, options.host, options.full_path)
|
| - http_archive.Persist(replay_file)
|
| - else:
|
| - option_parser.error('Unknown command "%s"' % command)
|
| - return 0
|
| -
|
| -
|
| -if __name__ == '__main__':
|
| - sys.exit(main())
|
|
|