Chromium Code Reviews

Unified Diff: tools/telemetry/third_party/webpagereplay/httparchive.py

Issue 1647513002: Delete tools/telemetry. (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 11 months ago
Index: tools/telemetry/third_party/webpagereplay/httparchive.py
diff --git a/tools/telemetry/third_party/webpagereplay/httparchive.py b/tools/telemetry/third_party/webpagereplay/httparchive.py
deleted file mode 100755
index 388fc663cc3d8d3e31340b84b921ad5188145e90..0000000000000000000000000000000000000000
--- a/tools/telemetry/third_party/webpagereplay/httparchive.py
+++ /dev/null
@@ -1,1022 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2010 Google Inc. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-"""View and edit HTTP Archives.
-
-To list all URLs in an archive:
- $ ./httparchive.py ls archive.wpr
-
-To view the content of all URLs from example.com:
- $ ./httparchive.py cat --host example.com archive.wpr
-
-To view the content of a particular URL:
- $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr
-
-To view the content of all URLs:
- $ ./httparchive.py cat archive.wpr
-
-To edit a particular URL:
- $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr
-
-To print statistics of an archive:
- $ ./httparchive.py stats archive.wpr
-
-To print statistics of a set of URLs:
- $ ./httparchive.py stats --host www.example.com archive.wpr
-
-To merge multiple archives:
- $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
-"""
-
-import calendar
-import certutils
-import cPickle
-import difflib
-import email.utils
-import httplib
-import httpzlib
-import json
-import logging
-import optparse
-import os
-import StringIO
-import subprocess
-import sys
-import tempfile
-import time
-import urlparse
-from collections import defaultdict
-
-
-
-def LogRunTime(fn):
-  """Decorator that logs the run time of the function."""
- def wrapped(self, *args, **kwargs):
- start_time = time.time()
- try:
- return fn(self, *args, **kwargs)
- finally:
- run_time = (time.time() - start_time) * 1000.0
- logging.debug('%s: %dms', fn.__name__, run_time)
- return wrapped
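-# Example use of LogRunTime (illustrative; 'Cache' and 'lookup' are made-up
-# names, not part of this file):
-#
-#   class Cache(object):
-#     @LogRunTime
-#     def lookup(self, key):
-#       ...
-#
-# Each call to lookup() then emits a DEBUG line such as 'lookup: 3ms'.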
-
-
-class HttpArchiveException(Exception):
- """Base class for all exceptions in httparchive."""
- pass
-
-
-class HttpArchive(dict):
- """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.
-
- Attributes:
-    responses_by_host: dict of {hostname: {request: response}}. This must remain
- in sync with the underlying dict of self. It is used as an optimization
- so that get_requests() doesn't have to linearly search all requests in
- the archive to find potential matches.
- """
-
- def __init__(self): # pylint: disable=super-init-not-called
- self.responses_by_host = defaultdict(dict)
-
- def __setstate__(self, state):
- """Influence how to unpickle.
-
- Args:
- state: a dictionary for __dict__
- """
- self.__dict__.update(state)
- self.responses_by_host = defaultdict(dict)
- for request in self:
- self.responses_by_host[request.host][request] = self[request]
-
- def __getstate__(self):
- """Influence how to pickle.
-
- Returns:
- a dict to use for pickling
- """
- state = self.__dict__.copy()
- del state['responses_by_host']
- return state
-
- def __setitem__(self, key, value):
- super(HttpArchive, self).__setitem__(key, value)
- if hasattr(self, 'responses_by_host'):
- self.responses_by_host[key.host][key] = value
-
- def __delitem__(self, key):
- super(HttpArchive, self).__delitem__(key)
- del self.responses_by_host[key.host][key]
-
- def get(self, request, default=None):
- """Return the archived response for a given request.
-
-    Does extra checking to handle conditional HTTP request headers.
-
- Args:
- request: instance of ArchivedHttpRequest
- default: default value to return if request is not found
-
- Returns:
- Instance of ArchivedHttpResponse or default if no matching
- response is found
- """
- if request in self:
- return self[request]
- return self.get_conditional_response(request, default)
-
- def get_conditional_response(self, request, default):
- """Get the response based on the conditional HTTP request headers.
-
- Args:
- request: an ArchivedHttpRequest representing the original request.
-      default: the default ArchivedHttpResponse to return if no response
-          matches the request with its conditional headers removed.
-
- Returns:
-      an ArchivedHttpResponse with a status of 200, 304 (not modified), or
- 412 (precondition failed)
- """
- response = default
- if request.is_conditional():
- stripped_request = request.create_request_without_conditions()
- if stripped_request in self:
- response = self[stripped_request]
- if response.status == 200:
- status = self.get_conditional_status(request, response)
- if status != 200:
- response = create_response(status)
- return response
-
- def get_conditional_status(self, request, response):
- status = 200
- last_modified = email.utils.parsedate(
- response.update_date(response.get_header('last-modified')))
- response_etag = response.get_header('etag')
- is_get_or_head = request.command.upper() in ('GET', 'HEAD')
-
- match_value = request.headers.get('if-match', None)
- if match_value:
- if self.is_etag_match(match_value, response_etag):
- status = 200
- else:
- status = 412 # precondition failed
- none_match_value = request.headers.get('if-none-match', None)
- if none_match_value:
- if self.is_etag_match(none_match_value, response_etag):
- status = 304
- elif is_get_or_head:
- status = 200
- else:
- status = 412
- if is_get_or_head and last_modified:
- for header in ('if-modified-since', 'if-unmodified-since'):
- date = email.utils.parsedate(request.headers.get(header, None))
- if date:
- if ((header == 'if-modified-since' and last_modified > date) or
- (header == 'if-unmodified-since' and last_modified < date)):
- if status != 412:
- status = 200
- else:
- status = 304 # not modified
- return status
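-  # For example (illustrative, not from the original code): a GET whose
-  # 'if-modified-since' date is on or after the response's updated
-  # 'last-modified' date yields 304, while an 'if-match' value that matches
-  # neither '*' nor the response ETag yields 412.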
-
- @staticmethod
- def is_etag_match(request_etag, response_etag):
-    """Determines whether the entity tags of the request and response match.
-
- Args:
- request_etag: the value string of the "if-(none)-match:"
- portion of the request header
- response_etag: the etag value of the response
-
- Returns:
- True on match, False otherwise
- """
- response_etag = response_etag.strip('" ')
- for etag in request_etag.split(','):
- etag = etag.strip('" ')
- if etag in ('*', response_etag):
- return True
- return False
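-  # For example (illustrative): a request value of '"abc", "xyz"' matches a
-  # response ETag of '"abc"', and a request value of '*' matches any ETag.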
-
- def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
- use_query=True):
- """Return a list of requests that match the given args."""
- if host:
- return [r for r in self.responses_by_host[host]
- if r.matches(command, None, full_path, is_ssl,
- use_query=use_query)]
- else:
- return [r for r in self
- if r.matches(command, host, full_path, is_ssl,
- use_query=use_query)]
-
- def ls(self, command=None, host=None, full_path=None):
- """List all URLs that match given params."""
- return ''.join(sorted(
- '%s\n' % r for r in self.get_requests(command, host, full_path)))
-
- def cat(self, command=None, host=None, full_path=None):
- """Print the contents of all URLs that match given params."""
- out = StringIO.StringIO()
- for request in self.get_requests(command, host, full_path):
- print >>out, str(request)
- print >>out, 'Untrimmed request headers:'
- for k in request.headers:
- print >>out, ' %s: %s' % (k, request.headers[k])
- if request.request_body:
- print >>out, request.request_body
- print >>out, '---- Response Info', '-' * 51
- response = self[request]
- chunk_lengths = [len(x) for x in response.response_data]
- print >>out, ('Status: %s\n'
- 'Reason: %s\n'
- 'Headers delay: %s\n'
- 'Response headers:') % (
- response.status, response.reason, response.delays['headers'])
- for k, v in response.headers:
- print >>out, ' %s: %s' % (k, v)
- print >>out, ('Chunk count: %s\n'
- 'Chunk lengths: %s\n'
- 'Chunk delays: %s') % (
- len(chunk_lengths), chunk_lengths, response.delays['data'])
- body = response.get_data_as_text()
- print >>out, '---- Response Data', '-' * 51
- if body:
- print >>out, body
- else:
- print >>out, '[binary data]'
- print >>out, '=' * 70
- return out.getvalue()
-
- def stats(self, command=None, host=None, full_path=None):
- """Print stats about the archive for all URLs that match given params."""
- matching_requests = self.get_requests(command, host, full_path)
- if not matching_requests:
- print 'Failed to find any requests matching given command, host, path.'
- return
-
- out = StringIO.StringIO()
- stats = {
- 'Total': len(matching_requests),
- 'Domains': defaultdict(int),
- 'HTTP_response_code': defaultdict(int),
- 'content_type': defaultdict(int),
- 'Documents': defaultdict(int),
- }
-
- for request in matching_requests:
- stats['Domains'][request.host] += 1
- stats['HTTP_response_code'][self[request].status] += 1
-
- content_type = self[request].get_header('content-type')
- # Remove content type options for readability and higher level groupings.
- str_content_type = str(content_type.split(';')[0]
- if content_type else None)
- stats['content_type'][str_content_type] += 1
-
- # Documents are the main URL requested and not a referenced resource.
- if str_content_type == 'text/html' and not 'referer' in request.headers:
- stats['Documents'][request.host] += 1
-
- print >>out, json.dumps(stats, indent=4)
- return out.getvalue()
-
- def merge(self, merged_archive=None, other_archives=None):
-    """Merge multiple archives into merged_archive by 'chaining' resources;
-    only resources that are not part of the accumulated archive are added."""
- if not other_archives:
- print 'No archives passed to merge'
- return
-
- # Note we already loaded 'replay_file'.
- print 'Loaded %d responses' % len(self)
-
- for archive in other_archives:
- if not os.path.exists(archive):
- print 'Error: Replay file "%s" does not exist' % archive
- return
-
- http_archive_other = HttpArchive.Load(archive)
- print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
- for r in http_archive_other:
- # Only resources that are not already part of the current archive
- # get added.
- if r not in self:
- print '\t %s ' % r
- self[r] = http_archive_other[r]
- self.Persist('%s' % merged_archive)
-
- def edit(self, command=None, host=None, full_path=None):
- """Edits the single request which matches given params."""
- editor = os.getenv('EDITOR')
- if not editor:
-      print 'You must set the EDITOR environment variable.'
- return
-
- matching_requests = self.get_requests(command, host, full_path)
- if not matching_requests:
- print ('Failed to find any requests matching given command, host, '
- 'full_path.')
- return
-
- if len(matching_requests) > 1:
- print 'Found multiple matching requests. Please refine.'
- print self.ls(command, host, full_path)
-
- response = self[matching_requests[0]]
- tmp_file = tempfile.NamedTemporaryFile(delete=False)
- tmp_file.write(response.get_response_as_text())
- tmp_file.close()
- subprocess.check_call([editor, tmp_file.name])
- response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
- os.remove(tmp_file.name)
-
- def find_closest_request(self, request, use_path=False):
- """Find the closest matching request in the archive to the given request.
-
- Args:
- request: an ArchivedHttpRequest
- use_path: If True, closest matching request's path component must match.
- (Note: this refers to the 'path' component within the URL, not the
- 'full path' which includes the query string component.)
-
-        If use_path=True, the candidate will NOT match in the example below:
- e.g. request = GET www.test.com/a?p=1
- candidate = GET www.test.com/b?p=1
-
- Even if use_path=False, urls with same paths are always favored.
- For example, candidate1 is considered a better match than candidate2.
- request = GET www.test.com/a?p=1&q=2&r=3
- candidate1 = GET www.test.com/a?s=4
- candidate2 = GET www.test.com/b?p=1&q=2&r=3
-
- Returns:
- If a close match is found, return the instance of ArchivedHttpRequest.
- Otherwise, return None.
- """
- # Start with strictest constraints. This trims search space considerably.
- requests = self.get_requests(request.command, request.host,
- request.full_path, is_ssl=request.is_ssl,
- use_query=True)
- # Relax constraint: use_query if there is no match.
- if not requests:
- requests = self.get_requests(request.command, request.host,
- request.full_path, is_ssl=request.is_ssl,
- use_query=False)
- # Relax constraint: full_path if there is no match and use_path=False.
- if not requests and not use_path:
- requests = self.get_requests(request.command, request.host,
- None, is_ssl=request.is_ssl,
- use_query=False)
-
- if not requests:
- return None
-
- if len(requests) == 1:
- return requests[0]
-
- matcher = difflib.SequenceMatcher(b=request.cmp_seq)
-
- # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
- # quick_ratio() on all requests, sort them descending, and then loop through
- # until we find a candidate whose ratio() is >= the next quick_ratio().
- # This works because quick_ratio() is guaranteed to be an upper bound on
- # ratio().
- candidates = []
- for candidate in requests:
- matcher.set_seq1(candidate.cmp_seq)
- candidates.append((matcher.quick_ratio(), candidate))
-
- candidates.sort(reverse=True, key=lambda c: c[0])
-
- best_match = (0, None)
- for i in xrange(len(candidates)):
- matcher.set_seq1(candidates[i][1].cmp_seq)
- best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
- if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
- break
- return best_match[1]
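-  # Illustration of the pruning above (not from the original code): if the
-  # sorted quick_ratio() scores are [0.9, 0.8, 0.5], ratio() is computed for
-  # the 0.9 candidate first; since ratio() never exceeds quick_ratio(), a
-  # ratio() of at least 0.8 means no later candidate can win, so the loop
-  # exits after a single full comparison.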
-
- def diff(self, request):
- """Diff the given request to the closest matching request in the archive.
-
- Args:
- request: an ArchivedHttpRequest
- Returns:
- If a close match is found, return a textual diff between the requests.
- Otherwise, return None.
- """
- request_lines = request.formatted_request.split('\n')
- closest_request = self.find_closest_request(request)
- if closest_request:
- closest_request_lines = closest_request.formatted_request.split('\n')
- return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
- return None
-
- def get_server_cert(self, host):
-    """Gets the certificate from the server and stores it in the archive."""
- request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
- if request not in self:
- self[request] = create_response(200, body=certutils.get_host_cert(host))
- return self[request].response_data[0]
-
- def get_certificate(self, host):
- request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
- if request not in self:
- self[request] = create_response(200, body=self._generate_cert(host))
- return self[request].response_data[0]
-
- @classmethod
- def AssertWritable(cls, filename):
- """Raises an IOError if filename is not writable."""
- persist_dir = os.path.dirname(os.path.abspath(filename))
- if not os.path.exists(persist_dir):
- raise IOError('Directory does not exist: %s' % persist_dir)
- if os.path.exists(filename):
- if not os.access(filename, os.W_OK):
- raise IOError('Need write permission on file: %s' % filename)
- elif not os.access(persist_dir, os.W_OK):
- raise IOError('Need write permission on directory: %s' % persist_dir)
-
- @classmethod
- def Load(cls, filename):
- """Load an instance from filename."""
- return cPickle.load(open(filename, 'rb'))
-
- def Persist(self, filename):
- """Persist all state to filename."""
- try:
- original_checkinterval = sys.getcheckinterval()
- sys.setcheckinterval(2**31-1) # Lock out other threads so nothing can
- # modify |self| during pickling.
- pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
- finally:
- sys.setcheckinterval(original_checkinterval)
- with open(filename, 'wb') as f:
- f.write(pickled_self)
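-  # Typical round trip (illustrative; 'archive.wpr' is only an example path):
-  #   archive = HttpArchive.Load('archive.wpr')
-  #   ...inspect or modify the archive...
-  #   archive.Persist('archive.wpr')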
-
-
-class ArchivedHttpRequest(object):
- """Record all the state that goes into a request.
-
- ArchivedHttpRequest instances are considered immutable so they can
- serve as keys for HttpArchive instances.
- (The immutability is not enforced.)
-
- Upon creation, the headers are "trimmed" (i.e. edited or dropped)
- and saved to self.trimmed_headers to allow requests to match in a wider
- variety of playback situations (e.g. using different user agents).
-
- For unpickling, 'trimmed_headers' is recreated from 'headers'. That
- allows for changes to the trim function and can help with debugging.
- """
- CONDITIONAL_HEADERS = [
- 'if-none-match', 'if-match',
- 'if-modified-since', 'if-unmodified-since']
-
- def __init__(self, command, host, full_path, request_body, headers,
- is_ssl=False):
- """Initialize an ArchivedHttpRequest.
-
- Args:
- command: a string (e.g. 'GET' or 'POST').
- host: a host name (e.g. 'www.google.com').
- full_path: a request path. Includes everything after the host & port in
- the URL (e.g. '/search?q=dogs').
- request_body: a request body string for a POST or None.
- headers: {key: value, ...} where key and value are strings.
-      is_ssl: a boolean which is True iff the request is made via SSL.
- """
- self.command = command
- self.host = host
- self.full_path = full_path
- parsed_url = urlparse.urlparse(full_path) if full_path else None
- self.path = parsed_url.path if parsed_url else None
- self.request_body = request_body
- self.headers = headers
- self.is_ssl = is_ssl
- self.trimmed_headers = self._TrimHeaders(headers)
- self.formatted_request = self._GetFormattedRequest()
- self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None)
-
- def __str__(self):
- scheme = 'https' if self.is_ssl else 'http'
- return '%s %s://%s%s %s' % (
- self.command, scheme, self.host, self.full_path, self.trimmed_headers)
-
- def __repr__(self):
- return repr((self.command, self.host, self.full_path, self.request_body,
- self.trimmed_headers, self.is_ssl))
-
- def __hash__(self):
-    """Return an integer hash to use for hashed collections including dict."""
- return hash(repr(self))
-
- def __eq__(self, other):
- """Define the __eq__ method to match the hash behavior."""
- return repr(self) == repr(other)
-
- def __setstate__(self, state):
- """Influence how to unpickle.
-
- "headers" are the original request headers.
- "trimmed_headers" are the trimmed headers used for matching requests
- during replay.
-
- Args:
- state: a dictionary for __dict__
- """
- if 'full_headers' in state:
- # Fix older version of archive.
- state['headers'] = state['full_headers']
- del state['full_headers']
- if 'headers' not in state:
- raise HttpArchiveException(
- 'Archived HTTP request is missing "headers". The HTTP archive is'
- ' likely from a previous version and must be re-recorded.')
- if 'path' in state:
- # before, 'path' and 'path_without_query' were used and 'path' was
- # pickled. Now, 'path' has been renamed to 'full_path' and
- # 'path_without_query' has been renamed to 'path'. 'full_path' is
- # pickled, but 'path' is not. If we see 'path' here it means we are
- # dealing with an older archive.
- state['full_path'] = state['path']
- del state['path']
- state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
- if 'is_ssl' not in state:
- state['is_ssl'] = False
- self.__dict__.update(state)
- parsed_url = urlparse.urlparse(self.full_path)
- self.path = parsed_url.path
- self.formatted_request = self._GetFormattedRequest()
- self.cmp_seq = self._GetCmpSeq(parsed_url.query)
-
- def __getstate__(self):
- """Influence how to pickle.
-
- Returns:
- a dict to use for pickling
- """
- state = self.__dict__.copy()
- del state['trimmed_headers']
- del state['path']
- del state['formatted_request']
- del state['cmp_seq']
- return state
-
- def _GetFormattedRequest(self):
- """Format request to make diffs easier to read.
-
- Returns:
- A string consisting of the request. Example:
- 'GET www.example.com/path\nHeader-Key: header value\n'
- """
- parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
- if self.request_body:
- parts.append('%s\n' % self.request_body)
- for k, v in self.trimmed_headers:
- k = '-'.join(x.capitalize() for x in k.split('-'))
- parts.append('%s: %s\n' % (k, v))
- return ''.join(parts)
-
- def _GetCmpSeq(self, query=None):
- """Compute a sequence out of query and header for difflib to compare.
- For example:
- [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
- will be returned for a request with URL:
-      http://example.com/index.html?q1=a1&q2=a2
- and header:
- k1: v1
- k2: v2
-
- Args:
- query: the query string in the URL.
-
- Returns:
- A sequence for difflib to compare.
- """
- if not query:
- return self.trimmed_headers
- return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers
-
- def matches(self, command=None, host=None, full_path=None, is_ssl=None,
- use_query=True):
- """Returns true iff the request matches all parameters.
-
- Args:
- command: a string (e.g. 'GET' or 'POST').
- host: a host name (e.g. 'www.google.com').
- full_path: a request path with query string (e.g. '/search?q=dogs')
- is_ssl: whether the request is secure.
- use_query:
- If use_query is True, request matching uses both the hierarchical path
- and query string component.
- If use_query is False, request matching only uses the hierarchical path
-
- e.g. req1 = GET www.test.com/index?aaaa
- req2 = GET www.test.com/index?bbbb
-
- If use_query is True, req1.matches(req2) evaluates to False
- If use_query is False, req1.matches(req2) evaluates to True
-
- Returns:
- True iff the request matches all parameters
- """
- if command is not None and command != self.command:
- return False
- if is_ssl is not None and is_ssl != self.is_ssl:
- return False
- if host is not None and host != self.host:
- return False
- if full_path is None:
- return True
- if use_query:
- return full_path == self.full_path
- else:
- return self.path == urlparse.urlparse(full_path).path
-
- @classmethod
- def _TrimHeaders(cls, headers):
- """Removes headers that are known to cause problems during replay.
-
- These headers are removed for the following reasons:
- - accept: Causes problems with www.bing.com. During record, CSS is fetched
- with *. During replay, it's text/css.
- - accept-charset, accept-language, referer: vary between clients.
- - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
- - connection, method, scheme, url, version: Cause problems with spdy.
- - cookie: Extremely sensitive to request/response order.
- - keep-alive: Doesn't affect the content of the request, only some
- transient state of the transport layer.
- - user-agent: Changes with every Chrome version.
- - proxy-connection: Sent for proxy requests.
- - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
- Google to collect statistics about Chrome's enabled features.
-
- Another variant to consider is dropping only the value from the header.
- However, this is particularly bad for the cookie header, because the
- presence of the cookie depends on the responses we've seen when the request
- is made.
-
- Args:
- headers: {header_key: header_value, ...}
-
- Returns:
- [(header_key, header_value), ...] # (with undesirable headers removed)
- """
- # TODO(tonyg): Strip sdch from the request headers because we can't
- # guarantee that the dictionary will be recorded, so replay may not work.
- if 'accept-encoding' in headers:
- accept_encoding = headers['accept-encoding']
- accept_encoding = accept_encoding.replace('sdch', '')
-      # Strip lzma so Opera's requests match archives recorded using Chrome.
- accept_encoding = accept_encoding.replace('lzma', '')
- stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
- accept_encoding = ','.join(filter(bool, stripped_encodings))
- headers['accept-encoding'] = accept_encoding
- undesirable_keys = [
- 'accept', 'accept-charset', 'accept-language', 'cache-control',
- 'connection', 'cookie', 'keep-alive', 'method',
- 'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
- 'x-chrome-variations', 'x-client-data']
- return sorted([(k, v) for k, v in headers.items()
- if k.lower() not in undesirable_keys])
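-  # For example (illustrative): _TrimHeaders({'host': 'a.com', 'cookie': 'id=1',
-  # 'accept-encoding': 'gzip, sdch'}) drops the cookie, strips 'sdch', and
-  # returns [('accept-encoding', 'gzip'), ('host', 'a.com')].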
-
- def is_conditional(self):
-    """Return True if the request includes any conditional headers."""
- for header in self.CONDITIONAL_HEADERS:
- if header in self.headers:
- return True
- return False
-
- def create_request_without_conditions(self):
- stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
- if k.lower() not in self.CONDITIONAL_HEADERS)
- return ArchivedHttpRequest(
- self.command, self.host, self.full_path, self.request_body,
- stripped_headers, self.is_ssl)
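-# Illustrative sketch (not part of the original file): two requests that differ
-# only in a trimmed header such as 'user-agent' compare equal, so they map to
-# the same archived response:
-#
-#   req_a = ArchivedHttpRequest('GET', 'www.example.com', '/', None,
-#                               {'user-agent': 'Chrome/40'})
-#   req_b = ArchivedHttpRequest('GET', 'www.example.com', '/', None,
-#                               {'user-agent': 'Chrome/41'})
-#   assert req_a == req_b  # 'user-agent' is dropped by _TrimHeaders()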
-
-class ArchivedHttpResponse(object):
-  """All the data needed to recreate an HTTP response."""
-
- # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
- # It is not sent in responses. It is added by get_data_as_text()
- # and removed by set_data().
- CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'
-
- # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
- DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
- 'Delays are above. Response content is below.]\n')
-
- def __init__(self, version, status, reason, headers, response_data,
- delays=None):
- """Initialize an ArchivedHttpResponse.
-
- Args:
- version: HTTP protocol version used by server.
- 10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
- status: Status code returned by server (e.g. 200).
- reason: Reason phrase returned by server (e.g. "OK").
- headers: list of (header, value) tuples.
- response_data: list of content chunks.
- Concatenating the chunks gives the complete contents
- (i.e. the chunks do not have any lengths or delimiters).
- Do not include the final, zero-length chunk that marks the end.
- delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
- e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
- connect - The time to connect to the server.
- Each resource has a value because Replay's record mode captures it.
- This includes the time for the SYN and SYN/ACK (1 rtt).
- headers - The time elapsed between the TCP connect and the headers.
- This typically includes all the server-time to generate a response.
- data - If the response is chunked, these are the times for each chunk.
- """
- self.version = version
- self.status = status
- self.reason = reason
- self.headers = headers
- self.response_data = response_data
- self.delays = delays
- self.fix_delays()
-
- def fix_delays(self):
- """Initialize delays, or check the number of data delays."""
- expected_num_delays = len(self.response_data)
- if not self.delays:
- self.delays = {
- 'connect': 0,
- 'headers': 0,
- 'data': [0] * expected_num_delays
- }
- else:
- num_delays = len(self.delays['data'])
- if num_delays != expected_num_delays:
- raise HttpArchiveException(
-            'Server delay length mismatch: %d (expected %d): %s' % (
-                num_delays, expected_num_delays, self.delays['data']))
-
- def __repr__(self):
- return repr((self.version, self.status, self.reason, sorted(self.headers),
- self.response_data))
-
- def __hash__(self):
-    """Return an integer hash to use for hashed collections including dict."""
- return hash(repr(self))
-
- def __eq__(self, other):
- """Define the __eq__ method to match the hash behavior."""
- return repr(self) == repr(other)
-
- def __setstate__(self, state):
- """Influence how to unpickle.
-
- Args:
- state: a dictionary for __dict__
- """
- if 'server_delays' in state:
- state['delays'] = {
- 'connect': 0,
- 'headers': 0,
- 'data': state['server_delays']
- }
- del state['server_delays']
- elif 'delays' not in state:
- state['delays'] = None
- self.__dict__.update(state)
- self.fix_delays()
-
- def get_header(self, key, default=None):
- for k, v in self.headers:
- if key.lower() == k.lower():
- return v
- return default
-
- def set_header(self, key, value):
- for i, (k, v) in enumerate(self.headers):
- if key == k:
- self.headers[i] = (key, value)
- return
- self.headers.append((key, value))
-
- def remove_header(self, key):
- for i, (k, v) in enumerate(self.headers):
- if key.lower() == k.lower():
- self.headers.pop(i)
- return
-
- @staticmethod
- def _get_epoch_seconds(date_str):
- """Return the epoch seconds of a date header.
-
- Args:
- date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
- Returns:
- epoch seconds as a float
- """
- date_tuple = email.utils.parsedate(date_str)
- if date_tuple:
- return calendar.timegm(date_tuple)
- return None
-
- def update_date(self, date_str, now=None):
- """Return an updated date based on its delta from the "Date" header.
-
- For example, if |date_str| is one week later than the "Date" header,
- then the returned date string is one week later than the current date.
-
- Args:
- date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
- Returns:
- a date string
- """
- date_seconds = self._get_epoch_seconds(self.get_header('date'))
- header_seconds = self._get_epoch_seconds(date_str)
- if date_seconds and header_seconds:
- updated_seconds = header_seconds + (now or time.time()) - date_seconds
- return email.utils.formatdate(updated_seconds, usegmt=True)
- return date_str
-
- def is_gzip(self):
- return self.get_header('content-encoding') == 'gzip'
-
- def is_compressed(self):
- return self.get_header('content-encoding') in ('gzip', 'deflate')
-
- def is_chunked(self):
- return self.get_header('transfer-encoding') == 'chunked'
-
- def get_data_as_text(self):
- """Return content as a single string.
-
- Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
- """
- content_type = self.get_header('content-type')
- if (not content_type or
- not (content_type.startswith('text/') or
- content_type == 'application/x-javascript' or
- content_type.startswith('application/json'))):
- return None
- if self.is_compressed():
- uncompressed_chunks = httpzlib.uncompress_chunks(
- self.response_data, self.is_gzip())
- else:
- uncompressed_chunks = self.response_data
- return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)
-
- def get_delays_as_text(self):
- """Return delays as editable text."""
- return json.dumps(self.delays, indent=2)
-
- def get_response_as_text(self):
- """Returns response content as a single string.
-
-    Server delays are separated on a per-chunk basis. Delays are in
-    milliseconds. Response content begins after DELAY_EDIT_SEPARATOR.
- """
- data = self.get_data_as_text()
- if data is None:
- logging.warning('Data can not be represented as text.')
- data = ''
- delays = self.get_delays_as_text()
- return self.DELAY_EDIT_SEPARATOR.join((delays, data))
-
- def set_data(self, text):
- """Inverse of get_data_as_text().
-
- Split on CHUNK_EDIT_SEPARATOR and compress if needed.
- """
- text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
- if self.is_compressed():
- self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
- else:
- self.response_data = text_chunks
- if not self.is_chunked():
- content_length = sum(len(c) for c in self.response_data)
- self.set_header('content-length', str(content_length))
-
- def set_delays(self, delays_text):
- """Inverse of get_delays_as_text().
-
- Args:
- delays_text: JSON encoded text such as the following:
- {
-            "connect": 80,
-            "headers": 80,
-            "data": [6, 55, 0]
- }
- Times are in milliseconds.
- Each data delay corresponds with one response_data value.
- """
- try:
- self.delays = json.loads(delays_text)
- except (ValueError, KeyError) as e:
- logging.critical('Unable to parse delays %s: %s', delays_text, e)
- self.fix_delays()
-
- def set_response_from_text(self, text):
- """Inverse of get_response_as_text().
-
- Modifies the state of the archive according to the textual representation.
- """
- try:
- delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
- except ValueError:
- logging.critical(
- 'Error parsing text representation. Skipping edits.')
- return
- self.set_delays(delays)
- self.set_data(data)
-
-
-def create_response(status, reason=None, headers=None, body=None):
- """Convenience method for creating simple ArchivedHttpResponse objects."""
- if reason is None:
- reason = httplib.responses.get(status, 'Unknown')
- if headers is None:
- headers = [('content-type', 'text/plain')]
- if body is None:
- body = "%s %s" % (status, reason)
- return ArchivedHttpResponse(11, status, reason, headers, [body])
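-# For example (illustrative): create_response(404) produces an
-# ArchivedHttpResponse with reason 'Not Found', a single
-# ('content-type', 'text/plain') header, and the body '404 Not Found'.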
-
-
-def main():
- class PlainHelpFormatter(optparse.IndentedHelpFormatter):
- def format_description(self, description):
- if description:
- return description + '\n'
- else:
- return ''
-
- option_parser = optparse.OptionParser(
- usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
- formatter=PlainHelpFormatter(),
- description=__doc__,
- epilog='http://code.google.com/p/web-page-replay/')
-
- option_parser.add_option('-c', '--command', default=None,
- action='store',
- type='string',
- help='Only show URLs matching this command.')
- option_parser.add_option('-o', '--host', default=None,
- action='store',
- type='string',
- help='Only show URLs matching this host.')
- option_parser.add_option('-p', '--full_path', default=None,
- action='store',
- type='string',
- help='Only show URLs matching this full path.')
- option_parser.add_option('-f', '--merged_file', default=None,
- action='store',
- type='string',
- help='The output file to use when using the merge command.')
-
- options, args = option_parser.parse_args()
-
-  # The merge command expects an unlimited number of archives.
- if len(args) < 2:
- print 'args: %s' % args
- option_parser.error('Must specify a command and replay_file')
-
- command = args[0]
- replay_file = args[1]
-
- if not os.path.exists(replay_file):
- option_parser.error('Replay file "%s" does not exist' % replay_file)
-
- http_archive = HttpArchive.Load(replay_file)
- if command == 'ls':
- print http_archive.ls(options.command, options.host, options.full_path)
- elif command == 'cat':
- print http_archive.cat(options.command, options.host, options.full_path)
- elif command == 'stats':
- print http_archive.stats(options.command, options.host, options.full_path)
- elif command == 'merge':
- if not options.merged_file:
- print 'Error: Must specify a merged file name (use --merged_file)'
- return
- http_archive.merge(options.merged_file, args[2:])
- elif command == 'edit':
- http_archive.edit(options.command, options.host, options.full_path)
- http_archive.Persist(replay_file)
- else:
- option_parser.error('Unknown command "%s"' % command)
- return 0
-
-
-if __name__ == '__main__':
- sys.exit(main())
