#!/usr/bin/env python
# Copyright 2010 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""View and edit HTTP Archives.

To list all URLs in an archive:
  $ ./httparchive.py ls archive.wpr

To view the content of all URLs from example.com:
  $ ./httparchive.py cat --host example.com archive.wpr

To view the content of a particular URL:
  $ ./httparchive.py cat --host www.example.com --full_path /foo archive.wpr

To view the content of all URLs:
  $ ./httparchive.py cat archive.wpr

To edit a particular URL:
  $ ./httparchive.py edit --host www.example.com --full_path /foo archive.wpr

To print statistics of an archive:
  $ ./httparchive.py stats archive.wpr

To print statistics of a set of URLs:
  $ ./httparchive.py stats --host www.example.com archive.wpr

To merge multiple archives:
  $ ./httparchive.py merge --merged_file new.wpr archive1.wpr archive2.wpr ...
"""

import calendar
import certutils
import cPickle
import difflib
import email.utils
import httplib
import httpzlib
import json
import logging
import optparse
import os
import StringIO
import subprocess
import sys
import tempfile
import time
import urlparse
from collections import defaultdict


def LogRunTime(fn):
65 """Annotation which logs the run time of the function.""" | |
  def wrapped(self, *args, **kwargs):
    start_time = time.time()
    try:
      return fn(self, *args, **kwargs)
    finally:
      run_time = (time.time() - start_time) * 1000.0
      logging.debug('%s: %dms', fn.__name__, run_time)
  return wrapped

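# Example (illustrative sketch; the class and method names are hypothetical):
# LogRunTime is written for instance methods, since the wrapper's first
# positional argument is 'self'.
#
#   class Fetcher(object):
#     @LogRunTime
#     def fetch(self, url):
#       ...  # elapsed time is logged at debug level as 'fetch: <n>ms'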

class HttpArchiveException(Exception):
  """Base class for all exceptions in httparchive."""
  pass


class HttpArchive(dict):
  """Dict with ArchivedHttpRequest keys and ArchivedHttpResponse values.

  Attributes:
    responses_by_host: dict of {hostname: {request: response}}. This must remain
        in sync with the underlying dict of self. It is used as an optimization
        so that get_requests() doesn't have to linearly search all requests in
        the archive to find potential matches.
  """

  def __init__(self):  # pylint: disable=super-init-not-called
    self.responses_by_host = defaultdict(dict)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    self.__dict__.update(state)
    self.responses_by_host = defaultdict(dict)
    for request in self:
      self.responses_by_host[request.host][request] = self[request]

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['responses_by_host']
    return state

  def __setitem__(self, key, value):
    super(HttpArchive, self).__setitem__(key, value)
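    # Note: the hasattr() guard below matters during unpickling. For dict
    # subclasses, pickle restores the dict items through __setitem__ before
    # __setstate__ has run, so responses_by_host may not exist yet.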
    if hasattr(self, 'responses_by_host'):
      self.responses_by_host[key.host][key] = value

  def __delitem__(self, key):
    super(HttpArchive, self).__delitem__(key)
    del self.responses_by_host[key.host][key]

  def get(self, request, default=None):
    """Return the archived response for a given request.

    Does extra checking for handling some HTTP request headers.

    Args:
      request: instance of ArchivedHttpRequest
      default: default value to return if request is not found

    Returns:
      Instance of ArchivedHttpResponse or default if no matching
      response is found
    """
    if request in self:
      return self[request]
    return self.get_conditional_response(request, default)

  def get_conditional_response(self, request, default):
    """Get the response based on the conditional HTTP request headers.

    Args:
      request: an ArchivedHttpRequest representing the original request.
      default: the ArchivedHttpResponse to return if no archived response
        matches the original request with the conditional headers removed.

    Returns:
      an ArchivedHttpResponse with a status of 200 (OK), 304 (not modified),
      or 412 (precondition failed)
152 """ | |
153 response = default | |
154 if request.is_conditional(): | |
155 stripped_request = request.create_request_without_conditions() | |
156 if stripped_request in self: | |
157 response = self[stripped_request] | |
158 if response.status == 200: | |
159 status = self.get_conditional_status(request, response) | |
160 if status != 200: | |
161 response = create_response(status) | |
162 return response | |
163 | |
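  # get_conditional_status evaluates the conditional headers in order:
  # If-Match, then If-None-Match (both compared against the archived ETag),
  # then, for GET/HEAD requests, If-Modified-Since / If-Unmodified-Since
  # (compared against the archived Last-Modified date as shifted by
  # update_date()).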
  def get_conditional_status(self, request, response):
    status = 200
    last_modified = email.utils.parsedate(
        response.update_date(response.get_header('last-modified')))
    response_etag = response.get_header('etag')
    is_get_or_head = request.command.upper() in ('GET', 'HEAD')

    match_value = request.headers.get('if-match', None)
    if match_value:
      if self.is_etag_match(match_value, response_etag):
        status = 200
      else:
        status = 412  # precondition failed
    none_match_value = request.headers.get('if-none-match', None)
    if none_match_value:
      if self.is_etag_match(none_match_value, response_etag):
        status = 304
      elif is_get_or_head:
        status = 200
      else:
        status = 412
    if is_get_or_head and last_modified:
      for header in ('if-modified-since', 'if-unmodified-since'):
        date = email.utils.parsedate(request.headers.get(header, None))
        if date:
          if ((header == 'if-modified-since' and last_modified > date) or
              (header == 'if-unmodified-since' and last_modified < date)):
            if status != 412:
              status = 200
          else:
            status = 304  # not modified
    return status

  @staticmethod
  def is_etag_match(request_etag, response_etag):
199 """Determines whether the entity tags of the request/response matches. | |

    Args:
      request_etag: the value string of the "if-(none)-match:"
          portion of the request header
      response_etag: the etag value of the response

    Returns:
      True on match, False otherwise
    """
    response_etag = response_etag.strip('" ')
    for etag in request_etag.split(','):
      etag = etag.strip('" ')
      if etag in ('*', response_etag):
        return True
    return False

  def get_requests(self, command=None, host=None, full_path=None, is_ssl=None,
                   use_query=True):
    """Return a list of requests that match the given args."""
    if host:
      return [r for r in self.responses_by_host[host]
              if r.matches(command, None, full_path, is_ssl,
                           use_query=use_query)]
    else:
      return [r for r in self
              if r.matches(command, host, full_path, is_ssl,
                           use_query=use_query)]

  def ls(self, command=None, host=None, full_path=None):
    """List all URLs that match given params."""
    return ''.join(sorted(
        '%s\n' % r for r in self.get_requests(command, host, full_path)))

  def cat(self, command=None, host=None, full_path=None):
    """Print the contents of all URLs that match given params."""
    out = StringIO.StringIO()
    for request in self.get_requests(command, host, full_path):
      print >>out, str(request)
      print >>out, 'Untrimmed request headers:'
      for k in request.headers:
        print >>out, '    %s: %s' % (k, request.headers[k])
      if request.request_body:
        print >>out, request.request_body
      print >>out, '---- Response Info', '-' * 51
      response = self[request]
      chunk_lengths = [len(x) for x in response.response_data]
      print >>out, ('Status: %s\n'
                    'Reason: %s\n'
                    'Headers delay: %s\n'
                    'Response headers:') % (
          response.status, response.reason, response.delays['headers'])
      for k, v in response.headers:
        print >>out, '    %s: %s' % (k, v)
      print >>out, ('Chunk count: %s\n'
                    'Chunk lengths: %s\n'
                    'Chunk delays: %s') % (
          len(chunk_lengths), chunk_lengths, response.delays['data'])
      body = response.get_data_as_text()
      print >>out, '---- Response Data', '-' * 51
      if body:
        print >>out, body
      else:
        print >>out, '[binary data]'
      print >>out, '=' * 70
    return out.getvalue()

  def stats(self, command=None, host=None, full_path=None):
    """Print stats about the archive for all URLs that match given params."""
    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print 'Failed to find any requests matching given command, host, path.'
      return

    out = StringIO.StringIO()
    stats = {
        'Total': len(matching_requests),
        'Domains': defaultdict(int),
        'HTTP_response_code': defaultdict(int),
        'content_type': defaultdict(int),
        'Documents': defaultdict(int),
        }

    for request in matching_requests:
      stats['Domains'][request.host] += 1
      stats['HTTP_response_code'][self[request].status] += 1

      content_type = self[request].get_header('content-type')
      # Remove content type options for readability and higher level groupings.
      str_content_type = str(content_type.split(';')[0]
                            if content_type else None)
      stats['content_type'][str_content_type] += 1

      # Documents are the main URL requested and not a referenced resource.
      if str_content_type == 'text/html' and 'referer' not in request.headers:
        stats['Documents'][request.host] += 1

    print >>out, json.dumps(stats, indent=4)
    return out.getvalue()

  def merge(self, merged_archive=None, other_archives=None):
300 """Merge multiple archives into merged_archive by 'chaining' resources, | |
301 only resources that are not part of the accumlated archive are added""" | |
    if not other_archives:
      print 'No archives passed to merge'
      return

    # Note we already loaded 'replay_file'.
    print 'Loaded %d responses' % len(self)

    for archive in other_archives:
      if not os.path.exists(archive):
        print 'Error: Replay file "%s" does not exist' % archive
        return

      http_archive_other = HttpArchive.Load(archive)
      print 'Loaded %d responses from %s' % (len(http_archive_other), archive)
      for r in http_archive_other:
        # Only resources that are not already part of the current archive
        # get added.
        if r not in self:
          print '\t %s ' % r
          self[r] = http_archive_other[r]
    self.Persist('%s' % merged_archive)

  def edit(self, command=None, host=None, full_path=None):
    """Edits the single request which matches given params."""
    editor = os.getenv('EDITOR')
    if not editor:
      print 'You must set the EDITOR environment variable.'
      return

    matching_requests = self.get_requests(command, host, full_path)
    if not matching_requests:
      print ('Failed to find any requests matching given command, host, '
             'full_path.')
      return

    if len(matching_requests) > 1:
      print 'Found multiple matching requests. Please refine.'
      print self.ls(command, host, full_path)

    response = self[matching_requests[0]]
    tmp_file = tempfile.NamedTemporaryFile(delete=False)
    tmp_file.write(response.get_response_as_text())
    tmp_file.close()
    subprocess.check_call([editor, tmp_file.name])
    response.set_response_from_text(''.join(open(tmp_file.name).readlines()))
    os.remove(tmp_file.name)

  def find_closest_request(self, request, use_path=False):
    """Find the closest matching request in the archive to the given request.

    Args:
      request: an ArchivedHttpRequest
      use_path: If True, closest matching request's path component must match.
        (Note: this refers to the 'path' component within the URL, not the
         'full path' which includes the query string component.)

        If use_path=True, candidate will NOT match in example below
        e.g. request = GET www.test.com/a?p=1
             candidate = GET www.test.com/b?p=1

        Even if use_path=False, urls with same paths are always favored.
        For example, candidate1 is considered a better match than candidate2.
          request    = GET www.test.com/a?p=1&q=2&r=3
          candidate1 = GET www.test.com/a?s=4
          candidate2 = GET www.test.com/b?p=1&q=2&r=3

    Returns:
      If a close match is found, return the instance of ArchivedHttpRequest.
      Otherwise, return None.
    """
    # Start with strictest constraints. This trims search space considerably.
    requests = self.get_requests(request.command, request.host,
                                 request.full_path, is_ssl=request.is_ssl,
                                 use_query=True)
    # Relax constraint: use_query if there is no match.
    if not requests:
      requests = self.get_requests(request.command, request.host,
                                   request.full_path, is_ssl=request.is_ssl,
                                   use_query=False)
    # Relax constraint: full_path if there is no match and use_path=False.
    if not requests and not use_path:
      requests = self.get_requests(request.command, request.host,
                                   None, is_ssl=request.is_ssl,
                                   use_query=False)

    if not requests:
      return None

    if len(requests) == 1:
      return requests[0]

    matcher = difflib.SequenceMatcher(b=request.cmp_seq)

    # quick_ratio() is cheap to compute, but ratio() is expensive. So we call
    # quick_ratio() on all requests, sort them descending, and then loop through
    # until we find a candidate whose ratio() is >= the next quick_ratio().
    # This works because quick_ratio() is guaranteed to be an upper bound on
    # ratio().
    candidates = []
    for candidate in requests:
      matcher.set_seq1(candidate.cmp_seq)
      candidates.append((matcher.quick_ratio(), candidate))

    candidates.sort(reverse=True, key=lambda c: c[0])

    best_match = (0, None)
    for i in xrange(len(candidates)):
      matcher.set_seq1(candidates[i][1].cmp_seq)
      best_match = max(best_match, (matcher.ratio(), candidates[i][1]))
      if i + 1 < len(candidates) and best_match[0] >= candidates[i+1][0]:
        break
    return best_match[1]

  def diff(self, request):
    """Diff the given request to the closest matching request in the archive.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      If a close match is found, return a textual diff between the requests.
      Otherwise, return None.
    """
    request_lines = request.formatted_request.split('\n')
    closest_request = self.find_closest_request(request)
    if closest_request:
      closest_request_lines = closest_request.formatted_request.split('\n')
      return '\n'.join(difflib.ndiff(closest_request_lines, request_lines))
    return None

  def get_server_cert(self, host):
432 """Gets certificate from the server and stores it in archive""" | |
    request = ArchivedHttpRequest('SERVER_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=certutils.get_host_cert(host))
    return self[request].response_data[0]

  def get_certificate(self, host):
    request = ArchivedHttpRequest('DUMMY_CERT', host, '', None, {})
    if request not in self:
      self[request] = create_response(200, body=self._generate_cert(host))
    return self[request].response_data[0]

  @classmethod
  def AssertWritable(cls, filename):
    """Raises an IOError if filename is not writable."""
    persist_dir = os.path.dirname(os.path.abspath(filename))
    if not os.path.exists(persist_dir):
      raise IOError('Directory does not exist: %s' % persist_dir)
    if os.path.exists(filename):
      if not os.access(filename, os.W_OK):
        raise IOError('Need write permission on file: %s' % filename)
    elif not os.access(persist_dir, os.W_OK):
      raise IOError('Need write permission on directory: %s' % persist_dir)

  @classmethod
  def Load(cls, filename):
    """Load an instance from filename."""
    return cPickle.load(open(filename, 'rb'))

  def Persist(self, filename):
    """Persist all state to filename."""
    try:
      original_checkinterval = sys.getcheckinterval()
      sys.setcheckinterval(2**31-1)  # Lock out other threads so nothing can
                                     # modify |self| during pickling.
      pickled_self = cPickle.dumps(self, cPickle.HIGHEST_PROTOCOL)
    finally:
      sys.setcheckinterval(original_checkinterval)
    with open(filename, 'wb') as f:
      f.write(pickled_self)


class ArchivedHttpRequest(object):
  """Record all the state that goes into a request.

  ArchivedHttpRequest instances are considered immutable so they can
  serve as keys for HttpArchive instances.
  (The immutability is not enforced.)

  Upon creation, the headers are "trimmed" (i.e. edited or dropped)
  and saved to self.trimmed_headers to allow requests to match in a wider
  variety of playback situations (e.g. using different user agents).

  For unpickling, 'trimmed_headers' is recreated from 'headers'. That
  allows for changes to the trim function and can help with debugging.
  """
  CONDITIONAL_HEADERS = [
      'if-none-match', 'if-match',
      'if-modified-since', 'if-unmodified-since']

  def __init__(self, command, host, full_path, request_body, headers,
               is_ssl=False):
    """Initialize an ArchivedHttpRequest.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path. Includes everything after the host & port in
          the URL (e.g. '/search?q=dogs').
      request_body: a request body string for a POST or None.
      headers: {key: value, ...} where key and value are strings.
      is_ssl: a boolean which is True iff the request is made via SSL.
504 """ | |
505 self.command = command | |
506 self.host = host | |
507 self.full_path = full_path | |
508 parsed_url = urlparse.urlparse(full_path) if full_path else None | |
509 self.path = parsed_url.path if parsed_url else None | |
510 self.request_body = request_body | |
511 self.headers = headers | |
512 self.is_ssl = is_ssl | |
513 self.trimmed_headers = self._TrimHeaders(headers) | |
514 self.formatted_request = self._GetFormattedRequest() | |
515 self.cmp_seq = self._GetCmpSeq(parsed_url.query if parsed_url else None) | |
516 | |
517 def __str__(self): | |
518 scheme = 'https' if self.is_ssl else 'http' | |
519 return '%s %s://%s%s %s' % ( | |
520 self.command, scheme, self.host, self.full_path, self.trimmed_headers) | |
521 | |
522 def __repr__(self): | |
523 return repr((self.command, self.host, self.full_path, self.request_body, | |
524 self.trimmed_headers, self.is_ssl)) | |
525 | |
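  # __hash__ and __eq__ are derived from __repr__, which uses trimmed_headers
  # rather than the raw headers; requests that differ only in headers dropped
  # by _TrimHeaders() are therefore treated as the same archive key.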
  def __hash__(self):
527 """Return a integer hash to use for hashed collections including dict.""" | |
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    "headers" are the original request headers.
    "trimmed_headers" are the trimmed headers used for matching requests
    during replay.

    Args:
      state: a dictionary for __dict__
    """
    if 'full_headers' in state:
      # Fix older version of archive.
      state['headers'] = state['full_headers']
      del state['full_headers']
    if 'headers' not in state:
      raise HttpArchiveException(
          'Archived HTTP request is missing "headers". The HTTP archive is'
          ' likely from a previous version and must be re-recorded.')
    if 'path' in state:
      # before, 'path' and 'path_without_query' were used and 'path' was
      # pickled. Now, 'path' has been renamed to 'full_path' and
      # 'path_without_query' has been renamed to 'path'. 'full_path' is
      # pickled, but 'path' is not. If we see 'path' here it means we are
      # dealing with an older archive.
      state['full_path'] = state['path']
      del state['path']
    state['trimmed_headers'] = self._TrimHeaders(dict(state['headers']))
    if 'is_ssl' not in state:
      state['is_ssl'] = False
    self.__dict__.update(state)
    parsed_url = urlparse.urlparse(self.full_path)
    self.path = parsed_url.path
    self.formatted_request = self._GetFormattedRequest()
    self.cmp_seq = self._GetCmpSeq(parsed_url.query)

  def __getstate__(self):
    """Influence how to pickle.

    Returns:
      a dict to use for pickling
    """
    state = self.__dict__.copy()
    del state['trimmed_headers']
    del state['path']
    del state['formatted_request']
    del state['cmp_seq']
    return state

  def _GetFormattedRequest(self):
    """Format request to make diffs easier to read.

    Returns:
      A string consisting of the request. Example:
      'GET www.example.com/path\nHeader-Key: header value\n'
    """
    parts = ['%s %s%s\n' % (self.command, self.host, self.full_path)]
    if self.request_body:
      parts.append('%s\n' % self.request_body)
    for k, v in self.trimmed_headers:
      k = '-'.join(x.capitalize() for x in k.split('-'))
      parts.append('%s: %s\n' % (k, v))
    return ''.join(parts)

  def _GetCmpSeq(self, query=None):
    """Compute a sequence out of query and header for difflib to compare.
    For example:
      [('q1', 'a1'), ('q2', 'a2'), ('k1', 'v1'), ('k2', 'v2')]
    will be returned for a request with URL:
      http://example.com/index.html?q1=a2&q2=a2
    and header:
      k1: v1
      k2: v2

    Args:
      query: the query string in the URL.

    Returns:
      A sequence for difflib to compare.
    """
    if not query:
      return self.trimmed_headers
    return sorted(urlparse.parse_qsl(query)) + self.trimmed_headers

  def matches(self, command=None, host=None, full_path=None, is_ssl=None,
              use_query=True):
    """Returns true iff the request matches all parameters.

    Args:
      command: a string (e.g. 'GET' or 'POST').
      host: a host name (e.g. 'www.google.com').
      full_path: a request path with query string (e.g. '/search?q=dogs')
      is_ssl: whether the request is secure.
      use_query:
        If use_query is True, request matching uses both the hierarchical path
        and query string component.
        If use_query is False, request matching only uses the hierarchical path

        e.g. req1 = GET www.test.com/index?aaaa
             req2 = GET www.test.com/index?bbbb

        If use_query is True, req1.matches(req2) evaluates to False
        If use_query is False, req1.matches(req2) evaluates to True

    Returns:
      True iff the request matches all parameters
    """
    if command is not None and command != self.command:
      return False
    if is_ssl is not None and is_ssl != self.is_ssl:
      return False
    if host is not None and host != self.host:
      return False
    if full_path is None:
      return True
    if use_query:
      return full_path == self.full_path
    else:
      return self.path == urlparse.urlparse(full_path).path

  @classmethod
  def _TrimHeaders(cls, headers):
    """Removes headers that are known to cause problems during replay.

    These headers are removed for the following reasons:
    - accept: Causes problems with www.bing.com. During record, CSS is fetched
        with *. During replay, it's text/css.
    - accept-charset, accept-language, referer: vary between clients.
    - cache-control: sometimes sent from Chrome with 'max-age=0' as value.
    - connection, method, scheme, url, version: Cause problems with spdy.
    - cookie: Extremely sensitive to request/response order.
    - keep-alive: Doesn't affect the content of the request, only some
        transient state of the transport layer.
    - user-agent: Changes with every Chrome version.
    - proxy-connection: Sent for proxy requests.
    - x-chrome-variations, x-client-data: Unique to each Chrome binary. Used by
        Google to collect statistics about Chrome's enabled features.

    Another variant to consider is dropping only the value from the header.
    However, this is particularly bad for the cookie header, because the
    presence of the cookie depends on the responses we've seen when the request
    is made.

    Args:
      headers: {header_key: header_value, ...}

    Returns:
      [(header_key, header_value), ...]  # (with undesirable headers removed)
    """
    # TODO(tonyg): Strip sdch from the request headers because we can't
    # guarantee that the dictionary will be recorded, so replay may not work.
    if 'accept-encoding' in headers:
      accept_encoding = headers['accept-encoding']
      accept_encoding = accept_encoding.replace('sdch', '')
      # Strip lzma so Opera's requests match archives recorded using Chrome.
      accept_encoding = accept_encoding.replace('lzma', '')
      stripped_encodings = [e.strip() for e in accept_encoding.split(',')]
      accept_encoding = ','.join(filter(bool, stripped_encodings))
      headers['accept-encoding'] = accept_encoding
    undesirable_keys = [
        'accept', 'accept-charset', 'accept-language', 'cache-control',
        'connection', 'cookie', 'keep-alive', 'method',
        'referer', 'scheme', 'url', 'version', 'user-agent', 'proxy-connection',
        'x-chrome-variations', 'x-client-data']
    return sorted([(k, v) for k, v in headers.items()
                   if k.lower() not in undesirable_keys])

  def is_conditional(self):
701 """Return list of headers that match conditional headers.""" | |
    for header in self.CONDITIONAL_HEADERS:
      if header in self.headers:
        return True
    return False

  def create_request_without_conditions(self):
    stripped_headers = dict((k, v) for k, v in self.headers.iteritems()
                            if k.lower() not in self.CONDITIONAL_HEADERS)
    return ArchivedHttpRequest(
        self.command, self.host, self.full_path, self.request_body,
        stripped_headers, self.is_ssl)


class ArchivedHttpResponse(object):
715 """All the data needed to recreate all HTTP response.""" | |

  # CHUNK_EDIT_SEPARATOR is used to edit and view text content.
  # It is not sent in responses. It is added by get_data_as_text()
  # and removed by set_data().
  CHUNK_EDIT_SEPARATOR = '[WEB_PAGE_REPLAY_CHUNK_BOUNDARY]'

  # DELAY_EDIT_SEPARATOR is used to edit and view server delays.
  DELAY_EDIT_SEPARATOR = ('\n[WEB_PAGE_REPLAY_EDIT_ARCHIVE --- '
                          'Delays are above. Response content is below.]\n')

  def __init__(self, version, status, reason, headers, response_data,
               delays=None):
    """Initialize an ArchivedHttpResponse.

    Args:
      version: HTTP protocol version used by server.
          10 for HTTP/1.0, 11 for HTTP/1.1 (same as httplib).
      status: Status code returned by server (e.g. 200).
      reason: Reason phrase returned by server (e.g. "OK").
      headers: list of (header, value) tuples.
      response_data: list of content chunks.
          Concatenating the chunks gives the complete contents
          (i.e. the chunks do not have any lengths or delimiters).
          Do not include the final, zero-length chunk that marks the end.
      delays: dict of (ms) delays for 'connect', 'headers' and 'data'.
          e.g. {'connect': 50, 'headers': 150, 'data': [0, 10, 10]}
          connect - The time to connect to the server.
            Each resource has a value because Replay's record mode captures it.
            This includes the time for the SYN and SYN/ACK (1 rtt).
          headers - The time elapsed between the TCP connect and the headers.
            This typically includes all the server-time to generate a response.
          data - If the response is chunked, these are the times for each chunk.
    """
    self.version = version
    self.status = status
    self.reason = reason
    self.headers = headers
    self.response_data = response_data
    self.delays = delays
    self.fix_delays()

  def fix_delays(self):
    """Initialize delays, or check the number of data delays."""
    expected_num_delays = len(self.response_data)
    if not self.delays:
      self.delays = {
          'connect': 0,
          'headers': 0,
          'data': [0] * expected_num_delays
          }
    else:
      num_delays = len(self.delays['data'])
      if num_delays != expected_num_delays:
        raise HttpArchiveException(
            'Server delay length mismatch: %d (expected %d): %s' % (
                num_delays, expected_num_delays, self.delays['data']))

  def __repr__(self):
    return repr((self.version, self.status, self.reason, sorted(self.headers),
                 self.response_data))

  def __hash__(self):
778 """Return a integer hash to use for hashed collections including dict.""" | |
    return hash(repr(self))

  def __eq__(self, other):
    """Define the __eq__ method to match the hash behavior."""
    return repr(self) == repr(other)

  def __setstate__(self, state):
    """Influence how to unpickle.

    Args:
      state: a dictionary for __dict__
    """
    if 'server_delays' in state:
      state['delays'] = {
          'connect': 0,
          'headers': 0,
          'data': state['server_delays']
          }
      del state['server_delays']
    elif 'delays' not in state:
      state['delays'] = None
    self.__dict__.update(state)
    self.fix_delays()

  def get_header(self, key, default=None):
    for k, v in self.headers:
      if key.lower() == k.lower():
        return v
    return default

  def set_header(self, key, value):
    for i, (k, v) in enumerate(self.headers):
      if key == k:
        self.headers[i] = (key, value)
        return
    self.headers.append((key, value))

  def remove_header(self, key):
    for i, (k, v) in enumerate(self.headers):
      if key.lower() == k.lower():
        self.headers.pop(i)
        return

  @staticmethod
  def _get_epoch_seconds(date_str):
    """Return the epoch seconds of a date header.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
    Returns:
      epoch seconds as a float
    """
    date_tuple = email.utils.parsedate(date_str)
    if date_tuple:
      return calendar.timegm(date_tuple)
    return None

  def update_date(self, date_str, now=None):
    """Return an updated date based on its delta from the "Date" header.

    For example, if |date_str| is one week later than the "Date" header,
    then the returned date string is one week later than the current date.

    Args:
      date_str: a date string (e.g. "Thu, 01 Dec 1994 16:00:00 GMT")
      now: optional time in epoch seconds to use in place of the current time.
    Returns:
      a date string
    """
    date_seconds = self._get_epoch_seconds(self.get_header('date'))
    header_seconds = self._get_epoch_seconds(date_str)
    if date_seconds and header_seconds:
      updated_seconds = header_seconds + (now or time.time()) - date_seconds
      return email.utils.formatdate(updated_seconds, usegmt=True)
    return date_str

  def is_gzip(self):
    return self.get_header('content-encoding') == 'gzip'

  def is_compressed(self):
    return self.get_header('content-encoding') in ('gzip', 'deflate')

  def is_chunked(self):
    return self.get_header('transfer-encoding') == 'chunked'

  def get_data_as_text(self):
    """Return content as a single string.

    Uncompresses and concatenates chunks with CHUNK_EDIT_SEPARATOR.
    """
    content_type = self.get_header('content-type')
    if (not content_type or
        not (content_type.startswith('text/') or
             content_type == 'application/x-javascript' or
             content_type.startswith('application/json'))):
      return None
    if self.is_compressed():
      uncompressed_chunks = httpzlib.uncompress_chunks(
          self.response_data, self.is_gzip())
    else:
      uncompressed_chunks = self.response_data
    return self.CHUNK_EDIT_SEPARATOR.join(uncompressed_chunks)

  def get_delays_as_text(self):
    """Return delays as editable text."""
    return json.dumps(self.delays, indent=2)

  def get_response_as_text(self):
    """Returns response content as a single string.

    Server delays are separated on a per-chunk basis. Delays are in
    milliseconds.
    Response content begins after DELAY_EDIT_SEPARATOR
    """
    data = self.get_data_as_text()
    if data is None:
      logging.warning('Data can not be represented as text.')
      data = ''
    delays = self.get_delays_as_text()
    return self.DELAY_EDIT_SEPARATOR.join((delays, data))

  def set_data(self, text):
    """Inverse of get_data_as_text().

    Split on CHUNK_EDIT_SEPARATOR and compress if needed.
    """
    text_chunks = text.split(self.CHUNK_EDIT_SEPARATOR)
    if self.is_compressed():
      self.response_data = httpzlib.compress_chunks(text_chunks, self.is_gzip())
    else:
      self.response_data = text_chunks
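    # For non-chunked responses, keep the content-length header in sync with
    # the edited body below.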
    if not self.is_chunked():
      content_length = sum(len(c) for c in self.response_data)
      self.set_header('content-length', str(content_length))

  def set_delays(self, delays_text):
    """Inverse of get_delays_as_text().

    Args:
      delays_text: JSON encoded text such as the following:
          {
            "connect": 80,
            "headers": 80,
            "data": [6, 55, 0]
          }
          Times are in milliseconds.
          Each data delay corresponds with one response_data value.
    """
    try:
      self.delays = json.loads(delays_text)
    except (ValueError, KeyError) as e:
      logging.critical('Unable to parse delays %s: %s', delays_text, e)
    self.fix_delays()

  def set_response_from_text(self, text):
    """Inverse of get_response_as_text().

    Modifies the state of the archive according to the textual representation.
    """
    try:
      delays, data = text.split(self.DELAY_EDIT_SEPARATOR)
    except ValueError:
      logging.critical(
          'Error parsing text representation. Skipping edits.')
      return
    self.set_delays(delays)
    self.set_data(data)


def create_response(status, reason=None, headers=None, body=None):
  """Convenience method for creating simple ArchivedHttpResponse objects."""
  if reason is None:
    reason = httplib.responses.get(status, 'Unknown')
  if headers is None:
    headers = [('content-type', 'text/plain')]
  if body is None:
    body = "%s %s" % (status, reason)
  return ArchivedHttpResponse(11, status, reason, headers, [body])

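# Example (illustrative): create_response(304) produces an ArchivedHttpResponse
# with reason 'Not Modified', a single 'content-type: text/plain' header, and
# one body chunk of '304 Not Modified'.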

def main():
  class PlainHelpFormatter(optparse.IndentedHelpFormatter):
    def format_description(self, description):
      if description:
        return description + '\n'
      else:
        return ''

  option_parser = optparse.OptionParser(
      usage='%prog [ls|cat|edit|stats|merge] [options] replay_file(s)',
      formatter=PlainHelpFormatter(),
      description=__doc__,
      epilog='http://code.google.com/p/web-page-replay/')

  option_parser.add_option('-c', '--command', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this command.')
  option_parser.add_option('-o', '--host', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this host.')
  option_parser.add_option('-p', '--full_path', default=None,
      action='store',
      type='string',
      help='Only show URLs matching this full path.')
  option_parser.add_option('-f', '--merged_file', default=None,
      action='store',
      type='string',
      help='The output file to use when using the merge command.')

  options, args = option_parser.parse_args()

  # Merge command expects an unlimited number of archives.
  if len(args) < 2:
    print 'args: %s' % args
    option_parser.error('Must specify a command and replay_file')

  command = args[0]
  replay_file = args[1]

  if not os.path.exists(replay_file):
    option_parser.error('Replay file "%s" does not exist' % replay_file)

  http_archive = HttpArchive.Load(replay_file)
  if command == 'ls':
    print http_archive.ls(options.command, options.host, options.full_path)
  elif command == 'cat':
    print http_archive.cat(options.command, options.host, options.full_path)
  elif command == 'stats':
    print http_archive.stats(options.command, options.host, options.full_path)
  elif command == 'merge':
    if not options.merged_file:
      print 'Error: Must specify a merged file name (use --merged_file)'
      return
    http_archive.merge(options.merged_file, args[2:])
  elif command == 'edit':
    http_archive.edit(options.command, options.host, options.full_path)
    http_archive.Persist(replay_file)
  else:
    option_parser.error('Unknown command "%s"' % command)
  return 0


if __name__ == '__main__':
  sys.exit(main())