| OLD | NEW |
| (Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2011 Google Inc. All Rights Reserved. |
| 3 # |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); |
| 5 # you may not use this file except in compliance with the License. |
| 6 # You may obtain a copy of the License at |
| 7 # |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 |
| 9 # |
| 10 # Unless required by applicable law or agreed to in writing, software |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 # See the License for the specific language governing permissions and |
| 14 # limitations under the License. |
| 15 |
| 16 """Retrieve web resources over http.""" |
| 17 |
| 18 import copy |
| 19 import httparchive |
| 20 import httplib |
| 21 import logging |
| 22 import os |
| 23 import platformsettings |
| 24 import re |
| 25 import util |
| 26 |
| 27 |
# Flags shared by the tag patterns below: tag names match in any letter case
# and '.' also matches newlines.
_TAG_RE_FLAGS = re.DOTALL | re.IGNORECASE

# Each pattern matches from the start of the text through the end of the first
# opening <html ...> / <head ...> tag, provided the tag starts within the
# first 256 characters.
HTML_RE = re.compile(r'^.{,256}?<html.*?>', _TAG_RE_FLAGS)
HEAD_RE = re.compile(r'^.{,256}?<head.*?>', _TAG_RE_FLAGS)
# Clock callable taken from the platform settings. Callers invoke it as
# TIMER() and subtract two readings to obtain an elapsed time.
_PLATFORM_SETTINGS = platformsettings.get_platform_settings()
TIMER = _PLATFORM_SETTINGS.timer
| 31 |
| 32 |
class HttpClientException(Exception):
  """Root of the exception hierarchy raised by the httpclient module."""
| 36 |
| 37 |
def GetInjectScript(scripts):
  """Loads |scripts| and returns a string of their concatenated content.

  Each script is looked up on the file system first; if no such path exists
  it is looked up through util.resource_exists()/util.resource_string().

  Args:
    scripts: an iterable of script paths / resource names.
  Returns:
    the contents of all scripts, joined in the order given ('' if |scripts|
    is empty).
  Raises:
    HttpClientException: if a script is found neither on disk nor as a
        resource.
  """
  contents = []
  for script in scripts:
    if os.path.exists(script):
      # Close the file deterministically instead of relying on the GC.
      with open(script) as script_file:
        contents.append(script_file.read())
    elif util.resource_exists(script):
      contents.append(util.resource_string(script))
    else:
      # Format the message here: exceptions do not apply '%' to their args.
      raise HttpClientException('Script does not exist: %s' % script)
  return ''.join(contents)
| 49 |
| 50 |
| 51 def _InjectScripts(response, inject_script): |
| 52 """Injects |inject_script| immediately after <head> or <html>. |
| 53 |
| 54 Copies |response| if it is modified. |
| 55 |
| 56 Args: |
| 57 response: an ArchivedHttpResponse |
| 58 inject_script: JavaScript string (e.g. "Math.random = function(){...}") |
| 59 Returns: |
| 60 an ArchivedHttpResponse |
| 61 """ |
| 62 if type(response) == tuple: |
| 63 logging.warn('tuple response: %s', response) |
| 64 content_type = response.get_header('content-type') |
| 65 if content_type and content_type.startswith('text/html'): |
| 66 text = response.get_data_as_text() |
| 67 |
| 68 def InsertScriptAfter(matchobj): |
| 69 return '%s<script>%s</script>' % (matchobj.group(0), inject_script) |
| 70 |
| 71 if text and not inject_script in text: |
| 72 text, is_injected = HEAD_RE.subn(InsertScriptAfter, text, 1) |
| 73 if not is_injected: |
| 74 text, is_injected = HTML_RE.subn(InsertScriptAfter, text, 1) |
| 75 if not is_injected: |
| 76 logging.warning('Failed to inject scripts.') |
| 77 logging.debug('Response content: %s', text) |
| 78 else: |
| 79 response = copy.deepcopy(response) |
| 80 response.set_data(text) |
| 81 return response |
| 82 |
| 83 |
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses

      The delay for the first body item should be recorded by the caller.

    Raises:
      httplib.IncompleteRead: if a chunk-size line cannot be parsed; the
          chunks read so far are attached to the exception.
    """
    if not self.chunked:
      return [self.read()], [0]

    chunks = []
    delays = []
    start = TIMER()
    try:
      while True:
        line = self.fp.readline()
        chunk_size = self._read_chunk_size(line)
        if chunk_size is None:
          raise httplib.IncompleteRead(''.join(chunks))
        if chunk_size == 0:
          break
        # Time from the end of the previous chunk (or from the start of this
        # call) until this chunk's size line has been read.
        delays.append(TIMER() - start)
        chunks.append(self._safe_read(chunk_size))
        self._safe_read(2)  # skip the CRLF at the end of the chunk
        start = TIMER()

      # Ignore any trailers.
      while True:
        line = self.fp.readline()
        if not line or line == '\r\n':
          break
    finally:
      # Always release the response, even when the body was truncated.
      self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse the hexadecimal size from a chunk-size line.

    Args:
      line: a chunk-size line, optionally followed by ';' and
          chunk-extensions, e.g. '1a;name=value\r\n'.
    Returns:
      the chunk size as an int, or None if |line| is not a valid
      hexadecimal number.
    """
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      return int(line, 16)
    except ValueError:
      return None
| 148 |
| 149 |
class DetailedHTTPConnection(httplib.HTTPConnection):
  """HTTP connection whose responses are DetailedHTTPResponse objects."""

  # Response type created by this connection for each request.
  response_class = DetailedHTTPResponse
| 153 |
| 154 |
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """DetailedHTTPResponse variant used for SSL connections.

  Adds no behavior of its own.
  """
| 158 |
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """HTTPS connection whose responses are DetailedHTTPSResponse objects."""

  # Response type created by this connection for each request.
  response_class = DetailedHTTPSResponse
| 162 |
| 163 |
class RealHttpFetch(object):
  """Fetch requests from the real server and time the response."""

  def __init__(self, real_dns_lookup, get_server_rtt):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP.
      get_server_rtt: a function that returns the round-trip time of a host.
    """
    self._real_dns_lookup = real_dns_lookup
    self._get_server_rtt = get_server_rtt

  def __call__(self, request):
    """Fetch an HTTP request.

    A failed attempt is retried up to three more times; any exception raised
    while connecting, sending or reading counts as a failure.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None if the host cannot be resolved or
      every attempt failed.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.path)
    host_ip = self._real_dns_lookup(request.host)
    if not host_ip:
      logging.critical('Unable to find host ip for name: %s', request.host)
      return None
    retries = 3
    while True:
      connection = None
      try:
        if request.is_ssl:
          connection = DetailedHTTPSConnection(host_ip)
        else:
          connection = DetailedHTTPConnection(host_ip)
        start = TIMER()
        connection.request(
            request.command,
            request.path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        # Time until the headers arrived, scaled by 1000, minus the
        # server's round-trip time.
        headers_delay = int((TIMER() - start) * 1000)
        headers_delay -= self._get_server_rtt(request.host)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'headers': headers_delay,
            'data': chunk_delays
            }
        return httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            response.getheaders(),
            chunks,
            delays)
      except Exception as e:
        if not retries:
          logging.critical('Could not fetch %s: %s', request, e)
          return None
        retries -= 1
        logging.warning('Retrying fetch %s: %s', request, e)
      finally:
        # The body has been read in full (or the attempt failed), so the
        # socket can be released instead of lingering until collection.
        if connection is not None:
          connection.close()
| 225 |
| 226 |
class RecordHttpArchiveFetch(object):
  """Fetch from the real server and store each response in an HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               cache_misses=None):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: the HttpArchive instance that receives the responses.
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      cache_misses: optional CacheMissArchive instance.
    """
    self.cache_misses = cache_misses
    self.inject_script = inject_script
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(
        real_dns_lookup, http_archive.get_server_rtt)

  def __call__(self, request):
    """Return the response for |request|, recording it if it is new.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse, or None if the real fetch failed.
    """
    if self.cache_misses:
      self.cache_misses.record_request(
          request, is_cache_miss=False, is_record_mode=True)

    archive = self.http_archive
    if request not in archive:
      fetched = self.real_http_fetch(request)
      if fetched is None:
        return None
      archive[request] = fetched
      response = fetched
    else:
      # Already recorded: serve the archived response again.
      logging.debug('Repeated request found: %s', request)
      response = archive[request]

    # The archive keeps the response as fetched; injection only affects
    # what is returned to the caller.
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
| 271 |
| 272 |
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, inject_script,
               use_diff_on_unknown_requests=False, cache_misses=None,
               use_closest_match=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      cache_misses: Instance of CacheMissArchive.
        Callback updates archive on cache misses
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.cache_misses = cache_misses
    self.use_closest_match = use_closest_match

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    response = self.http_archive.get(request)

    if self.use_closest_match and not response:
      closest_request = self.http_archive.find_closest_request(
          request, use_path=True)
      if closest_request:
        response = self.http_archive.get(closest_request)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, closest_request)

    if self.cache_misses:
      # Recorded after the closest-match lookup, so a request served by a
      # closest match does not count as a miss.
      self.cache_misses.record_request(
          request, is_record_mode=False, is_cache_miss=not response)

    if not response:
      reason = str(request)
      if self.use_diff_on_unknown_requests:
        diff = self.http_archive.diff(request)
        if diff:
          reason += (
              "\nNearest request diff "
              "('-' for archived request, '+' for current request):\n%s" % diff)
      logging.warning('Could not replay: %s', reason)
    elif self.inject_script:
      # Skip injection when there is no script, as the record path does;
      # a None script would otherwise fail inside _InjectScripts.
      response = _InjectScripts(response, self.inject_script)
    return response
| 332 |
| 333 |
class ControllableHttpArchiveFetch(object):
  """Fetch callable that can be switched between recording and replaying."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, cache_misses, use_closest_match):
    """Build both fetchers and select the initial mode.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start in record mode, otherwise in replay
        mode.
      cache_misses: Instance of CacheMissArchive.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
    """
    self.record_fetch = RecordHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script, cache_misses)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, inject_script, use_diff_on_unknown_requests,
        cache_misses, use_closest_match)
    select_mode = self.SetRecordMode if use_record_mode else self.SetReplayMode
    select_mode()

  def SetRecordMode(self):
    """Route subsequent calls to the recording fetcher."""
    self.is_record_mode = True
    self.fetch = self.record_fetch

  def SetReplayMode(self):
    """Route subsequent calls to the replaying fetcher."""
    self.is_record_mode = False
    self.fetch = self.replay_fetch

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    active_fetch = self.fetch
    return active_fetch(*args, **kwargs)
| OLD | NEW |