| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 3 # | |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 # you may not use this file except in compliance with the License. | |
| 6 # You may obtain a copy of the License at | |
| 7 # | |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 # | |
| 10 # Unless required by applicable law or agreed to in writing, software | |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 # See the License for the specific language governing permissions and | |
| 14 # limitations under the License. | |
| 15 | |
| 16 """Retrieve web resources over http.""" | |
| 17 | |
| 18 import copy | |
| 19 import httplib | |
| 20 import logging | |
| 21 import random | |
| 22 import ssl | |
| 23 import StringIO | |
| 24 | |
| 25 import httparchive | |
| 26 import platformsettings | |
| 27 import script_injector | |
| 28 | |
| 29 | |
| 30 # PIL isn't always available, but we still want to be able to run without | |
| 31 # the image scrambling functionality in this case. | |
| 32 try: | |
| 33 import Image | |
| 34 except ImportError: | |
| 35 Image = None | |
| 36 | |
| 37 TIMER = platformsettings.timer | |
| 38 | |
| 39 | |
class HttpClientException(Exception):
  """Base class for all exceptions raised by the httpclient module."""
| 43 | |
| 44 | |
| 45 def _InjectScripts(response, inject_script): | |
| 46 """Injects |inject_script| immediately after <head> or <html>. | |
| 47 | |
| 48 Copies |response| if it is modified. | |
| 49 | |
| 50 Args: | |
| 51 response: an ArchivedHttpResponse | |
| 52 inject_script: JavaScript string (e.g. "Math.random = function(){...}") | |
| 53 Returns: | |
| 54 an ArchivedHttpResponse | |
| 55 """ | |
| 56 if type(response) == tuple: | |
| 57 logging.warn('tuple response: %s', response) | |
| 58 content_type = response.get_header('content-type') | |
| 59 if content_type and content_type.startswith('text/html'): | |
| 60 text_chunks = response.get_data_as_chunks() | |
| 61 text_chunks, just_injected = script_injector.InjectScript( | |
| 62 text_chunks, 'text/html', inject_script) | |
| 63 if just_injected: | |
| 64 response = copy.deepcopy(response) | |
| 65 response.set_data_from_chunks(text_chunks) | |
| 66 return response | |
| 67 | |
| 68 | |
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  The pixels of the image are shuffled randomly; metadata and format are
  preserved. Scrambling is best-effort: any failure (undecodable data,
  unsupported format, ...) leaves the response unmodified.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """

  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      # BUG FIX: str.decode()/str.encode() return new strings; the original
      # code discarded their results, so PIL was handed the still-encoded
      # bytes and scrambling silently never took effect.
      # NOTE(review): assumes response_data[0] holds base64-encoded image
      # bytes, mirroring the encode before set_data() below — TODO confirm
      # against httparchive.ArchivedHttpResponse.
      image_data = response.response_data[0].decode('base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      # Re-serialize in the original format, then re-encode to match the
      # encoding of the data we read above.
      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue().encode('base64')

      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Best-effort: keep serving the original image, but leave a trace
      # instead of swallowing the failure completely.
      logging.debug('Unable to scramble image response', exc_info=True)

  return response
| 106 | |
| 107 | |
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses

      The delay for the first body item should be recorded by the caller.
    """
    buf = []  # NOTE(review): unused; kept to leave behavior byte-identical.
    chunks = []
    delays = []
    if not self.chunked:
      # Non-chunked transfer: one blob, no per-chunk timing to capture.
      chunks.append(self.read())
      delays.append(0)
    else:
      # Chunked transfer: parse each chunk by hand so we can time the gap
      # between the end of one chunk and the arrival of the next.
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            # Malformed chunk-size line; surface what was read so far.
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break  # "0" chunk marks the end of the body.
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        # Always release the connection, even on IncompleteRead.
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size as an int, or None if bad."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)  # chunk sizes are hexadecimal (RFC 7230)
    except ValueError:
      return None
    return chunk_size
| 172 | |
| 173 | |
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  # Make getresponse() return our timing-aware response class.
  response_class = DetailedHTTPResponse
| 177 | |
| 178 | |
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  # No SSL-specific behavior needed; the distinct type exists so HTTPS
  # connections can be paired with their own response class.
  pass
| 182 | |
| 183 | |
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse

  def __init__(self, host, port):
    """Open an HTTPS connection without certificate verification.

    On runtimes that verify certificates by default (PEP 476), an
    unverified context is supplied so replay can reach hosts whose
    certificates the local trust store would reject.
    See https://www.python.org/dev/peps/pep-0476/#opting-out
    """
    kwargs = {'host': host, 'port': port}
    if hasattr(ssl, '_create_unverified_context'):
      kwargs['context'] = ssl._create_unverified_context()
    httplib.HTTPSConnection.__init__(self, **kwargs)
| 195 | |
| 196 | |
class RealHttpFetch(object):
  """Fetch requests over the real network, capturing timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP. RealHttpFetch
        will resolve host name to the IP before making fetching request if this
        is not None.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    # i > 0 (not >= 0) rejects lines that start with ':' (no header name).
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and convert it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold it into the most recent header's value.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Split request.host into (host, port); port is None if absent."""
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port

  @staticmethod
  def _get_system_proxy(is_ssl):
    """Return the system proxy settings for http or https, if any."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance,
      or None if a DNS lookup was requested and failed.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    if self._real_dns_lookup:
      connection_ip = self._real_dns_lookup(connection_host)
      if not connection_ip:
        logging.critical(
            'Unable to find IP for host name: %s', connection_host)
        return None
      connection_host = connection_ip

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_host, connection_port)
      if system_proxy:
        # Tunnel through the proxy to the real destination (CONNECT).
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_host, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None if the fetch fails after retries.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    # Any exception in the try body (connect, request, read) is retried
    # up to 3 times before giving up.
    retries = 3
    while True:
      try:
        # NOTE(review): _get_connection may return None on DNS failure;
        # the resulting AttributeError on .connect() is caught below and
        # counted as a retry.
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        # Delays are recorded in whole milliseconds.
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        # Time from sending the request to receiving the response headers.
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception, e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, repr(e))
          continue
        logging.critical('Could not fetch %s: %s', request, repr(e))
        return None
| 364 | |
| 365 | |
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, inject_script):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      inject_script: script string to inject in all pages
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    # Record without resolving host names to IPs: doing the lookup here
    # caused SSL3 handshake failures.
    # See https://github.com/chromium/web-page-replay/issues/73 for details.
    self.real_http_fetch = RealHttpFetch(real_dns_lookup=None)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse, or None if the real fetch failed.
    """
    if request not in self.http_archive:
      # New request: go to the network, then archive the result.
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    else:
      # Already recorded: serve the archived copy.
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
| 404 | |
| 405 | |
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, shuffle the pixels of served images.
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests aimed at the local machine bypass the archive entirely.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if not response and self.use_closest_match:
      fallback = self.http_archive.find_closest_request(
          request, use_path=True)
      if fallback:
        response = self.http_archive.get(fallback)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, fallback)

    if response:
      if self.inject_script:
        response = _InjectScripts(response, self.inject_script)
      if self.scramble_images:
        response = _ScrambleImages(response)
      return response

    # Not in the archive: log why, optionally with a diff to the nearest
    # recorded request.
    reason = str(request)
    if self.use_diff_on_unknown_requests:
      diff = self.http_archive.diff(request)
      if diff:
        reason += (
            "\nNearest request diff "
            "('-' for archived request, '+' for current request):\n%s" % diff)
    logging.warning('Could not replay: %s', reason)
    return response
| 467 | |
| 468 | |
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, shuffle the pixels of served images.
    """
    self.http_archive = http_archive
    # Both fetchers are built up front; mode switching just repoints
    # self.fetch at one of them.
    self.record_fetch = RecordHttpArchiveFetch(http_archive, inject_script)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    (self.SetRecordMode if use_record_mode else self.SetReplayMode)()

  def SetRecordMode(self):
    """Route subsequent fetches through the recording fetcher."""
    self.is_record_mode = True
    self.fetch = self.record_fetch

  def SetReplayMode(self):
    """Route subsequent fetches through the replaying fetcher."""
    self.is_record_mode = False
    self.fetch = self.replay_fetch

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)
| OLD | NEW |