Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(9)

Side by Side Diff: telemetry/third_party/webpagereplay/httpclient.py

Issue 2210063003: Rename third_party/webpagereplay to third_party/web-page-replay (Closed) Base URL: https://github.com/catapult-project/catapult@master
Patch Set: Created 4 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2012 Google Inc. All Rights Reserved.
3 #
4 # Licensed under the Apache License, Version 2.0 (the "License");
5 # you may not use this file except in compliance with the License.
6 # You may obtain a copy of the License at
7 #
8 # http://www.apache.org/licenses/LICENSE-2.0
9 #
10 # Unless required by applicable law or agreed to in writing, software
11 # distributed under the License is distributed on an "AS IS" BASIS,
12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 # See the License for the specific language governing permissions and
14 # limitations under the License.
15
16 """Retrieve web resources over http."""
17
18 import copy
19 import httplib
20 import logging
21 import random
22 import ssl
23 import StringIO
24
25 import httparchive
26 import platformsettings
27 import script_injector
28
29
30 # PIL isn't always available, but we still want to be able to run without
31 # the image scrambling functionality in this case.
32 try:
33 import Image
34 except ImportError:
35 Image = None
36
37 TIMER = platformsettings.timer
38
39
40 class HttpClientException(Exception):
41 """Base class for all exceptions in httpclient."""
42 pass
43
44
45 def _InjectScripts(response, inject_script):
46 """Injects |inject_script| immediately after <head> or <html>.
47
48 Copies |response| if it is modified.
49
50 Args:
51 response: an ArchivedHttpResponse
52 inject_script: JavaScript string (e.g. "Math.random = function(){...}")
53 Returns:
54 an ArchivedHttpResponse
55 """
56 if type(response) == tuple:
57 logging.warn('tuple response: %s', response)
58 content_type = response.get_header('content-type')
59 if content_type and content_type.startswith('text/html'):
60 text_chunks = response.get_data_as_chunks()
61 text_chunks, just_injected = script_injector.InjectScript(
62 text_chunks, 'text/html', inject_script)
63 if just_injected:
64 response = copy.deepcopy(response)
65 response.set_data_from_chunks(text_chunks)
66 return response
67
68
69 def _ScrambleImages(response):
70 """If the |response| is an image, attempt to scramble it.
71
72 Copies |response| if it is modified.
73
74 Args:
75 response: an ArchivedHttpResponse
76 Returns:
77 an ArchivedHttpResponse
78 """
79
80 assert Image, '--scramble_images requires the PIL module to be installed.'
81
82 content_type = response.get_header('content-type')
83 if content_type and content_type.startswith('image/'):
84 try:
85 image_data = response.response_data[0]
86 image_data.decode(encoding='base64')
87 im = Image.open(StringIO.StringIO(image_data))
88
89 pixel_data = list(im.getdata())
90 random.shuffle(pixel_data)
91
92 scrambled_image = im.copy()
93 scrambled_image.putdata(pixel_data)
94
95 output_image_io = StringIO.StringIO()
96 scrambled_image.save(output_image_io, im.format)
97 output_image_data = output_image_io.getvalue()
98 output_image_data.encode(encoding='base64')
99
100 response = copy.deepcopy(response)
101 response.set_data(output_image_data)
102 except Exception:
103 pass
104
105 return response
106
107
108 class DetailedHTTPResponse(httplib.HTTPResponse):
109 """Preserve details relevant to replaying responses.
110
111 WARNING: This code uses attributes and methods of HTTPResponse
112 that are not part of the public interface.
113 """
114
115 def read_chunks(self):
116 """Return the response body content and timing data.
117
118 The returned chunks have the chunk size and CRLFs stripped off.
119 If the response was compressed, the returned data is still compressed.
120
121 Returns:
122 (chunks, delays)
123 chunks:
124 [response_body] # non-chunked responses
125 [chunk_1, chunk_2, ...] # chunked responses
126 delays:
127 [0] # non-chunked responses
128 [chunk_1_first_byte_delay, ...] # chunked responses
129
130 The delay for the first body item should be recorded by the caller.
131 """
132 buf = []
133 chunks = []
134 delays = []
135 if not self.chunked:
136 chunks.append(self.read())
137 delays.append(0)
138 else:
139 start = TIMER()
140 try:
141 while True:
142 line = self.fp.readline()
143 chunk_size = self._read_chunk_size(line)
144 if chunk_size is None:
145 raise httplib.IncompleteRead(''.join(chunks))
146 if chunk_size == 0:
147 break
148 delays.append(TIMER() - start)
149 chunks.append(self._safe_read(chunk_size))
150 self._safe_read(2) # skip the CRLF at the end of the chunk
151 start = TIMER()
152
153 # Ignore any trailers.
154 while True:
155 line = self.fp.readline()
156 if not line or line == '\r\n':
157 break
158 finally:
159 self.close()
160 return chunks, delays
161
162 @classmethod
163 def _read_chunk_size(cls, line):
164 chunk_extensions_pos = line.find(';')
165 if chunk_extensions_pos != -1:
166 line = line[:chunk_extensions_pos] # strip chunk-extensions
167 try:
168 chunk_size = int(line, 16)
169 except ValueError:
170 return None
171 return chunk_size
172
173
174 class DetailedHTTPConnection(httplib.HTTPConnection):
175 """Preserve details relevant to replaying connections."""
176 response_class = DetailedHTTPResponse
177
178
179 class DetailedHTTPSResponse(DetailedHTTPResponse):
180 """Preserve details relevant to replaying SSL responses."""
181 pass
182
183
184 class DetailedHTTPSConnection(httplib.HTTPSConnection):
185 """Preserve details relevant to replaying SSL connections."""
186 response_class = DetailedHTTPSResponse
187
188 def __init__(self, host, port):
189 # https://www.python.org/dev/peps/pep-0476/#opting-out
190 if hasattr(ssl, '_create_unverified_context'):
191 httplib.HTTPSConnection.__init__(
192 self, host=host, port=port, context=ssl._create_unverified_context())
193 else:
194 httplib.HTTPSConnection.__init__(self, host=host, port=port)
195
196
197 class RealHttpFetch(object):
198
199 def __init__(self, real_dns_lookup):
200 """Initialize RealHttpFetch.
201
202 Args:
203 real_dns_lookup: a function that resolves a host to an IP. RealHttpFetch
204 will resolve host name to the IP before making fetching request if this
205 is not None.
206 """
207 self._real_dns_lookup = real_dns_lookup
208
209 @staticmethod
210 def _GetHeaderNameValue(header):
211 """Parse the header line and return a name/value tuple.
212
213 Args:
214 header: a string for a header such as "Content-Length: 314".
215 Returns:
216 A tuple (header_name, header_value) on success or None if the header
217 is not in expected format. header_name is in lowercase.
218 """
219 i = header.find(':')
220 if i > 0:
221 return (header[:i].lower(), header[i+1:].strip())
222 return None
223
224 @staticmethod
225 def _ToTuples(headers):
226 """Parse headers and save them to a list of tuples.
227
228 This method takes HttpResponse.msg.headers as input and convert it
229 to a list of (header_name, header_value) tuples.
230 HttpResponse.msg.headers is a list of strings where each string
231 represents either a header or a continuation line of a header.
232 1. a normal header consists of two parts which are separated by colon :
233 "header_name:header_value..."
234 2. a continuation line is a string starting with whitespace
235 "[whitespace]continued_header_value..."
236 If a header is not in good shape or an unexpected continuation line is
237 seen, it will be ignored.
238
239 Should avoid using response.getheaders() directly
240 because response.getheaders() can't handle multiple headers
241 with the same name properly. Instead, parse the
242 response.msg.headers using this method to get all headers.
243
244 Args:
245 headers: an instance of HttpResponse.msg.headers.
246 Returns:
247 A list of tuples which looks like:
248 [(header_name, header_value), (header_name2, header_value2)...]
249 """
250 all_headers = []
251 for line in headers:
252 if line[0] in '\t ':
253 if not all_headers:
254 logging.warning(
255 'Unexpected response header continuation line [%s]', line)
256 continue
257 name, value = all_headers.pop()
258 value += '\n ' + line.strip()
259 else:
260 name_value = RealHttpFetch._GetHeaderNameValue(line)
261 if not name_value:
262 logging.warning(
263 'Response header in wrong format [%s]', line)
264 continue
265 name, value = name_value # pylint: disable=unpacking-non-sequence
266 all_headers.append((name, value))
267 return all_headers
268
269 @staticmethod
270 def _get_request_host_port(request):
271 host_parts = request.host.split(':')
272 host = host_parts[0]
273 port = int(host_parts[1]) if len(host_parts) == 2 else None
274 return host, port
275
276 @staticmethod
277 def _get_system_proxy(is_ssl):
278 return platformsettings.get_system_proxy(is_ssl)
279
280 def _get_connection(self, request_host, request_port, is_ssl):
281 """Return a detailed connection object for host/port pair.
282
283 If a system proxy is defined (see platformsettings.py), it will be used.
284
285 Args:
286 request_host: a host string (e.g. "www.example.com").
287 request_port: a port integer (e.g. 8080) or None (for the default port).
288 is_ssl: True if HTTPS connection is needed.
289 Returns:
290 A DetailedHTTPSConnection or DetailedHTTPConnection instance.
291 """
292 connection_host = request_host
293 connection_port = request_port
294 system_proxy = self._get_system_proxy(is_ssl)
295 if system_proxy:
296 connection_host = system_proxy.host
297 connection_port = system_proxy.port
298
299 # Use an IP address because WPR may override DNS settings.
300 if self._real_dns_lookup:
301 connection_ip = self._real_dns_lookup(connection_host)
302 if not connection_ip:
303 logging.critical(
304 'Unable to find IP for host name: %s', connection_host)
305 return None
306 connection_host = connection_ip
307
308 if is_ssl:
309 connection = DetailedHTTPSConnection(connection_host, connection_port)
310 if system_proxy:
311 connection.set_tunnel(request_host, request_port)
312 else:
313 connection = DetailedHTTPConnection(connection_host, connection_port)
314 return connection
315
316 def __call__(self, request):
317 """Fetch an HTTP request.
318
319 Args:
320 request: an ArchivedHttpRequest
321 Returns:
322 an ArchivedHttpResponse
323 """
324 logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
325 request_host, request_port = self._get_request_host_port(request)
326 retries = 3
327 while True:
328 try:
329 connection = self._get_connection(
330 request_host, request_port, request.is_ssl)
331 connect_start = TIMER()
332 connection.connect()
333 connect_delay = int((TIMER() - connect_start) * 1000)
334 start = TIMER()
335 connection.request(
336 request.command,
337 request.full_path,
338 request.request_body,
339 request.headers)
340 response = connection.getresponse()
341 headers_delay = int((TIMER() - start) * 1000)
342
343 chunks, chunk_delays = response.read_chunks()
344 delays = {
345 'connect': connect_delay,
346 'headers': headers_delay,
347 'data': chunk_delays
348 }
349 archived_http_response = httparchive.ArchivedHttpResponse(
350 response.version,
351 response.status,
352 response.reason,
353 RealHttpFetch._ToTuples(response.msg.headers),
354 chunks,
355 delays)
356 return archived_http_response
357 except Exception, e:
358 if retries:
359 retries -= 1
360 logging.warning('Retrying fetch %s: %s', request, repr(e))
361 continue
362 logging.critical('Could not fetch %s: %s', request, repr(e))
363 return None
364
365
366 class RecordHttpArchiveFetch(object):
367 """Make real HTTP fetches and save responses in the given HttpArchive."""
368
369 def __init__(self, http_archive, inject_script):
370 """Initialize RecordHttpArchiveFetch.
371
372 Args:
373 http_archive: an instance of a HttpArchive
374 inject_script: script string to inject in all pages
375 """
376 self.http_archive = http_archive
377 # Do not resolve host name to IP when recording to avoid SSL3 handshake
378 # failure.
379 # See https://github.com/chromium/web-page-replay/issues/73 for details.
380 self.real_http_fetch = RealHttpFetch(real_dns_lookup=None)
381 self.inject_script = inject_script
382
383 def __call__(self, request):
384 """Fetch the request and return the response.
385
386 Args:
387 request: an ArchivedHttpRequest.
388 Returns:
389 an ArchivedHttpResponse
390 """
391 # If request is already in the archive, return the archived response.
392 if request in self.http_archive:
393 logging.debug('Repeated request found: %s', request)
394 response = self.http_archive[request]
395 else:
396 response = self.real_http_fetch(request)
397 if response is None:
398 return None
399 self.http_archive[request] = response
400 if self.inject_script:
401 response = _InjectScripts(response, self.inject_script)
402 logging.debug('Recorded: %s', request)
403 return response
404
405
406 class ReplayHttpArchiveFetch(object):
407 """Serve responses from the given HttpArchive."""
408
409 def __init__(self, http_archive, real_dns_lookup, inject_script,
410 use_diff_on_unknown_requests=False,
411 use_closest_match=False, scramble_images=False):
412 """Initialize ReplayHttpArchiveFetch.
413
414 Args:
415 http_archive: an instance of a HttpArchive
416 real_dns_lookup: a function that resolves a host to an IP.
417 inject_script: script string to inject in all pages
418 use_diff_on_unknown_requests: If True, log unknown requests
419 with a diff to requests that look similar.
420 use_closest_match: If True, on replay mode, serve the closest match
421 in the archive instead of giving a 404.
422 """
423 self.http_archive = http_archive
424 self.inject_script = inject_script
425 self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
426 self.use_closest_match = use_closest_match
427 self.scramble_images = scramble_images
428 self.real_http_fetch = RealHttpFetch(real_dns_lookup)
429
430 def __call__(self, request):
431 """Fetch the request and return the response.
432
433 Args:
434 request: an instance of an ArchivedHttpRequest.
435 Returns:
436 Instance of ArchivedHttpResponse (if found) or None
437 """
438 if request.host.startswith('127.0.0.1:'):
439 return self.real_http_fetch(request)
440
441 response = self.http_archive.get(request)
442
443 if self.use_closest_match and not response:
444 closest_request = self.http_archive.find_closest_request(
445 request, use_path=True)
446 if closest_request:
447 response = self.http_archive.get(closest_request)
448 if response:
449 logging.info('Request not found: %s\nUsing closest match: %s',
450 request, closest_request)
451
452 if not response:
453 reason = str(request)
454 if self.use_diff_on_unknown_requests:
455 diff = self.http_archive.diff(request)
456 if diff:
457 reason += (
458 "\nNearest request diff "
459 "('-' for archived request, '+' for current request):\n%s" % diff)
460 logging.warning('Could not replay: %s', reason)
461 else:
462 if self.inject_script:
463 response = _InjectScripts(response, self.inject_script)
464 if self.scramble_images:
465 response = _ScrambleImages(response)
466 return response
467
468
469 class ControllableHttpArchiveFetch(object):
470 """Controllable fetch function that can swap between record and replay."""
471
472 def __init__(self, http_archive, real_dns_lookup,
473 inject_script, use_diff_on_unknown_requests,
474 use_record_mode, use_closest_match, scramble_images):
475 """Initialize HttpArchiveFetch.
476
477 Args:
478 http_archive: an instance of a HttpArchive
479 real_dns_lookup: a function that resolves a host to an IP.
480 inject_script: script string to inject in all pages.
481 use_diff_on_unknown_requests: If True, log unknown requests
482 with a diff to requests that look similar.
483 use_record_mode: If True, start in server in record mode.
484 use_closest_match: If True, on replay mode, serve the closest match
485 in the archive instead of giving a 404.
486 """
487 self.http_archive = http_archive
488 self.record_fetch = RecordHttpArchiveFetch(http_archive, inject_script)
489 self.replay_fetch = ReplayHttpArchiveFetch(
490 http_archive, real_dns_lookup, inject_script,
491 use_diff_on_unknown_requests, use_closest_match, scramble_images)
492 if use_record_mode:
493 self.SetRecordMode()
494 else:
495 self.SetReplayMode()
496
497 def SetRecordMode(self):
498 self.fetch = self.record_fetch
499 self.is_record_mode = True
500
501 def SetReplayMode(self):
502 self.fetch = self.replay_fetch
503 self.is_record_mode = False
504
505 def __call__(self, *args, **kwargs):
506 """Forward calls to Replay/Record fetch functions depending on mode."""
507 return self.fetch(*args, **kwargs)
OLDNEW
« no previous file with comments | « telemetry/third_party/webpagereplay/httparchive_test.py ('k') | telemetry/third_party/webpagereplay/httpclient_test.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698