| OLD | NEW |
| (Empty) |
| 1 #!/usr/bin/env python | |
| 2 # Copyright 2012 Google Inc. All Rights Reserved. | |
| 3 # | |
| 4 # Licensed under the Apache License, Version 2.0 (the "License"); | |
| 5 # you may not use this file except in compliance with the License. | |
| 6 # You may obtain a copy of the License at | |
| 7 # | |
| 8 # http://www.apache.org/licenses/LICENSE-2.0 | |
| 9 # | |
| 10 # Unless required by applicable law or agreed to in writing, software | |
| 11 # distributed under the License is distributed on an "AS IS" BASIS, | |
| 12 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 13 # See the License for the specific language governing permissions and | |
| 14 # limitations under the License. | |
| 15 | |
| 16 """Retrieve web resources over http.""" | |
| 17 | |
| 18 import copy | |
| 19 import httplib | |
| 20 import logging | |
| 21 import random | |
| 22 import ssl | |
| 23 import StringIO | |
| 24 | |
| 25 import httparchive | |
| 26 import platformsettings | |
| 27 import script_injector | |
| 28 | |
| 29 | |
| 30 # PIL isn't always available, but we still want to be able to run without | |
| 31 # the image scrambling functionality in this case. | |
| 32 try: | |
| 33 import Image | |
| 34 except ImportError: | |
| 35 Image = None | |
| 36 | |
| 37 TIMER = platformsettings.timer | |
| 38 | |
| 39 | |
class HttpClientException(Exception):
  """Base class for all exceptions raised by the httpclient module."""
| 43 | |
| 44 | |
| 45 def _InjectScripts(response, inject_script): | |
| 46 """Injects |inject_script| immediately after <head> or <html>. | |
| 47 | |
| 48 Copies |response| if it is modified. | |
| 49 | |
| 50 Args: | |
| 51 response: an ArchivedHttpResponse | |
| 52 inject_script: JavaScript string (e.g. "Math.random = function(){...}") | |
| 53 Returns: | |
| 54 an ArchivedHttpResponse | |
| 55 """ | |
| 56 if type(response) == tuple: | |
| 57 logging.warn('tuple response: %s', response) | |
| 58 content_type = response.get_header('content-type') | |
| 59 if content_type and content_type.startswith('text/html'): | |
| 60 text_chunks = response.get_data_as_chunks() | |
| 61 text_chunks, just_injected = script_injector.InjectScript( | |
| 62 text_chunks, 'text/html', inject_script) | |
| 63 if just_injected: | |
| 64 response = copy.deepcopy(response) | |
| 65 response.set_data_from_chunks(text_chunks) | |
| 66 return response | |
| 67 | |
| 68 | |
def _ScrambleImages(response):
  """If the |response| is an image, attempt to scramble it.

  The pixels of the image are shuffled randomly; metadata and format are
  preserved. Scrambling is best-effort: any failure (undecodable data,
  unsupported format, ...) leaves the response unmodified.

  Copies |response| if it is modified.

  Args:
    response: an ArchivedHttpResponse
  Returns:
    an ArchivedHttpResponse
  """

  assert Image, '--scramble_images requires the PIL module to be installed.'

  content_type = response.get_header('content-type')
  if content_type and content_type.startswith('image/'):
    try:
      # BUG FIX: str.decode()/str.encode() return new strings; the original
      # code discarded their results, so PIL was handed the still-encoded
      # bytes and scrambling silently never took effect.
      # NOTE(review): assumes response_data[0] holds base64-encoded image
      # bytes, mirroring the encode before set_data() below — TODO confirm
      # against httparchive.ArchivedHttpResponse.
      image_data = response.response_data[0].decode('base64')
      im = Image.open(StringIO.StringIO(image_data))

      pixel_data = list(im.getdata())
      random.shuffle(pixel_data)

      scrambled_image = im.copy()
      scrambled_image.putdata(pixel_data)

      # Re-serialize in the original format, then re-encode to match the
      # encoding of the data we read above.
      output_image_io = StringIO.StringIO()
      scrambled_image.save(output_image_io, im.format)
      output_image_data = output_image_io.getvalue().encode('base64')

      response = copy.deepcopy(response)
      response.set_data(output_image_data)
    except Exception:
      # Best-effort: keep serving the original image, but leave a trace
      # instead of swallowing the failure completely.
      logging.debug('Unable to scramble image response', exc_info=True)

  return response
| 106 | |
| 107 | |
class DetailedHTTPResponse(httplib.HTTPResponse):
  """Preserve details relevant to replaying responses.

  WARNING: This code uses attributes and methods of HTTPResponse
  that are not part of the public interface.
  """

  def read_chunks(self):
    """Return the response body content and timing data.

    The returned chunks have the chunk size and CRLFs stripped off.
    If the response was compressed, the returned data is still compressed.

    Returns:
      (chunks, delays)
        chunks:
          [response_body]                  # non-chunked responses
          [chunk_1, chunk_2, ...]          # chunked responses
        delays:
          [0]                              # non-chunked responses
          [chunk_1_first_byte_delay, ...]  # chunked responses

      The delay for the first body item should be recorded by the caller.
    """
    buf = []  # NOTE(review): unused; kept to leave behavior byte-identical.
    chunks = []
    delays = []
    if not self.chunked:
      # Non-chunked transfer: one blob, no per-chunk timing to capture.
      chunks.append(self.read())
      delays.append(0)
    else:
      # Chunked transfer: parse each chunk by hand so we can time the gap
      # between the end of one chunk and the arrival of the next.
      start = TIMER()
      try:
        while True:
          line = self.fp.readline()
          chunk_size = self._read_chunk_size(line)
          if chunk_size is None:
            # Malformed chunk-size line; surface what was read so far.
            raise httplib.IncompleteRead(''.join(chunks))
          if chunk_size == 0:
            break  # "0" chunk marks the end of the body.
          delays.append(TIMER() - start)
          chunks.append(self._safe_read(chunk_size))
          self._safe_read(2)  # skip the CRLF at the end of the chunk
          start = TIMER()

        # Ignore any trailers.
        while True:
          line = self.fp.readline()
          if not line or line == '\r\n':
            break
      finally:
        # Always release the connection, even on IncompleteRead.
        self.close()
    return chunks, delays

  @classmethod
  def _read_chunk_size(cls, line):
    """Parse a chunk-size line; return the size as an int, or None if bad."""
    chunk_extensions_pos = line.find(';')
    if chunk_extensions_pos != -1:
      line = line[:chunk_extensions_pos]  # strip chunk-extensions
    try:
      chunk_size = int(line, 16)  # chunk sizes are hexadecimal (RFC 7230)
    except ValueError:
      return None
    return chunk_size
| 172 | |
| 173 | |
class DetailedHTTPConnection(httplib.HTTPConnection):
  """Preserve details relevant to replaying connections."""
  # Make getresponse() return our timing-aware response class.
  response_class = DetailedHTTPResponse
| 177 | |
| 178 | |
class DetailedHTTPSResponse(DetailedHTTPResponse):
  """Preserve details relevant to replaying SSL responses."""
  # No SSL-specific behavior needed; the distinct type exists so HTTPS
  # connections can be paired with their own response class.
  pass
| 182 | |
| 183 | |
class DetailedHTTPSConnection(httplib.HTTPSConnection):
  """Preserve details relevant to replaying SSL connections."""
  response_class = DetailedHTTPSResponse

  def __init__(self, host, port):
    """Open an HTTPS connection without certificate verification.

    On runtimes that verify certificates by default (PEP 476), an
    unverified context is supplied so replay can reach hosts whose
    certificates the local trust store would reject.
    See https://www.python.org/dev/peps/pep-0476/#opting-out
    """
    kwargs = {'host': host, 'port': port}
    if hasattr(ssl, '_create_unverified_context'):
      kwargs['context'] = ssl._create_unverified_context()
    httplib.HTTPSConnection.__init__(self, **kwargs)
| 195 | |
| 196 | |
class RealHttpFetch(object):
  """Fetch requests over the real network, capturing timing details."""

  def __init__(self, real_dns_lookup):
    """Initialize RealHttpFetch.

    Args:
      real_dns_lookup: a function that resolves a host to an IP. RealHttpFetch
        will resolve host name to the IP before making fetching request if this
        is not None.
    """
    self._real_dns_lookup = real_dns_lookup

  @staticmethod
  def _GetHeaderNameValue(header):
    """Parse the header line and return a name/value tuple.

    Args:
      header: a string for a header such as "Content-Length: 314".
    Returns:
      A tuple (header_name, header_value) on success or None if the header
      is not in expected format. header_name is in lowercase.
    """
    # i > 0 (not >= 0) rejects lines that start with ':' (no header name).
    i = header.find(':')
    if i > 0:
      return (header[:i].lower(), header[i+1:].strip())
    return None

  @staticmethod
  def _ToTuples(headers):
    """Parse headers and save them to a list of tuples.

    This method takes HttpResponse.msg.headers as input and convert it
    to a list of (header_name, header_value) tuples.
    HttpResponse.msg.headers is a list of strings where each string
    represents either a header or a continuation line of a header.
    1. a normal header consists of two parts which are separated by colon :
       "header_name:header_value..."
    2. a continuation line is a string starting with whitespace
       "[whitespace]continued_header_value..."
    If a header is not in good shape or an unexpected continuation line is
    seen, it will be ignored.

    Should avoid using response.getheaders() directly
    because response.getheaders() can't handle multiple headers
    with the same name properly. Instead, parse the
    response.msg.headers using this method to get all headers.

    Args:
      headers: an instance of HttpResponse.msg.headers.
    Returns:
      A list of tuples which looks like:
      [(header_name, header_value), (header_name2, header_value2)...]
    """
    all_headers = []
    for line in headers:
      if line[0] in '\t ':
        # Continuation line: fold it into the most recent header's value.
        if not all_headers:
          logging.warning(
              'Unexpected response header continuation line [%s]', line)
          continue
        name, value = all_headers.pop()
        value += '\n ' + line.strip()
      else:
        name_value = RealHttpFetch._GetHeaderNameValue(line)
        if not name_value:
          logging.warning(
              'Response header in wrong format [%s]', line)
          continue
        name, value = name_value  # pylint: disable=unpacking-non-sequence
      all_headers.append((name, value))
    return all_headers

  @staticmethod
  def _get_request_host_port(request):
    """Split request.host into (host, port); port is None if absent."""
    host_parts = request.host.split(':')
    host = host_parts[0]
    port = int(host_parts[1]) if len(host_parts) == 2 else None
    return host, port

  @staticmethod
  def _get_system_proxy(is_ssl):
    """Return the system proxy settings for http or https, if any."""
    return platformsettings.get_system_proxy(is_ssl)

  def _get_connection(self, request_host, request_port, is_ssl):
    """Return a detailed connection object for host/port pair.

    If a system proxy is defined (see platformsettings.py), it will be used.

    Args:
      request_host: a host string (e.g. "www.example.com").
      request_port: a port integer (e.g. 8080) or None (for the default port).
      is_ssl: True if HTTPS connection is needed.
    Returns:
      A DetailedHTTPSConnection or DetailedHTTPConnection instance,
      or None if a DNS lookup was requested and failed.
    """
    connection_host = request_host
    connection_port = request_port
    system_proxy = self._get_system_proxy(is_ssl)
    if system_proxy:
      connection_host = system_proxy.host
      connection_port = system_proxy.port

    # Use an IP address because WPR may override DNS settings.
    if self._real_dns_lookup:
      connection_ip = self._real_dns_lookup(connection_host)
      if not connection_ip:
        logging.critical(
            'Unable to find IP for host name: %s', connection_host)
        return None
      connection_host = connection_ip

    if is_ssl:
      connection = DetailedHTTPSConnection(connection_host, connection_port)
      if system_proxy:
        # Tunnel through the proxy to the real destination (CONNECT).
        connection.set_tunnel(request_host, request_port)
    else:
      connection = DetailedHTTPConnection(connection_host, connection_port)
    return connection

  def __call__(self, request):
    """Fetch an HTTP request.

    Args:
      request: an ArchivedHttpRequest
    Returns:
      an ArchivedHttpResponse, or None if the fetch fails after retries.
    """
    logging.debug('RealHttpFetch: %s %s', request.host, request.full_path)
    request_host, request_port = self._get_request_host_port(request)
    # Any exception in the try body (connect, request, read) is retried
    # up to 3 times before giving up.
    retries = 3
    while True:
      try:
        # NOTE(review): _get_connection may return None on DNS failure;
        # the resulting AttributeError on .connect() is caught below and
        # counted as a retry.
        connection = self._get_connection(
            request_host, request_port, request.is_ssl)
        connect_start = TIMER()
        connection.connect()
        # Delays are recorded in whole milliseconds.
        connect_delay = int((TIMER() - connect_start) * 1000)
        start = TIMER()
        connection.request(
            request.command,
            request.full_path,
            request.request_body,
            request.headers)
        response = connection.getresponse()
        # Time from sending the request to receiving the response headers.
        headers_delay = int((TIMER() - start) * 1000)

        chunks, chunk_delays = response.read_chunks()
        delays = {
            'connect': connect_delay,
            'headers': headers_delay,
            'data': chunk_delays
            }
        archived_http_response = httparchive.ArchivedHttpResponse(
            response.version,
            response.status,
            response.reason,
            RealHttpFetch._ToTuples(response.msg.headers),
            chunks,
            delays)
        return archived_http_response
      except Exception, e:
        if retries:
          retries -= 1
          logging.warning('Retrying fetch %s: %s', request, repr(e))
          continue
        logging.critical('Could not fetch %s: %s', request, repr(e))
        return None
| 364 | |
| 365 | |
class RecordHttpArchiveFetch(object):
  """Make real HTTP fetches and save responses in the given HttpArchive."""

  def __init__(self, http_archive, inject_script):
    """Initialize RecordHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      inject_script: script string to inject in all pages
    """
    self.http_archive = http_archive
    self.inject_script = inject_script
    # Record without resolving host names to IPs: doing the lookup here
    # caused SSL3 handshake failures.
    # See https://github.com/chromium/web-page-replay/issues/73 for details.
    self.real_http_fetch = RealHttpFetch(real_dns_lookup=None)

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an ArchivedHttpRequest.
    Returns:
      an ArchivedHttpResponse, or None if the real fetch failed.
    """
    if request not in self.http_archive:
      # New request: go to the network, then archive the result.
      response = self.real_http_fetch(request)
      if response is None:
        return None
      self.http_archive[request] = response
    else:
      # Already recorded: serve the archived copy.
      logging.debug('Repeated request found: %s', request)
      response = self.http_archive[request]
    if self.inject_script:
      response = _InjectScripts(response, self.inject_script)
    logging.debug('Recorded: %s', request)
    return response
| 404 | |
| 405 | |
class ReplayHttpArchiveFetch(object):
  """Serve responses from the given HttpArchive."""

  def __init__(self, http_archive, real_dns_lookup, inject_script,
               use_diff_on_unknown_requests=False,
               use_closest_match=False, scramble_images=False):
    """Initialize ReplayHttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, shuffle the pixels of served images.
    """
    self.http_archive = http_archive
    self.real_http_fetch = RealHttpFetch(real_dns_lookup)
    self.inject_script = inject_script
    self.use_diff_on_unknown_requests = use_diff_on_unknown_requests
    self.use_closest_match = use_closest_match
    self.scramble_images = scramble_images

  def __call__(self, request):
    """Fetch the request and return the response.

    Args:
      request: an instance of an ArchivedHttpRequest.
    Returns:
      Instance of ArchivedHttpResponse (if found) or None
    """
    # Requests aimed at the local machine bypass the archive entirely.
    if request.host.startswith('127.0.0.1:'):
      return self.real_http_fetch(request)

    response = self.http_archive.get(request)

    if not response and self.use_closest_match:
      fallback = self.http_archive.find_closest_request(
          request, use_path=True)
      if fallback:
        response = self.http_archive.get(fallback)
        if response:
          logging.info('Request not found: %s\nUsing closest match: %s',
                       request, fallback)

    if response:
      if self.inject_script:
        response = _InjectScripts(response, self.inject_script)
      if self.scramble_images:
        response = _ScrambleImages(response)
      return response

    # Not in the archive: log why, optionally with a diff to the nearest
    # recorded request.
    reason = str(request)
    if self.use_diff_on_unknown_requests:
      diff = self.http_archive.diff(request)
      if diff:
        reason += (
            "\nNearest request diff "
            "('-' for archived request, '+' for current request):\n%s" % diff)
    logging.warning('Could not replay: %s', reason)
    return response
| 467 | |
| 468 | |
class ControllableHttpArchiveFetch(object):
  """Controllable fetch function that can swap between record and replay."""

  def __init__(self, http_archive, real_dns_lookup,
               inject_script, use_diff_on_unknown_requests,
               use_record_mode, use_closest_match, scramble_images):
    """Initialize HttpArchiveFetch.

    Args:
      http_archive: an instance of a HttpArchive
      real_dns_lookup: a function that resolves a host to an IP.
      inject_script: script string to inject in all pages.
      use_diff_on_unknown_requests: If True, log unknown requests
        with a diff to requests that look similar.
      use_record_mode: If True, start the server in record mode.
      use_closest_match: If True, on replay mode, serve the closest match
        in the archive instead of giving a 404.
      scramble_images: If True, shuffle the pixels of served images.
    """
    self.http_archive = http_archive
    # Both fetchers are built up front; mode switching just repoints
    # self.fetch at one of them.
    self.record_fetch = RecordHttpArchiveFetch(http_archive, inject_script)
    self.replay_fetch = ReplayHttpArchiveFetch(
        http_archive, real_dns_lookup, inject_script,
        use_diff_on_unknown_requests, use_closest_match, scramble_images)
    (self.SetRecordMode if use_record_mode else self.SetReplayMode)()

  def SetRecordMode(self):
    """Route subsequent fetches through the recording fetcher."""
    self.is_record_mode = True
    self.fetch = self.record_fetch

  def SetReplayMode(self):
    """Route subsequent fetches through the replaying fetcher."""
    self.is_record_mode = False
    self.fetch = self.replay_fetch

  def __call__(self, *args, **kwargs):
    """Forward calls to Replay/Record fetch functions depending on mode."""
    return self.fetch(*args, **kwargs)
| OLD | NEW |