OLD | NEW |
1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """ | 5 """ |
6 Implements a task builder for benchmarking effects of NoState Prefetch. | 6 Implements a task builder for benchmarking effects of NoState Prefetch. |
7 Notable steps of the task pipeline: | 7 Notable steps of the task pipeline: |
8 * Save a WPR archive | 8 * Save a WPR archive |
9 * Process the WPR archive to make all resources cacheable | 9 * Process the WPR archive to make all resources cacheable |
10 * Process cache archive to patch response headers back to their original | 10 * Process cache archive to patch response headers back to their original |
11 values. | 11 values. |
12 * Find out which resources are discoverable by NoState Prefetch | 12 * Find out which resources are discoverable by NoState Prefetch |
13 (HTMLPreloadScanner) | 13 (HTMLPreloadScanner) |
14 * Load pages with empty/full/prefetched cache | 14 * Load pages with empty/full/prefetched cache |
15 * Extract most important metrics to a CSV | 15 * Extract most important metrics to a CSV |
16 """ | 16 """ |
17 | 17 |
18 import csv | 18 import csv |
19 import logging | 19 import logging |
20 import json | 20 import json |
21 import os | 21 import os |
22 import re | 22 import re |
23 import shutil | 23 import shutil |
24 from urlparse import urlparse | 24 import urlparse |
25 | 25 |
26 import chrome_cache | 26 import chrome_cache |
27 import common_util | 27 import common_util |
28 import loading_trace | 28 import loading_trace |
29 from prefetch_view import PrefetchSimulationView | 29 from prefetch_view import PrefetchSimulationView |
30 from request_dependencies_lens import RequestDependencyLens | 30 from request_dependencies_lens import RequestDependencyLens |
31 import sandwich_metrics | 31 import sandwich_metrics |
32 import sandwich_runner | 32 import sandwich_runner |
| 33 import sandwich_utils |
33 import task_manager | 34 import task_manager |
34 import wpr_backend | 35 import wpr_backend |
35 | 36 |
36 | 37 |
37 class Discoverer(object): | 38 class Discoverer(object): |
38 # Do not prefetch anything. | 39 # Do not prefetch anything. |
39 EmptyCache = 'empty-cache' | 40 EmptyCache = 'empty-cache' |
40 | 41 |
41 # Prefetches everything to load fully from cache (impossible in practice). | 42 # Prefetches everything to load fully from cache (impossible in practice). |
42 FullCache = 'full-cache' | 43 FullCache = 'full-cache' |
(...skipping 19 matching lines...) |
62 Discoverer.MainDocument, | 63 Discoverer.MainDocument, |
63 Discoverer.Parser, | 64 Discoverer.Parser, |
64 Discoverer.HTMLPreloadScanner, | 65 Discoverer.HTMLPreloadScanner, |
65 Discoverer.HTMLPreloadScannerStore, | 66 Discoverer.HTMLPreloadScannerStore, |
66 ]) | 67 ]) |
67 | 68 |
68 | 69 |
69 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') | 70 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
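Judging from the pattern, upload-data-stream cache keys look like `<number>/<url>`; a quick illustrative check (the key below is made up):

    # Illustration: recover the URL part from a '<number>/<url>' cache key.
    match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match('1/https://example.com/form')
    assert match and match.group('url') == 'https://example.com/form'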
70 | 71 |
71 | 72 |
| 73 def _NormalizeUrl(url): |
| 74 """Returns normalized URL such as removing trailing slashes.""" |
| 75 parsed_url = list(urlparse.urlparse(url)) |
| 76 parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2]) |
| 77 return urlparse.urlunparse(parsed_url) |
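A quick illustration of the helper's effect (hypothetical URL): only runs of slashes in the path component are collapsed; scheme, host, and query are left untouched:

    # Illustration (made-up URL): duplicated slashes in the path collapse.
    assert (_NormalizeUrl('http://example.com//a///b?q=1')
            == 'http://example.com/a/b?q=1')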
| 78 |
| 79 |
72 def _PatchWpr(wpr_archive): | 80 def _PatchWpr(wpr_archive): |
73 """Patches a WPR archive to get all resources into the HTTP cache and avoid | 81 """Patches a WPR archive to get all resources into the HTTP cache and avoid |
74 invalidations and revalidations. | 82 invalidations and revalidations. |
75 | 83 |
76 Args: | 84 Args: |
77 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. | 85 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. |
78 """ | 86 """ |
79 # Sets the resources cache max-age to 10 years. | 87 # Sets the resources cache max-age to 10 years. |
80 MAX_AGE = 10 * 365 * 24 * 60 * 60 | 88 MAX_AGE = 10 * 365 * 24 * 60 * 60 |
81 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) | 89 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
(...skipping 112 matching lines...) |
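The body of `_PatchWpr` is elided above; purely as a hypothetical sketch of the kind of rewrite its constants suggest (`SetResponseHeader` is an assumed `wpr_backend` entry method, not confirmed by this diff):

    # Hypothetical sketch only -- the real patching code is elided above.
    for entry in wpr_archive.ListUrlEntries():
      # Make every recorded response cacheable for ten years.
      entry.SetResponseHeader('cache-control', CACHE_CONTROL)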
194 logging.info('number of requests discovered by %s: %d', | 202 logging.info('number of requests discovered by %s: %d', |
195 subresource_discoverer, len(requests)) | 203 subresource_discoverer, len(requests)) |
196 return requests | 204 return requests |
197 | 205 |
198 | 206 |
199 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): | 207 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): |
200 with open(original_headers_path) as file_input: | 208 with open(original_headers_path) as file_input: |
201 original_headers = json.load(file_input) | 209 original_headers = json.load(file_input) |
202 pruned_requests = set() | 210 pruned_requests = set() |
203 for request in requests: | 211 for request in requests: |
204 request_original_headers = original_headers[request.url] | 212 url = _NormalizeUrl(request.url) |
| 213 if url not in original_headers: |
| 214 # TODO(gabadie): Investigate why these requests were not in WPR. |
| 215 assert request.failed |
| 216 logging.warning( |
| 217 'could not find original headers for: %s (failure: %s)', |
| 218 url, request.error_text) |
| 219 continue |
| 220 request_original_headers = original_headers[url] |
205 if ('cache-control' in request_original_headers and | 221 if ('cache-control' in request_original_headers and |
206 'no-store' in request_original_headers['cache-control'].lower()): | 222 'no-store' in request_original_headers['cache-control'].lower()): |
207 pruned_requests.add(request) | 223 pruned_requests.add(request) |
208 return [r for r in requests if r not in pruned_requests] | 224 return [r for r in requests if r not in pruned_requests] |
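The shape of the headers JSON follows from `BuildPatchedWpr` further down, which dumps a `{url: headers_dict}` mapping; a tiny illustrative input (URLs are made up):

    # Illustration: a request whose original response carried
    # 'cache-control: no-store' is pruned; the other survives.
    original_headers = {
        'http://a.test/': {'cache-control': 'private, no-store'},
        'http://a.test/app.js': {'cache-control': 'max-age=3600'},
    }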
209 | 225 |
210 | 226 |
211 def _ExtractDiscoverableUrls( | 227 def _ExtractDiscoverableUrls( |
212 original_headers_path, loading_trace_path, subresource_discoverer): | 228 original_headers_path, loading_trace_path, subresource_discoverer): |
213 """Extracts discoverable resource urls from a loading trace according to a | 229 """Extracts discoverable resource urls from a loading trace according to a |
214 sub-resource discoverer. | 230 sub-resource discoverer. |
(...skipping 147 matching lines...) |
362 def VerifyWprLog(self, wpr_log_path): | 378 def VerifyWprLog(self, wpr_log_path): |
363 """Verifies WPR log with previously verified traces.""" | 379 """Verifies WPR log with previously verified traces.""" |
364 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) | 380 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
365 all_wpr_urls = set() | 381 all_wpr_urls = set() |
366 unserved_wpr_urls = set() | 382 unserved_wpr_urls = set() |
367 wpr_command_colliding_urls = set() | 383 wpr_command_colliding_urls = set() |
368 | 384 |
369 for request in all_wpr_requests: | 385 for request in all_wpr_requests: |
370 if request.is_wpr_host: | 386 if request.is_wpr_host: |
371 continue | 387 continue |
372 if urlparse(request.url).path.startswith('/web-page-replay'): | 388 if urlparse.urlparse(request.url).path.startswith('/web-page-replay'): |
373 wpr_command_colliding_urls.add(request.url) | 389 wpr_command_colliding_urls.add(request.url) |
374 elif request.is_served is False: | 390 elif request.is_served is False: |
375 unserved_wpr_urls.add(request.url) | 391 unserved_wpr_urls.add(request.url) |
376 all_wpr_urls.add(request.url) | 392 all_wpr_urls.add(request.url) |
377 | 393 |
378 _PrintUrlSetComparison(set(), unserved_wpr_urls, | 394 _PrintUrlSetComparison(set(), unserved_wpr_urls, |
379 'Distinct unserved resources from WPR') | 395 'Distinct unserved resources from WPR') |
380 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, | 396 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
381 'Distinct resources colliding to WPR commands') | 397 'Distinct resources colliding to WPR commands') |
382 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, | 398 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
(...skipping 91 matching lines...) |
474 logging.info('loading trace: %s', trace_path) | 490 logging.info('loading trace: %s', trace_path) |
475 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) | 491 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
476 | 492 |
477 logging.info('verifying trace: %s', trace_path) | 493 logging.info('verifying trace: %s', trace_path) |
478 run_output_verifier.VerifyTrace(trace) | 494 run_output_verifier.VerifyTrace(trace) |
479 | 495 |
480 logging.info('extracting metrics from trace: %s', trace_path) | 496 logging.info('extracting metrics from trace: %s', trace_path) |
481 served_from_network_bytes = 0 | 497 served_from_network_bytes = 0 |
482 served_from_cache_bytes = 0 | 498 served_from_cache_bytes = 0 |
483 urls_hitting_network = set() | 499 urls_hitting_network = set() |
| 500 response_sizes = {} |
484 for request in _FilterOutDataAndIncompleteRequests( | 501 for request in _FilterOutDataAndIncompleteRequests( |
485 trace.request_track.GetEvents()): | 502 trace.request_track.GetEvents()): |
486 # Ignore requests served from Blink's cache. | 503 # Ignore requests served from Blink's cache. |
487 if request.served_from_cache: | 504 if request.served_from_cache: |
488 continue | 505 continue |
489 urls_hitting_network.add(request.url) | 506 urls_hitting_network.add(request.url) |
490 if request.from_disk_cache: | 507 if request.from_disk_cache: |
491 served_from_cache_bytes += cached_encoded_data_lengths[request.url] | 508 if request.url in cached_encoded_data_lengths: |
| 509 response_size = cached_encoded_data_lengths[request.url] |
| 510 else: |
| 511 # Some heavy webpages may overflow the memory cache, so a request |
| 512 # might be served from the disk cache more than once per page |
| 513 # load. |
| 514 logging.warning('Looks like it could be served from memory cache: %s', |
| 515 request.url) |
| 516 response_size = response_sizes[request.url] |
| 517 served_from_cache_bytes += response_size |
492 else: | 518 else: |
493 served_from_network_bytes += request.GetEncodedDataLength() | 519 response_size = request.GetEncodedDataLength() |
| 520 served_from_network_bytes += response_size |
| 521 response_sizes[request.url] = response_size |
494 | 522 |
495 # Make sure each request served from Blink's cache has at least one | 523 # Make sure each request served from Blink's cache has at least one |
496 # corresponding request that was not served from Blink's cache. | 524 # corresponding request that was not served from Blink's cache. |
497 for request in _FilterOutDataAndIncompleteRequests( | 525 for request in _FilterOutDataAndIncompleteRequests( |
498 trace.request_track.GetEvents()): | 526 trace.request_track.GetEvents()): |
499 assert (request.url in urls_hitting_network or | 527 assert (request.url in urls_hitting_network or |
500 not request.served_from_cache) | 528 not request.served_from_cache) |
501 | 529 |
502 run_metrics = { | 530 run_metrics = { |
503 'url': trace.url, | 531 'url': trace.url, |
(...skipping 63 matching lines...) |
567 dependencies=[self._common_builder.original_wpr_task]) | 595 dependencies=[self._common_builder.original_wpr_task]) |
568 def BuildPatchedWpr(): | 596 def BuildPatchedWpr(): |
569 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) | 597 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) |
570 shutil.copyfile( | 598 shutil.copyfile( |
571 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) | 599 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) |
572 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) | 600 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) |
573 | 601 |
574 # Save the original response headers. | 602 # Save the original response headers. |
575 original_response_headers = {e.url: e.GetResponseHeadersDict() \ | 603 original_response_headers = {e.url: e.GetResponseHeadersDict() \ |
576 for e in wpr_archive.ListUrlEntries()} | 604 for e in wpr_archive.ListUrlEntries()} |
| 605 logging.info('saving response headers for %d resources', |
| 606 len(original_response_headers)) |
| 607 if not original_response_headers: |
| 608 # TODO(gabadie): How is it possible to not even have the main resource |
| 609 # in the WPR archive? Example URL can be found in: |
| 610 # http://crbug.com/623966#c5 |
| 611 raise Exception( |
| 612 'Looks like no resources were recorded in WPR during: {}'.format( |
| 613 self._common_builder.original_wpr_task.name)) |
577 with open(self._original_headers_path, 'w') as file_output: | 614 with open(self._original_headers_path, 'w') as file_output: |
578 json.dump(original_response_headers, file_output) | 615 json.dump(original_response_headers, file_output) |
579 | 616 |
580 # Patch WPR. | 617 # Patch WPR. |
581 _PatchWpr(wpr_archive) | 618 _PatchWpr(wpr_archive) |
582 wpr_archive.Persist() | 619 wpr_archive.Persist() |
583 | 620 |
584 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) | 621 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) |
585 def BuildOriginalCache(): | 622 def BuildOriginalCache(): |
586 runner = self._common_builder.CreateSandwichRunner() | 623 runner = self._common_builder.CreateSandwichRunner() |
(...skipping 111 matching lines...) |
698 run_metrics_list = _ProcessRunOutputDir( | 735 run_metrics_list = _ProcessRunOutputDir( |
699 cache_validation_result, benchmark_setup, RunBenchmark.path) | 736 cache_validation_result, benchmark_setup, RunBenchmark.path) |
700 with open(ProcessRunOutputDir.path, 'w') as csv_file: | 737 with open(ProcessRunOutputDir.path, 'w') as csv_file: |
701 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + | 738 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
702 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) | 739 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
703 writer.writeheader() | 740 writer.writeheader() |
704 for trace_metrics in run_metrics_list: | 741 for trace_metrics in run_metrics_list: |
705 writer.writerow(trace_metrics) | 742 writer.writerow(trace_metrics) |
706 | 743 |
707 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) | 744 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |