Index: tools/android/loading/sandwich_misc.py |
diff --git a/tools/android/loading/sandwich_misc.py b/tools/android/loading/sandwich_misc.py |
deleted file mode 100644 |
index 172f4e40b0552eeedd797d36d4f1f72c18a39932..0000000000000000000000000000000000000000 |
--- a/tools/android/loading/sandwich_misc.py |
+++ /dev/null |
@@ -1,402 +0,0 @@ |
-# Copyright 2016 The Chromium Authors. All rights reserved. |
-# Use of this source code is governed by a BSD-style license that can be |
-# found in the LICENSE file. |
- |
-import json
-import logging
-import os |
-import re |
-from urlparse import urlparse |
- |
-import chrome_cache |
-import common_util |
-from loading_trace import LoadingTrace |
-from prefetch_view import PrefetchSimulationView |
-from request_dependencies_lens import RequestDependencyLens |
-import sandwich_runner |
-import wpr_backend |
- |
- |
-# Does not prefetch anything.
-EMPTY_CACHE_DISCOVERER = 'empty-cache' |
- |
-# Prefetches everything to load fully from cache (impossible in practice). |
-FULL_CACHE_DISCOVERER = 'full-cache' |
- |
-# Prefetches only the main resource, after following its redirection chain.
-REDIRECTED_MAIN_DISCOVERER = 'redirected-main' |
- |
-# Prefetches all resources fetched from the main document and its
-# redirection chain.
-PARSER_DISCOVERER = 'parser' |
- |
-# Simulates the HTMLPreloadScanner on the main document and its redirection
-# chain.
-HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner' |
- |
-SUBRESOURCE_DISCOVERERS = set([ |
- EMPTY_CACHE_DISCOVERER, |
- FULL_CACHE_DISCOVERER, |
- REDIRECTED_MAIN_DISCOVERER, |
- PARSER_DISCOVERER, |
- HTML_PRELOAD_SCANNER_DISCOVERER |
-]) |
- |
-_UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
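-# Matches cache entry keys generated for requests carrying an upload data
-# stream (e.g. POST requests). Such a key looks like
-# '1/https://example.com/submit' (illustrative): the numeric prefix is the
-# upload data stream's identifier and group('url') captures the original URL.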
- |
- |
-def PatchWpr(wpr_archive_path): |
- """Patches a WPR archive to get all resources into the HTTP cache and avoid |
- invalidation and revalidations. |
- |
- Args: |
- wpr_archive_path: Path of the WPR archive to patch. |
- """ |
- # Sets the resources cache max-age to 10 years. |
- MAX_AGE = 10 * 365 * 24 * 60 * 60 |
- CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
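-  # With MAX_AGE = 10 * 365 * 24 * 60 * 60 seconds, CACHE_CONTROL resolves to
-  # 'public, max-age=315360000'.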
- |
- wpr_archive = wpr_backend.WprArchiveBackend(wpr_archive_path) |
- for url_entry in wpr_archive.ListUrlEntries(): |
- response_headers = url_entry.GetResponseHeadersDict() |
- if 'cache-control' in response_headers and \ |
- response_headers['cache-control'] == CACHE_CONTROL: |
- continue |
- logging.info('patching %s' % url_entry.url) |
- # TODO(gabadie): may need to patch Last-Modified and If-Modified-Since. |
- # TODO(gabadie): may need to delete ETag. |
- # TODO(gabadie): may need to take care of x-cache. |
- # |
-    # Override the cache-control header to set each resource's max-age to
-    # MAX_AGE.
-    #
-    # Important note: Some resources holding sensitive information might have
-    # cache-control set to no-store, which allows the resource to be cached in
-    # memory but not in the file system. NoState-Prefetch is going to take
-    # care of this case. But here, to simulate NoState-Prefetch, we have no
-    # choice but to save absolutely all cached resources on disk so that they
-    # survive killing Chrome for cache save, modification and push.
- url_entry.SetResponseHeader('cache-control', CACHE_CONTROL) |
- |
- # TODO(gabadie): May need to extend Vary blacklist (referer?) |
- # |
-    # All of these Vary and Pragma directives need to be removed from the
-    # response headers in order for Chrome to store a resource in the HTTP
-    # cache and not invalidate it.
- # |
- # Note: HttpVaryData::Init() in Chrome adds an implicit 'Vary: cookie' |
- # header to any redirect. |
- # TODO(gabadie): Find a way to work around this issue. |
- url_entry.RemoveResponseHeaderDirectives('vary', {'*', 'cookie'}) |
- url_entry.RemoveResponseHeaderDirectives('pragma', {'no-cache'}) |
- |
- wpr_archive.Persist() |
- |
- |
-def _FilterOutDataAndIncompleteRequests(requests): |
- for request in filter(lambda r: not r.IsDataRequest(), requests): |
-    # The protocol is only known once the response has been received. But the
-    # trace recording might have been stopped while some JavaScript-originated
-    # requests had not yet received any response.
- if request.protocol is None: |
- assert not request.HasReceivedResponse() |
- assert request.initiator['type'] == 'script' |
- continue |
- if request.protocol not in {'http/0.9', 'http/1.0', 'http/1.1'}: |
- raise RuntimeError('Unknown request protocol {}'.format(request.protocol)) |
- yield request |
- |
- |
-def PatchCacheArchive(cache_archive_path, loading_trace_path, |
- cache_archive_dest_path): |
- """Patch the cache archive. |
- |
- Note: This method update the raw response headers of cache entries' to store |
- the ones such as Set-Cookie that were pruned by the |
- net::HttpCacheTransaction, and remove the stream index 2 holding resource's |
- compile meta data. |
- |
- Args: |
- cache_archive_path: Input archive's path to patch. |
- loading_trace_path: Path of the loading trace that have recorded the cache |
- archive <cache_archive_path>. |
- cache_archive_dest_path: Archive destination's path. |
- """ |
- trace = LoadingTrace.FromJsonFile(loading_trace_path) |
- with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path: |
- cache_path = os.path.join(tmp_path, 'cache') |
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path) |
- cache_backend = chrome_cache.CacheBackend(cache_path, 'simple') |
- cache_entries = set(cache_backend.ListKeys()) |
- logging.info('Original cache size: %d bytes' % cache_backend.GetSize()) |
- for request in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
-      # For requests having an upload data stream, such as POST requests,
-      # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
-      # the upload data stream's session-unique identifier.
-      #
-      # It is fine not to patch these requests: when Chrome is reopened, the
-      # entry cannot be reused because the upload data stream's identifier
-      # will be different.
-      #
-      # The fact that these entries are kept in the cache after closing Chrome
-      # properly, by closing the Chrome tab as ChromeControler.SetSlowDeath()
-      # does, is a known Chrome bug (crbug.com/610725).
- if request.url not in cache_entries: |
- continue |
-      # Chrome prunes Set-Cookie from response headers before storing them in
-      # the disk cache. Also, it adds an implicit "Vary: cookie" header to all
-      # redirect response headers. Sandwich manages the cache, but between
-      # recording the cache and benchmarking, the cookie jar is invalidated.
-      # This leads to the invalidation of all cacheable redirects.
- raw_headers = request.GetRawResponseHeaders() |
- cache_backend.UpdateRawResponseHeaders(request.url, raw_headers) |
- # NoState-Prefetch would only fetch the resources, but not parse them. |
- cache_backend.DeleteStreamForKey(request.url, 2) |
- chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path) |
- logging.info('Patched cache size: %d bytes' % cache_backend.GetSize()) |
- |
- |
-def ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer): |
- """Extracts discoverable resource urls from a loading trace according to a |
- sub-resource discoverer. |
- |
- Args: |
- loading_trace_path: The loading trace's path. |
- subresource_discoverer: The sub-resources discoverer that should white-list |
- the resources to keep in cache for the NoState-Prefetch benchmarks. |
- |
- Returns: |
- A set of urls. |
- """ |
- assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \ |
- 'unknown prefetch simulation {}'.format(subresource_discoverer) |
- |
- # Load trace and related infos. |
- logging.info('loading %s' % loading_trace_path) |
- trace = LoadingTrace.FromJsonFile(loading_trace_path) |
- dependencies_lens = RequestDependencyLens(trace) |
- first_resource_request = trace.request_track.GetFirstResourceRequest() |
- |
- # Build the list of discovered requests according to the desired simulation. |
- discovered_requests = [] |
- if subresource_discoverer == EMPTY_CACHE_DISCOVERER: |
- pass |
- elif subresource_discoverer == FULL_CACHE_DISCOVERER: |
- discovered_requests = trace.request_track.GetEvents() |
- elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: |
- discovered_requests = \ |
- [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] |
- elif subresource_discoverer == PARSER_DISCOVERER: |
- discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( |
- first_resource_request, dependencies_lens) |
- elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: |
- discovered_requests = PrefetchSimulationView.PreloadedRequests( |
- first_resource_request, dependencies_lens, trace) |
- else: |
- assert False |
- |
- whitelisted_urls = set() |
- logging.info('white-listing %s' % first_resource_request.url) |
- for request in _FilterOutDataAndIncompleteRequests(discovered_requests): |
- logging.info('white-listing %s' % request.url) |
- whitelisted_urls.add(request.url) |
- return whitelisted_urls |
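-
-# Illustrative usage sketch (the trace path below is hypothetical):
-#   whitelisted_urls = ExtractDiscoverableUrls(
-#       '/tmp/trace.json', HTML_PRELOAD_SCANNER_DISCOVERER)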
- |
- |
-def _PrintUrlSetComparison(ref_url_set, url_set, url_set_name): |
- """Compare URL sets and log the diffs. |
- |
- Args: |
- ref_url_set: Set of reference urls. |
- url_set: Set of urls to compare to the reference. |
- url_set_name: The set name for logging purposes. |
- """ |
- assert type(ref_url_set) == set |
- assert type(url_set) == set |
- if ref_url_set == url_set: |
- logging.info(' %d %s are matching.' % (len(ref_url_set), url_set_name)) |
- return |
- missing_urls = ref_url_set.difference(url_set) |
- unexpected_urls = url_set.difference(ref_url_set) |
- logging.error(' %s are not matching (expected %d, had %d)' % \ |
- (url_set_name, len(ref_url_set), len(url_set))) |
- logging.error(' List of %d missing resources:' % len(missing_urls)) |
- for url in sorted(missing_urls): |
- logging.error('- ' + url) |
- logging.error(' List of %d unexpected resources:' % len(unexpected_urls)) |
- for url in sorted(unexpected_urls): |
- logging.error('+ ' + url) |
- |
- |
-class RequestOutcome(object):
- All, ServedFromCache, NotServedFromCache, Post = range(4) |
- |
- |
-def ListUrlRequests(trace, request_kind): |
- """Lists requested URLs from a trace. |
- |
- Args: |
- trace: (LoadingTrace) loading trace. |
- request_kind: RequestOutcome indicating the subset of requests to output. |
- |
- Returns: |
- set([str]) |
- """ |
- urls = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- if (request_kind == RequestOutcome.ServedFromCache and |
- request_event.from_disk_cache): |
- urls.add(request_event.url) |
- elif (request_kind == RequestOutcome.Post and |
- request_event.method.upper().strip() == 'POST'): |
- urls.add(request_event.url) |
- elif (request_kind == RequestOutcome.NotServedFromCache and |
- not request_event.from_disk_cache): |
- urls.add(request_event.url) |
- elif request_kind == RequestOutcome.All: |
- urls.add(request_event.url) |
- return urls |
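-
-# Example (illustrative): ListUrlRequests(trace, RequestOutcome.Post) returns
-# the set of URLs of all POST requests observed in the trace.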
- |
- |
-def VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
- benchmark_output_directory_path): |
- """Verifies that all run inside the run_output_directory worked as expected. |
- |
- Args: |
- benchmark_setup_path: Path of the JSON of the benchmark setup. |
- benchmark_output_directory_path: Path of the benchmark output directory to |
- verify. |
- """ |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
-  with open(benchmark_setup_path) as benchmark_setup_file:
-    benchmark_setup = json.load(benchmark_setup_file)
- cache_whitelist = set(benchmark_setup['cache_whitelist']) |
- original_requests = set(benchmark_setup['url_resources']) |
- original_cached_requests = original_requests.intersection(cache_whitelist) |
- original_uncached_requests = original_requests.difference(cache_whitelist) |
- all_sent_url_requests = set() |
- |
- # Verify requests from traces. |
- run_id = -1 |
- while True: |
- run_id += 1 |
- run_path = os.path.join(benchmark_output_directory_path, str(run_id)) |
- if not os.path.isdir(run_path): |
- break |
- trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) |
- if not os.path.isfile(trace_path): |
- logging.error('missing trace %s' % trace_path) |
- continue |
- trace = LoadingTrace.FromJsonFile(trace_path) |
- logging.info('verifying %s from %s' % (trace.url, trace_path)) |
- |
- effective_requests = ListUrlRequests(trace, RequestOutcome.All) |
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post) |
- effective_cached_requests = \ |
- ListUrlRequests(trace, RequestOutcome.ServedFromCache) |
- effective_uncached_requests = \ |
- ListUrlRequests(trace, RequestOutcome.NotServedFromCache) |
- |
- missing_requests = original_requests.difference(effective_requests) |
- unexpected_requests = effective_requests.difference(original_requests) |
- expected_cached_requests = \ |
- original_cached_requests.difference(missing_requests) |
- missing_cached_requests = \ |
- expected_cached_requests.difference(effective_cached_requests) |
- expected_uncached_requests = original_uncached_requests.union( |
- unexpected_requests).union(missing_cached_requests) |
- all_sent_url_requests.update(effective_uncached_requests) |
- |
- # POST requests are known to be unable to use the cache. |
- expected_cached_requests.difference_update(effective_post_requests) |
- expected_uncached_requests.update(effective_post_requests) |
- |
- _PrintUrlSetComparison(original_requests, effective_requests, |
- 'All resources') |
- _PrintUrlSetComparison(set(), effective_post_requests, |
- 'POST resources') |
- _PrintUrlSetComparison(expected_cached_requests, effective_cached_requests, |
- 'Cached resources') |
- _PrintUrlSetComparison(expected_uncached_requests, |
- effective_uncached_requests, 'Non cached resources') |
- |
- # Verify requests from WPR. |
- wpr_log_path = os.path.join( |
- benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME) |
- logging.info('verifying requests from %s' % wpr_log_path) |
- all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
- all_wpr_urls = set() |
- unserved_wpr_urls = set() |
- wpr_command_colliding_urls = set() |
- |
- for request in all_wpr_requests: |
- if request.is_wpr_host: |
- continue |
- if urlparse(request.url).path.startswith('/web-page-replay'): |
- wpr_command_colliding_urls.add(request.url) |
- elif request.is_served is False: |
- unserved_wpr_urls.add(request.url) |
- all_wpr_urls.add(request.url) |
- |
- _PrintUrlSetComparison(set(), unserved_wpr_urls, |
- 'Distinct unserved resources from WPR') |
- _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
- 'Distinct resources colliding to WPR commands') |
- _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests, |
- 'Distinct resource requests to WPR') |
- |
- |
-def ReadSubresourceFromRunnerOutputDir(runner_output_dir): |
- """Extracts a list of subresources in runner output directory. |
- |
- Args: |
- runner_output_dir: Path of the runner's output directory. |
- |
- Returns: |
-    A list of subresource URLs.
- """ |
- trace_path = os.path.join( |
- runner_output_dir, '0', sandwich_runner.TRACE_FILENAME) |
- trace = LoadingTrace.FromJsonFile(trace_path) |
- url_set = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- url_set.add(request_event.url) |
-  logging.info('lists %d resources of %s from %s' % \
-      (len(url_set), trace.url, trace_path))
-  return list(url_set)
- |
- |
-def ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
- """Validates a cache archive content. |
- |
- Args: |
-    cache_build_trace_path: Path of the trace generated at cache build time.
- cache_archive_path: Cache archive's path to validate. |
- """ |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
- logging.info('lists cached urls from %s' % cache_archive_path) |
- with common_util.TemporaryDirectory() as cache_directory: |
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory) |
- cache_keys = set( |
- chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys()) |
- trace = LoadingTrace.FromJsonFile(cache_build_trace_path) |
- effective_requests = ListUrlRequests(trace, RequestOutcome.All) |
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post) |
- |
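-  # Cache entry keys generated for requests with an upload data stream (e.g.
-  # POST requests) carry a numeric prefix; collect them separately so they can
-  # be compared against the POST requests seen in the trace.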
- upload_data_stream_cache_entry_keys = set() |
- upload_data_stream_requests = set() |
- for cache_entry_key in cache_keys: |
- match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key) |
- if not match: |
- continue |
- upload_data_stream_cache_entry_keys.add(cache_entry_key) |
- upload_data_stream_requests.add(match.group('url')) |
- |
- expected_cached_requests = effective_requests.difference( |
- effective_post_requests) |
- effective_cache_keys = cache_keys.difference( |
- upload_data_stream_cache_entry_keys) |
- |
- _PrintUrlSetComparison(effective_post_requests, upload_data_stream_requests, |
- 'POST resources') |
- _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys, |
- 'Cached resources') |