Chromium Code Reviews

Unified Diff: tools/android/loading/sandwich_misc.py

Issue 2023263002: sandwich: Move all NoState-Prefetch related code in sandwich_prefetch.py (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 4 years, 7 months ago
Index: tools/android/loading/sandwich_misc.py
diff --git a/tools/android/loading/sandwich_misc.py b/tools/android/loading/sandwich_misc.py
deleted file mode 100644
index 172f4e40b0552eeedd797d36d4f1f72c18a39932..0000000000000000000000000000000000000000
--- a/tools/android/loading/sandwich_misc.py
+++ /dev/null
@@ -1,402 +0,0 @@
-# Copyright 2016 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-import logging
-import json
-import os
-import re
-from urlparse import urlparse
-
-import chrome_cache
-import common_util
-from loading_trace import LoadingTrace
-from prefetch_view import PrefetchSimulationView
-from request_dependencies_lens import RequestDependencyLens
-import sandwich_runner
-import wpr_backend
-
-
-# Do not prefetch anything.
-EMPTY_CACHE_DISCOVERER = 'empty-cache'
-
-# Prefetches everything to load fully from cache (impossible in practice).
-FULL_CACHE_DISCOVERER = 'full-cache'
-
-# Prefetches only the main resource, after following its redirection chain.
-REDIRECTED_MAIN_DISCOVERER = 'redirected-main'
-
-# All resources which are fetched from the main document and their redirections.
-PARSER_DISCOVERER = 'parser'
-
-# Simulation of the HTMLPreloadScanner on the main document and its redirections.
-HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner'
-
-SUBRESOURCE_DISCOVERERS = set([
- EMPTY_CACHE_DISCOVERER,
- FULL_CACHE_DISCOVERER,
- REDIRECTED_MAIN_DISCOVERER,
- PARSER_DISCOVERER,
- HTML_PRELOAD_SCANNER_DISCOVERER
-])
-
-_UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$')
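-# For reference, such cache keys look like '1234/https://example.com/submit':
-# a numeric upload data stream identifier, a slash, then the request URL (the
-# key shown here is purely illustrative).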
-
-
-def PatchWpr(wpr_archive_path):
- """Patches a WPR archive to get all resources into the HTTP cache and avoid
- invalidation and revalidations.
-
- Args:
- wpr_archive_path: Path of the WPR archive to patch.
- """
- # Sets the resources cache max-age to 10 years.
- MAX_AGE = 10 * 365 * 24 * 60 * 60
- CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE)
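- # With MAX_AGE above, the patched header value is
- # 'public, max-age=315360000'.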
-
- wpr_archive = wpr_backend.WprArchiveBackend(wpr_archive_path)
- for url_entry in wpr_archive.ListUrlEntries():
- response_headers = url_entry.GetResponseHeadersDict()
- if 'cache-control' in response_headers and \
- response_headers['cache-control'] == CACHE_CONTROL:
- continue
- logging.info('patching %s' % url_entry.url)
- # TODO(gabadie): may need to patch Last-Modified and If-Modified-Since.
- # TODO(gabadie): may need to delete ETag.
- # TODO(gabadie): may need to take care of x-cache.
- #
- # Override the cache-control header to set the resources max age to MAX_AGE.
- #
- # Important note: Some resources holding sensitive information might have
- # cache-control set to no-store, which allows the resource to be cached but
- # not stored in the file system. NoState-Prefetch takes care of this case.
- # But here, to simulate NoState-Prefetch, we have no choice but to save
- # absolutely all cached resources on disk so that they survive killing
- # Chrome for cache saving, modification and pushing.
- url_entry.SetResponseHeader('cache-control', CACHE_CONTROL)
-
- # TODO(gabadie): May need to extend Vary blacklist (referer?)
- #
- # All of these Vary and Pragma possibilities need to be removed from
- # response headers in order for Chrome to store a resource in HTTP cache and
- # not to invalidate it.
- #
- # Note: HttpVaryData::Init() in Chrome adds an implicit 'Vary: cookie'
- # header to any redirect.
- # TODO(gabadie): Find a way to work around this issue.
- url_entry.RemoveResponseHeaderDirectives('vary', {'*', 'cookie'})
- url_entry.RemoveResponseHeaderDirectives('pragma', {'no-cache'})
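- # For example, a 'Vary: Accept-Encoding, Cookie' header is expected to
- # become 'Vary: Accept-Encoding', and a bare 'Pragma: no-cache' header to
- # disappear entirely (assuming this is how the wpr_backend helper strips
- # directives).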
-
- wpr_archive.Persist()
-
-
-def _FilterOutDataAndIncompleteRequests(requests):
- for request in filter(lambda r: not r.IsDataRequest(), requests):
- # The protocol is only known once the response has been received. But the
- # trace recording might have been stopped while some JavaScript-originated
- # requests had not received any response yet.
- if request.protocol is None:
- assert not request.HasReceivedResponse()
- assert request.initiator['type'] == 'script'
- continue
- if request.protocol not in {'http/0.9', 'http/1.0', 'http/1.1'}:
- raise RuntimeError('Unknown request protocol {}'.format(request.protocol))
- yield request
-
-
-def PatchCacheArchive(cache_archive_path, loading_trace_path,
- cache_archive_dest_path):
- """Patch the cache archive.
-
- Note: This method updates the raw response headers of cache entries to
- restore the ones, such as Set-Cookie, that were pruned by the
- net::HttpCacheTransaction, and removes stream index 2, which holds the
- resource's compiled meta data.
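- (In Chromium's disk cache, an entry has three streams: stream 0 holds the
- response headers, stream 1 the body, and stream 2 side data such as the
- compiled meta data mentioned above.)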
-
- Args:
- cache_archive_path: Input archive's path to patch.
- loading_trace_path: Path of the loading trace that was recorded while
- building the cache archive <cache_archive_path>.
- cache_archive_dest_path: Archive destination's path.
- """
- trace = LoadingTrace.FromJsonFile(loading_trace_path)
- with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path:
- cache_path = os.path.join(tmp_path, 'cache')
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path)
- cache_backend = chrome_cache.CacheBackend(cache_path, 'simple')
- cache_entries = set(cache_backend.ListKeys())
- logging.info('Original cache size: %d bytes' % cache_backend.GetSize())
- for request in _FilterOutDataAndIncompleteRequests(
- trace.request_track.GetEvents()):
- # On requests having an upload data stream such as POST requests,
- # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
- # the upload data stream's session unique identifier.
- #
- # It is fine not to patch these requests: when Chrome is reopened, the
- # entry cannot be reused anyway because the upload data stream's identifier
- # will be different.
- #
- # The fact that these entries are kept in the cache even after closing
- # Chrome properly (by closing the Chrome tab, as
- # ChromeControler.SetSlowDeath() does) is a known Chrome bug
- # (crbug.com/610725).
- if request.url not in cache_entries:
- continue
- # Chrome prunes Set-Cookie from response headers before storing them in the
- # disk cache. It also adds an implicit "Vary: cookie" header to all redirect
- # response headers. Sandwich manages the cache, but between recording the
- # cache and benchmarking, the cookie jar is invalidated. This leads to
- # invalidation of all cacheable redirects.
- raw_headers = request.GetRawResponseHeaders()
- cache_backend.UpdateRawResponseHeaders(request.url, raw_headers)
- # NoState-Prefetch would only fetch the resources, but not parse them.
- cache_backend.DeleteStreamForKey(request.url, 2)
- chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path)
- logging.info('Patched cache size: %d bytes' % cache_backend.GetSize())
-
-
-def ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer):
- """Extracts discoverable resource urls from a loading trace according to a
- sub-resource discoverer.
-
- Args:
- loading_trace_path: The loading trace's path.
- subresource_discoverer: The sub-resource discoverer that white-lists the
- resources to keep in the cache for the NoState-Prefetch benchmarks.
-
- Returns:
- A set of urls.
- """
- assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \
- 'unknown prefetch simulation {}'.format(subresource_discoverer)
-
- # Load trace and related infos.
- logging.info('loading %s' % loading_trace_path)
- trace = LoadingTrace.FromJsonFile(loading_trace_path)
- dependencies_lens = RequestDependencyLens(trace)
- first_resource_request = trace.request_track.GetFirstResourceRequest()
-
- # Build the list of discovered requests according to the desired simulation.
- discovered_requests = []
- if subresource_discoverer == EMPTY_CACHE_DISCOVERER:
- pass
- elif subresource_discoverer == FULL_CACHE_DISCOVERER:
- discovered_requests = trace.request_track.GetEvents()
- elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER:
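- # Keep only the last request of the main resource's redirect chain, i.e.
- # the request that actually receives the document.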
- discovered_requests = \
- [dependencies_lens.GetRedirectChain(first_resource_request)[-1]]
- elif subresource_discoverer == PARSER_DISCOVERER:
- discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests(
- first_resource_request, dependencies_lens)
- elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER:
- discovered_requests = PrefetchSimulationView.PreloadedRequests(
- first_resource_request, dependencies_lens, trace)
- else:
- assert False
-
- whitelisted_urls = set()
- logging.info('white-listing %s' % first_resource_request.url)
- for request in _FilterOutDataAndIncompleteRequests(discovered_requests):
- logging.info('white-listing %s' % request.url)
- whitelisted_urls.add(request.url)
- return whitelisted_urls
-
-
-def _PrintUrlSetComparison(ref_url_set, url_set, url_set_name):
- """Compare URL sets and log the diffs.
-
- Args:
- ref_url_set: Set of reference urls.
- url_set: Set of urls to compare to the reference.
- url_set_name: The set name for logging purposes.
- """
- assert type(ref_url_set) == set
- assert type(url_set) == set
- if ref_url_set == url_set:
- logging.info(' %d %s are matching.' % (len(ref_url_set), url_set_name))
- return
- missing_urls = ref_url_set.difference(url_set)
- unexpected_urls = url_set.difference(ref_url_set)
- logging.error(' %s are not matching (expected %d, had %d)' % \
- (url_set_name, len(ref_url_set), len(url_set)))
- logging.error(' List of %d missing resources:' % len(missing_urls))
- for url in sorted(missing_urls):
- logging.error('- ' + url)
- logging.error(' List of %d unexpected resources:' % len(unexpected_urls))
- for url in sorted(unexpected_urls):
- logging.error('+ ' + url)
-
-
-class RequestOutcome:
- All, ServedFromCache, NotServedFromCache, Post = range(4)
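- # Enum-like integer constants: All == 0, ServedFromCache == 1,
- # NotServedFromCache == 2 and Post == 3.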
-
-
-def ListUrlRequests(trace, request_kind):
- """Lists requested URLs from a trace.
-
- Args:
- trace: (LoadingTrace) loading trace.
- request_kind: RequestOutcome indicating the subset of requests to output.
-
- Returns:
- set([str])
- """
- urls = set()
- for request_event in _FilterOutDataAndIncompleteRequests(
- trace.request_track.GetEvents()):
- if (request_kind == RequestOutcome.ServedFromCache and
- request_event.from_disk_cache):
- urls.add(request_event.url)
- elif (request_kind == RequestOutcome.Post and
- request_event.method.upper().strip() == 'POST'):
- urls.add(request_event.url)
- elif (request_kind == RequestOutcome.NotServedFromCache and
- not request_event.from_disk_cache):
- urls.add(request_event.url)
- elif request_kind == RequestOutcome.All:
- urls.add(request_event.url)
- return urls
-
-
-def VerifyBenchmarkOutputDirectory(benchmark_setup_path,
- benchmark_output_directory_path):
- """Verifies that all run inside the run_output_directory worked as expected.
-
- Args:
- benchmark_setup_path: Path of the JSON of the benchmark setup.
- benchmark_output_directory_path: Path of the benchmark output directory to
- verify.
- """
- # TODO(gabadie): What's the best way of propagating errors happening in here?
- benchmark_setup = json.load(open(benchmark_setup_path))
- cache_whitelist = set(benchmark_setup['cache_whitelist'])
- original_requests = set(benchmark_setup['url_resources'])
- original_cached_requests = original_requests.intersection(cache_whitelist)
- original_uncached_requests = original_requests.difference(cache_whitelist)
- all_sent_url_requests = set()
-
- # Verify requests from traces.
- run_id = -1
- while True:
- run_id += 1
- run_path = os.path.join(benchmark_output_directory_path, str(run_id))
- if not os.path.isdir(run_path):
- break
- trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME)
- if not os.path.isfile(trace_path):
- logging.error('missing trace %s' % trace_path)
- continue
- trace = LoadingTrace.FromJsonFile(trace_path)
- logging.info('verifying %s from %s' % (trace.url, trace_path))
-
- effective_requests = ListUrlRequests(trace, RequestOutcome.All)
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post)
- effective_cached_requests = \
- ListUrlRequests(trace, RequestOutcome.ServedFromCache)
- effective_uncached_requests = \
- ListUrlRequests(trace, RequestOutcome.NotServedFromCache)
-
- missing_requests = original_requests.difference(effective_requests)
- unexpected_requests = effective_requests.difference(original_requests)
- expected_cached_requests = \
- original_cached_requests.difference(missing_requests)
- missing_cached_requests = \
- expected_cached_requests.difference(effective_cached_requests)
- expected_uncached_requests = original_uncached_requests.union(
- unexpected_requests).union(missing_cached_requests)
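- # In other words, a request is expected to hit the network if it was never
- # white-listed, if it did not appear in the original run at all, or if it
- # was expected to be served from the cache but was not.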
- all_sent_url_requests.update(effective_uncached_requests)
-
- # POST requests are known to be unable to use the cache.
- expected_cached_requests.difference_update(effective_post_requests)
- expected_uncached_requests.update(effective_post_requests)
-
- _PrintUrlSetComparison(original_requests, effective_requests,
- 'All resources')
- _PrintUrlSetComparison(set(), effective_post_requests,
- 'POST resources')
- _PrintUrlSetComparison(expected_cached_requests, effective_cached_requests,
- 'Cached resources')
- _PrintUrlSetComparison(expected_uncached_requests,
- effective_uncached_requests, 'Non cached resources')
-
- # Verify requests from WPR.
- wpr_log_path = os.path.join(
- benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME)
- logging.info('verifying requests from %s' % wpr_log_path)
- all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path)
- all_wpr_urls = set()
- unserved_wpr_urls = set()
- wpr_command_colliding_urls = set()
-
- for request in all_wpr_requests:
- if request.is_wpr_host:
- continue
- if urlparse(request.url).path.startswith('/web-page-replay'):
- wpr_command_colliding_urls.add(request.url)
- elif request.is_served is False:
- unserved_wpr_urls.add(request.url)
- all_wpr_urls.add(request.url)
-
- _PrintUrlSetComparison(set(), unserved_wpr_urls,
- 'Distinct unserved resources from WPR')
- _PrintUrlSetComparison(set(), wpr_command_colliding_urls,
- 'Distinct resources colliding to WPR commands')
- _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests,
- 'Distinct resource requests to WPR')
-
-
-def ReadSubresourceFromRunnerOutputDir(runner_output_dir):
- """Extracts a list of subresources in runner output directory.
-
- Args:
- runner_output_dir: Path of the runner's output directory.
-
- Returns:
- [URLs of sub-resources]
- """
- trace_path = os.path.join(
- runner_output_dir, '0', sandwich_runner.TRACE_FILENAME)
- trace = LoadingTrace.FromJsonFile(trace_path)
- url_set = set()
- for request_event in _FilterOutDataAndIncompleteRequests(
- trace.request_track.GetEvents()):
- url_set.add(request_event.url)
- logging.info('lists %s resources of %s from %s' % \
- (len(url_set), trace.url, trace_path))
- return list(url_set)
-
-
-def ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path):
- """Validates a cache archive content.
-
- Args:
- cache_build_trace_path: Path of the trace generated at cache build time.
- cache_archive_path: Cache archive's path to validate.
- """
- # TODO(gabadie): What's the best way of propagating errors happening in here?
- logging.info('lists cached urls from %s' % cache_archive_path)
- with common_util.TemporaryDirectory() as cache_directory:
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory)
- cache_keys = set(
- chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys())
- trace = LoadingTrace.FromJsonFile(cache_build_trace_path)
- effective_requests = ListUrlRequests(trace, RequestOutcome.All)
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post)
-
- upload_data_stream_cache_entry_keys = set()
- upload_data_stream_requests = set()
- for cache_entry_key in cache_keys:
- match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key)
- if not match:
- continue
- upload_data_stream_cache_entry_keys.add(cache_entry_key)
- upload_data_stream_requests.add(match.group('url'))
-
- expected_cached_requests = effective_requests.difference(
- effective_post_requests)
- effective_cache_keys = cache_keys.difference(
- upload_data_stream_cache_entry_keys)
-
- _PrintUrlSetComparison(effective_post_requests, upload_data_stream_requests,
- 'POST resources')
- _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys,
- 'Cached resources')
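
For context, a minimal sketch of how the helpers deleted here were typically chained together before this change folded them into sandwich_prefetch.py. The entry points and argument order come from the code above; the module name reflects the pre-refactor layout, and every path below is purely illustrative:

import sandwich_misc  # pre-refactor module; superseded by sandwich_prefetch.py

# Make every recorded resource cacheable for ten years.
sandwich_misc.PatchWpr('out/wpr_archive.wpr')

# Choose which sub-resources to keep in the cache for the NoState-Prefetch run.
whitelist = sandwich_misc.ExtractDiscoverableUrls(
    'out/cache_build/trace.json',
    sandwich_misc.HTML_PRELOAD_SCANNER_DISCOVERER)

# Restore pruned response headers and drop the compiled meta data streams.
sandwich_misc.PatchCacheArchive(
    'out/cache_build/cache.zip',
    'out/cache_build/trace.json',
    'out/cache_build/patched_cache.zip')

# Once the benchmark has run, sanity-check every run's trace and the WPR log.
sandwich_misc.VerifyBenchmarkOutputDirectory(
    'out/benchmark_setup.json',
    'out/benchmark_output')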