Index: tools/android/loading/sandwich_misc.py |
diff --git a/tools/android/loading/sandwich_misc.py b/tools/android/loading/sandwich_misc.py |
deleted file mode 100644 |
index 172f4e40b0552eeedd797d36d4f1f72c18a39932..0000000000000000000000000000000000000000 |
--- a/tools/android/loading/sandwich_misc.py |
+++ /dev/null |
@@ -1,402 +0,0 @@ |
-# Copyright 2016 The Chromium Authors. All rights reserved. |
-# Use of this source code is governed by a BSD-style license that can be |
-# found in the LICENSE file. |
- |
-import json
-import logging
-import os |
-import re |
-from urlparse import urlparse |
- |
-import chrome_cache |
-import common_util |
-from loading_trace import LoadingTrace |
-from prefetch_view import PrefetchSimulationView |
-from request_dependencies_lens import RequestDependencyLens |
-import sandwich_runner |
-import wpr_backend |
- |
- |
-# Does not prefetch anything.
-EMPTY_CACHE_DISCOVERER = 'empty-cache' |
- |
-# Prefetches everything to load fully from cache (impossible in practice). |
-FULL_CACHE_DISCOVERER = 'full-cache' |
- |
-# Prefetches only the main resource, after following its redirection chain.
-REDIRECTED_MAIN_DISCOVERER = 'redirected-main' |
- |
-# Prefetches all resources fetched from the main document and its
-# redirection chain.
-PARSER_DISCOVERER = 'parser' |
- |
-# Simulates the HTMLPreloadScanner on the main document and its redirection
-# chain.
-HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner' |
- |
-SUBRESOURCE_DISCOVERERS = set([ |
- EMPTY_CACHE_DISCOVERER, |
- FULL_CACHE_DISCOVERER, |
- REDIRECTED_MAIN_DISCOVERER, |
- PARSER_DISCOVERER, |
- HTML_PRELOAD_SCANNER_DISCOVERER |
-]) |
- |
-_UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
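-# Matches cache entry keys generated for requests carrying an upload data
-# stream (e.g. POST requests). Such a key looks like
-# '1/https://example.com/submit' (illustrative): the numeric prefix is the
-# upload data stream's identifier and group('url') captures the original URL.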
- |
- |
-def PatchWpr(wpr_archive_path): |
- """Patches a WPR archive to get all resources into the HTTP cache and avoid |
- invalidation and revalidations. |
- |
- Args: |
- wpr_archive_path: Path of the WPR archive to patch. |
- """ |
- # Sets the resources cache max-age to 10 years. |
- MAX_AGE = 10 * 365 * 24 * 60 * 60 |
- CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
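-  # With MAX_AGE = 10 * 365 * 24 * 60 * 60 seconds, CACHE_CONTROL resolves to
-  # 'public, max-age=315360000'.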
- |
- wpr_archive = wpr_backend.WprArchiveBackend(wpr_archive_path) |
- for url_entry in wpr_archive.ListUrlEntries(): |
- response_headers = url_entry.GetResponseHeadersDict() |
- if 'cache-control' in response_headers and \ |
- response_headers['cache-control'] == CACHE_CONTROL: |
- continue |
- logging.info('patching %s' % url_entry.url) |
- # TODO(gabadie): may need to patch Last-Modified and If-Modified-Since. |
- # TODO(gabadie): may need to delete ETag. |
- # TODO(gabadie): may need to take care of x-cache. |
- # |
-    # Override the cache-control header to set each resource's max-age to
-    # MAX_AGE.
-    #
-    # Important note: Some resources holding sensitive information might have
-    # cache-control set to no-store, which allows the resource to be cached in
-    # memory but not in the file system. NoState-Prefetch is going to take
-    # care of this case. But here, to simulate NoState-Prefetch, we have no
-    # choice but to save absolutely all cached resources on disk so that they
-    # survive killing Chrome for cache save, modification and push.
- url_entry.SetResponseHeader('cache-control', CACHE_CONTROL) |
- |
- # TODO(gabadie): May need to extend Vary blacklist (referer?) |
- # |
-    # All of these Vary and Pragma directives need to be removed from the
-    # response headers in order for Chrome to store a resource in the HTTP
-    # cache and not invalidate it.
- # |
- # Note: HttpVaryData::Init() in Chrome adds an implicit 'Vary: cookie' |
- # header to any redirect. |
- # TODO(gabadie): Find a way to work around this issue. |
- url_entry.RemoveResponseHeaderDirectives('vary', {'*', 'cookie'}) |
- url_entry.RemoveResponseHeaderDirectives('pragma', {'no-cache'}) |
- |
- wpr_archive.Persist() |
- |
- |
-def _FilterOutDataAndIncompleteRequests(requests): |
- for request in filter(lambda r: not r.IsDataRequest(), requests): |
-    # The protocol is only known once the response has been received. But the
-    # trace recording might have been stopped while some JavaScript-originated
-    # requests had not yet received any response.
- if request.protocol is None: |
- assert not request.HasReceivedResponse() |
- assert request.initiator['type'] == 'script' |
- continue |
- if request.protocol not in {'http/0.9', 'http/1.0', 'http/1.1'}: |
- raise RuntimeError('Unknown request protocol {}'.format(request.protocol)) |
- yield request |
- |
- |
-def PatchCacheArchive(cache_archive_path, loading_trace_path, |
- cache_archive_dest_path): |
- """Patch the cache archive. |
- |
- Note: This method update the raw response headers of cache entries' to store |
- the ones such as Set-Cookie that were pruned by the |
- net::HttpCacheTransaction, and remove the stream index 2 holding resource's |
- compile meta data. |
- |
- Args: |
- cache_archive_path: Input archive's path to patch. |
- loading_trace_path: Path of the loading trace that have recorded the cache |
- archive <cache_archive_path>. |
- cache_archive_dest_path: Archive destination's path. |
- """ |
- trace = LoadingTrace.FromJsonFile(loading_trace_path) |
- with common_util.TemporaryDirectory(prefix='sandwich_tmp') as tmp_path: |
- cache_path = os.path.join(tmp_path, 'cache') |
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_path) |
- cache_backend = chrome_cache.CacheBackend(cache_path, 'simple') |
- cache_entries = set(cache_backend.ListKeys()) |
- logging.info('Original cache size: %d bytes' % cache_backend.GetSize()) |
- for request in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
-      # For requests having an upload data stream, such as POST requests,
-      # net::HttpCache::GenerateCacheKey() prefixes the cache entry's key with
-      # the upload data stream's session-unique identifier.
-      #
-      # It is fine not to patch these requests: when Chrome is reopened, the
-      # entry cannot be reused because the upload data stream's identifier
-      # will be different.
-      #
-      # The fact that these entries are kept in the cache after closing Chrome
-      # properly, by closing the Chrome tab as ChromeControler.SetSlowDeath()
-      # does, is a known Chrome bug (crbug.com/610725).
- if request.url not in cache_entries: |
- continue |
-      # Chrome prunes Set-Cookie from response headers before storing them in
-      # the disk cache. Also, it adds an implicit "Vary: cookie" header to all
-      # redirect response headers. Sandwich manages the cache, but between
-      # recording the cache and benchmarking, the cookie jar is invalidated.
-      # This leads to the invalidation of all cacheable redirects.
- raw_headers = request.GetRawResponseHeaders() |
- cache_backend.UpdateRawResponseHeaders(request.url, raw_headers) |
- # NoState-Prefetch would only fetch the resources, but not parse them. |
- cache_backend.DeleteStreamForKey(request.url, 2) |
- chrome_cache.ZipDirectoryContent(cache_path, cache_archive_dest_path) |
- logging.info('Patched cache size: %d bytes' % cache_backend.GetSize()) |
- |
- |
-def ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer): |
- """Extracts discoverable resource urls from a loading trace according to a |
- sub-resource discoverer. |
- |
- Args: |
- loading_trace_path: The loading trace's path. |
- subresource_discoverer: The sub-resources discoverer that should white-list |
- the resources to keep in cache for the NoState-Prefetch benchmarks. |
- |
- Returns: |
- A set of urls. |
- """ |
- assert subresource_discoverer in SUBRESOURCE_DISCOVERERS, \ |
- 'unknown prefetch simulation {}'.format(subresource_discoverer) |
- |
- # Load trace and related infos. |
- logging.info('loading %s' % loading_trace_path) |
- trace = LoadingTrace.FromJsonFile(loading_trace_path) |
- dependencies_lens = RequestDependencyLens(trace) |
- first_resource_request = trace.request_track.GetFirstResourceRequest() |
- |
- # Build the list of discovered requests according to the desired simulation. |
- discovered_requests = [] |
- if subresource_discoverer == EMPTY_CACHE_DISCOVERER: |
- pass |
- elif subresource_discoverer == FULL_CACHE_DISCOVERER: |
- discovered_requests = trace.request_track.GetEvents() |
- elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: |
- discovered_requests = \ |
- [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] |
- elif subresource_discoverer == PARSER_DISCOVERER: |
- discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( |
- first_resource_request, dependencies_lens) |
- elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: |
- discovered_requests = PrefetchSimulationView.PreloadedRequests( |
- first_resource_request, dependencies_lens, trace) |
- else: |
- assert False |
- |
- whitelisted_urls = set() |
- logging.info('white-listing %s' % first_resource_request.url) |
- for request in _FilterOutDataAndIncompleteRequests(discovered_requests): |
- logging.info('white-listing %s' % request.url) |
- whitelisted_urls.add(request.url) |
- return whitelisted_urls |
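-
-# Illustrative usage sketch (the trace path below is hypothetical):
-#   whitelisted_urls = ExtractDiscoverableUrls(
-#       '/tmp/trace.json', HTML_PRELOAD_SCANNER_DISCOVERER)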
- |
- |
-def _PrintUrlSetComparison(ref_url_set, url_set, url_set_name): |
- """Compare URL sets and log the diffs. |
- |
- Args: |
- ref_url_set: Set of reference urls. |
- url_set: Set of urls to compare to the reference. |
- url_set_name: The set name for logging purposes. |
- """ |
- assert type(ref_url_set) == set |
- assert type(url_set) == set |
- if ref_url_set == url_set: |
- logging.info(' %d %s are matching.' % (len(ref_url_set), url_set_name)) |
- return |
- missing_urls = ref_url_set.difference(url_set) |
- unexpected_urls = url_set.difference(ref_url_set) |
- logging.error(' %s are not matching (expected %d, had %d)' % \ |
- (url_set_name, len(ref_url_set), len(url_set))) |
- logging.error(' List of %d missing resources:' % len(missing_urls)) |
- for url in sorted(missing_urls): |
- logging.error('- ' + url) |
- logging.error(' List of %d unexpected resources:' % len(unexpected_urls)) |
- for url in sorted(unexpected_urls): |
- logging.error('+ ' + url) |
- |
- |
-class RequestOutcome(object):
- All, ServedFromCache, NotServedFromCache, Post = range(4) |
- |
- |
-def ListUrlRequests(trace, request_kind): |
- """Lists requested URLs from a trace. |
- |
- Args: |
- trace: (LoadingTrace) loading trace. |
- request_kind: RequestOutcome indicating the subset of requests to output. |
- |
- Returns: |
- set([str]) |
- """ |
- urls = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- if (request_kind == RequestOutcome.ServedFromCache and |
- request_event.from_disk_cache): |
- urls.add(request_event.url) |
- elif (request_kind == RequestOutcome.Post and |
- request_event.method.upper().strip() == 'POST'): |
- urls.add(request_event.url) |
- elif (request_kind == RequestOutcome.NotServedFromCache and |
- not request_event.from_disk_cache): |
- urls.add(request_event.url) |
- elif request_kind == RequestOutcome.All: |
- urls.add(request_event.url) |
- return urls |
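-
-# Example (illustrative): ListUrlRequests(trace, RequestOutcome.Post) returns
-# the set of URLs of all POST requests observed in the trace.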
- |
- |
-def VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
- benchmark_output_directory_path): |
- """Verifies that all run inside the run_output_directory worked as expected. |
- |
- Args: |
- benchmark_setup_path: Path of the JSON of the benchmark setup. |
- benchmark_output_directory_path: Path of the benchmark output directory to |
- verify. |
- """ |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
-  with open(benchmark_setup_path) as benchmark_setup_file:
-    benchmark_setup = json.load(benchmark_setup_file)
- cache_whitelist = set(benchmark_setup['cache_whitelist']) |
- original_requests = set(benchmark_setup['url_resources']) |
- original_cached_requests = original_requests.intersection(cache_whitelist) |
- original_uncached_requests = original_requests.difference(cache_whitelist) |
- all_sent_url_requests = set() |
- |
- # Verify requests from traces. |
- run_id = -1 |
- while True: |
- run_id += 1 |
- run_path = os.path.join(benchmark_output_directory_path, str(run_id)) |
- if not os.path.isdir(run_path): |
- break |
- trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) |
- if not os.path.isfile(trace_path): |
- logging.error('missing trace %s' % trace_path) |
- continue |
- trace = LoadingTrace.FromJsonFile(trace_path) |
- logging.info('verifying %s from %s' % (trace.url, trace_path)) |
- |
- effective_requests = ListUrlRequests(trace, RequestOutcome.All) |
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post) |
- effective_cached_requests = \ |
- ListUrlRequests(trace, RequestOutcome.ServedFromCache) |
- effective_uncached_requests = \ |
- ListUrlRequests(trace, RequestOutcome.NotServedFromCache) |
- |
- missing_requests = original_requests.difference(effective_requests) |
- unexpected_requests = effective_requests.difference(original_requests) |
- expected_cached_requests = \ |
- original_cached_requests.difference(missing_requests) |
- missing_cached_requests = \ |
- expected_cached_requests.difference(effective_cached_requests) |
- expected_uncached_requests = original_uncached_requests.union( |
- unexpected_requests).union(missing_cached_requests) |
- all_sent_url_requests.update(effective_uncached_requests) |
- |
- # POST requests are known to be unable to use the cache. |
- expected_cached_requests.difference_update(effective_post_requests) |
- expected_uncached_requests.update(effective_post_requests) |
- |
- _PrintUrlSetComparison(original_requests, effective_requests, |
- 'All resources') |
- _PrintUrlSetComparison(set(), effective_post_requests, |
- 'POST resources') |
- _PrintUrlSetComparison(expected_cached_requests, effective_cached_requests, |
- 'Cached resources') |
- _PrintUrlSetComparison(expected_uncached_requests, |
- effective_uncached_requests, 'Non cached resources') |
- |
- # Verify requests from WPR. |
- wpr_log_path = os.path.join( |
- benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME) |
- logging.info('verifying requests from %s' % wpr_log_path) |
- all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
- all_wpr_urls = set() |
- unserved_wpr_urls = set() |
- wpr_command_colliding_urls = set() |
- |
- for request in all_wpr_requests: |
- if request.is_wpr_host: |
- continue |
- if urlparse(request.url).path.startswith('/web-page-replay'): |
- wpr_command_colliding_urls.add(request.url) |
- elif request.is_served is False: |
- unserved_wpr_urls.add(request.url) |
- all_wpr_urls.add(request.url) |
- |
- _PrintUrlSetComparison(set(), unserved_wpr_urls, |
- 'Distinct unserved resources from WPR') |
- _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
- 'Distinct resources colliding to WPR commands') |
- _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests, |
- 'Distinct resource requests to WPR') |
- |
- |
-def ReadSubresourceFromRunnerOutputDir(runner_output_dir): |
- """Extracts a list of subresources in runner output directory. |
- |
- Args: |
- runner_output_dir: Path of the runner's output directory. |
- |
- Returns: |
-    A list of subresource URLs.
- """ |
- trace_path = os.path.join( |
- runner_output_dir, '0', sandwich_runner.TRACE_FILENAME) |
- trace = LoadingTrace.FromJsonFile(trace_path) |
- url_set = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- url_set.add(request_event.url) |
-  logging.info('lists %d resources of %s from %s' % \
-      (len(url_set), trace.url, trace_path))
-  return list(url_set)
- |
- |
-def ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
- """Validates a cache archive content. |
- |
- Args: |
-    cache_build_trace_path: Path of the trace generated at cache build time.
- cache_archive_path: Cache archive's path to validate. |
- """ |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
- logging.info('lists cached urls from %s' % cache_archive_path) |
- with common_util.TemporaryDirectory() as cache_directory: |
- chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory) |
- cache_keys = set( |
- chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys()) |
- trace = LoadingTrace.FromJsonFile(cache_build_trace_path) |
- effective_requests = ListUrlRequests(trace, RequestOutcome.All) |
- effective_post_requests = ListUrlRequests(trace, RequestOutcome.Post) |
- |
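-  # Cache entry keys generated for requests with an upload data stream (e.g.
-  # POST requests) carry a numeric prefix; collect them separately so they can
-  # be compared against the POST requests seen in the trace.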
- upload_data_stream_cache_entry_keys = set() |
- upload_data_stream_requests = set() |
- for cache_entry_key in cache_keys: |
- match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match(cache_entry_key) |
- if not match: |
- continue |
- upload_data_stream_cache_entry_keys.add(cache_entry_key) |
- upload_data_stream_requests.add(match.group('url')) |
- |
- expected_cached_requests = effective_requests.difference( |
- effective_post_requests) |
- effective_cache_keys = cache_keys.difference( |
- upload_data_stream_cache_entry_keys) |
- |
- _PrintUrlSetComparison(effective_post_requests, upload_data_stream_requests, |
- 'POST resources') |
- _PrintUrlSetComparison(expected_cached_requests, effective_cache_keys, |
- 'Cached resources') |