Index: tools/android/loading/sandwich_prefetch.py |
diff --git a/tools/android/loading/sandwich_prefetch.py b/tools/android/loading/sandwich_prefetch.py |
index 173d0bffbb1d820583bc2f808fe2b453ee634c7f..ab093c2c280cbf1731a1ce038b200124e2968bac 100644 |
--- a/tools/android/loading/sandwich_prefetch.py |
+++ b/tools/android/loading/sandwich_prefetch.py |
@@ -2,6 +2,19 @@ |
# Use of this source code is governed by a BSD-style license that can be |
# found in the LICENSE file. |
+""" |
+Implements a task builder for benchmarking the effects of NoState Prefetch. |
+Notable steps of the task pipeline: |
+ * Save a WPR archive |
+ * Process the WPR archive to make all resources cacheable |
+ * Process the cache archive to patch response headers back to their original |
+   values. |
+ * Find out which resources are discoverable by NoState Prefetch |
+   (HTMLPreloadScanner) |
+ * Load pages with an empty, full, or prefetched cache |
+ * Extract the most important metrics to a CSV |
+""" |
+ |
import csv |
import logging |
import json |
@@ -198,10 +211,10 @@ def _ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer): |
assert False |
whitelisted_urls = set() |
- logging.info('white-listing %s' % first_resource_request.url) |
for request in _FilterOutDataAndIncompleteRequests(discovered_requests): |
- logging.info('white-listing %s' % request.url) |
+ logging.debug('white-listing %s', request.url) |
whitelisted_urls.add(request.url) |
+ logging.info('number of white-listed resources: %d', len(whitelisted_urls)) |
return whitelisted_urls |
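The white-listing messages above now pass the URL as a logging argument instead of pre-formatting it with '%'. A minimal sketch of the difference, using only the standard logging module (the URL is made up for illustration):

import logging

url = 'http://example.com/script.js'  # Hypothetical URL for illustration.
# Eager: the message string is built even if DEBUG records are discarded.
logging.debug('white-listing %s' % url)
# Lazy: formatting is deferred until a handler actually emits the record.
logging.debug('white-listing %s', url)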
@@ -261,37 +274,32 @@ def _ListUrlRequests(trace, request_kind): |
return urls |
-def _VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
- benchmark_output_directory_path): |
- """Verifies that all run inside the run_output_directory worked as expected. |
- |
- Args: |
- benchmark_setup_path: Path of the JSON of the benchmark setup. |
- benchmark_output_directory_path: Path of the benchmark output directory to |
- verify. |
+class _RunOutputVerifier(object): |
+  """Verifies a benchmark run using the traces and the WPR log stored in the |
+  runner output directory. |
""" |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
- benchmark_setup = json.load(open(benchmark_setup_path)) |
- cache_whitelist = set(benchmark_setup['cache_whitelist']) |
- original_requests = set(benchmark_setup['url_resources']) |
- original_cached_requests = original_requests.intersection(cache_whitelist) |
- original_uncached_requests = original_requests.difference(cache_whitelist) |
- all_sent_url_requests = set() |
- |
- # Verify requests from traces. |
- run_id = -1 |
- while True: |
- run_id += 1 |
- run_path = os.path.join(benchmark_output_directory_path, str(run_id)) |
- if not os.path.isdir(run_path): |
- break |
- trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) |
- if not os.path.isfile(trace_path): |
- logging.error('missing trace %s' % trace_path) |
- continue |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- logging.info('verifying %s from %s' % (trace.url, trace_path)) |
+ def __init__(self, cache_validation_result, benchmark_setup): |
+ """Constructor. |
+ |
+ Args: |
+ cache_validation_result: JSON of the cache validation task. |
+ benchmark_setup: JSON of the benchmark setup. |
+ """ |
+ self._cache_whitelist = set(benchmark_setup['cache_whitelist']) |
+ self._original_requests = set(cache_validation_result['effective_requests']) |
+ self._original_post_requests = set( |
+ cache_validation_result['effective_post_requests']) |
+ self._original_cached_requests = self._original_requests.intersection( |
+ self._cache_whitelist) |
+ self._original_uncached_requests = self._original_requests.difference( |
+ self._cache_whitelist) |
+ self._all_sent_url_requests = set() |
+ |
+ def VerifyTrace(self, trace): |
+ """Verifies a trace with the cache validation result and the benchmark |
+ setup. |
+ """ |
effective_requests = _ListUrlRequests(trace, _RequestOutcome.All) |
effective_post_requests = _ListUrlRequests(trace, _RequestOutcome.Post) |
effective_cached_requests = \ |
@@ -299,74 +307,49 @@ def _VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
effective_uncached_requests = \ |
_ListUrlRequests(trace, _RequestOutcome.NotServedFromCache) |
- missing_requests = original_requests.difference(effective_requests) |
- unexpected_requests = effective_requests.difference(original_requests) |
+ missing_requests = self._original_requests.difference(effective_requests) |
+ unexpected_requests = effective_requests.difference(self._original_requests) |
expected_cached_requests = \ |
- original_cached_requests.difference(missing_requests) |
- missing_cached_requests = \ |
- expected_cached_requests.difference(effective_cached_requests) |
- expected_uncached_requests = original_uncached_requests.union( |
- unexpected_requests).union(missing_cached_requests) |
- all_sent_url_requests.update(effective_uncached_requests) |
+ self._original_cached_requests.difference(missing_requests) |
+ expected_uncached_requests = self._original_uncached_requests.union( |
+ unexpected_requests).difference(missing_requests) |
# POST requests are known to be unable to use the cache. |
expected_cached_requests.difference_update(effective_post_requests) |
expected_uncached_requests.update(effective_post_requests) |
- _PrintUrlSetComparison(original_requests, effective_requests, |
+ _PrintUrlSetComparison(self._original_requests, effective_requests, |
'All resources') |
- _PrintUrlSetComparison(set(), effective_post_requests, |
- 'POST resources') |
+ _PrintUrlSetComparison(set(), effective_post_requests, 'POST resources') |
_PrintUrlSetComparison(expected_cached_requests, effective_cached_requests, |
'Cached resources') |
_PrintUrlSetComparison(expected_uncached_requests, |
effective_uncached_requests, 'Non cached resources') |
- # Verify requests from WPR. |
- wpr_log_path = os.path.join( |
- benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME) |
- logging.info('verifying requests from %s' % wpr_log_path) |
- all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
- all_wpr_urls = set() |
- unserved_wpr_urls = set() |
- wpr_command_colliding_urls = set() |
- |
- for request in all_wpr_requests: |
- if request.is_wpr_host: |
- continue |
- if urlparse(request.url).path.startswith('/web-page-replay'): |
- wpr_command_colliding_urls.add(request.url) |
- elif request.is_served is False: |
- unserved_wpr_urls.add(request.url) |
- all_wpr_urls.add(request.url) |
- |
- _PrintUrlSetComparison(set(), unserved_wpr_urls, |
- 'Distinct unserved resources from WPR') |
- _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
- 'Distinct resources colliding to WPR commands') |
- _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests, |
- 'Distinct resource requests to WPR') |
- |
+ self._all_sent_url_requests.update(effective_uncached_requests) |
-def _ReadSubresourceFromRunnerOutputDir(runner_output_dir): |
- """Extracts a list of subresources in runner output directory. |
+ def VerifyWprLog(self, wpr_log_path): |
+    """Verifies the WPR log against the previously verified traces.""" |
+ all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
+ all_wpr_urls = set() |
+ unserved_wpr_urls = set() |
+ wpr_command_colliding_urls = set() |
- Args: |
- runner_output_dir: Path of the runner's output directory. |
+ for request in all_wpr_requests: |
+ if request.is_wpr_host: |
+ continue |
+ if urlparse(request.url).path.startswith('/web-page-replay'): |
+ wpr_command_colliding_urls.add(request.url) |
+ elif request.is_served is False: |
+ unserved_wpr_urls.add(request.url) |
+ all_wpr_urls.add(request.url) |
- Returns: |
- [URLs of sub-resources] |
- """ |
- trace_path = os.path.join( |
- runner_output_dir, '0', sandwich_runner.TRACE_FILENAME) |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- url_set = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- url_set.add(request_event.url) |
- logging.info('lists %s resources of %s from %s' % \ |
- (len(url_set), trace.url, trace_path)) |
- return [url for url in url_set] |
+ _PrintUrlSetComparison(set(), unserved_wpr_urls, |
+ 'Distinct unserved resources from WPR') |
+ _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
+ 'Distinct resources colliding to WPR commands') |
+ _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
+ 'Distinct resource requests to WPR') |
def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
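The verification logic removed above is reassembled into _RunOutputVerifier. A rough sketch of the intended calling sequence (it mirrors _ProcessRunOutputDir later in this patch; benchmark_setup_path, validation_json_path and run_output_dir are assumed inputs provided by the surrounding task code):

import json
import os

benchmark_setup = json.load(open(benchmark_setup_path))
cache_validation_result = json.load(open(validation_json_path))
verifier = _RunOutputVerifier(cache_validation_result, benchmark_setup)
for _, repeat_dir in sandwich_runner.WalkRepeatedRuns(run_output_dir):
  trace = loading_trace.LoadingTrace.FromJsonFile(
      os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME))
  verifier.VerifyTrace(trace)  # Per-repeat checks against the setup.
# The WPR log is verified once, after all traces have been seen.
verifier.VerifyWprLog(
    os.path.join(run_output_dir, sandwich_runner.WPR_LOG_FILENAME))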
@@ -375,6 +358,14 @@ def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
Args: |
cache_build_trace_path: Path of the generated trace at the cache build time. |
cache_archive_path: Cache archive's path to validate. |
+ |
+ Returns: |
+ { |
+ 'effective_requests': [URLs of all requests], |
+ 'effective_post_requests': [URLs of POST requests], |
+ 'expected_cached_resources': [URLs of resources expected to be cached], |
+    'successfully_cached_resources': [URLs of cached sub-resources] |
+ } |
""" |
# TODO(gabadie): What's the best way of propagating errors happening in here? |
logging.info('lists cached urls from %s' % cache_archive_path) |
@@ -405,6 +396,69 @@ def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
_PrintUrlSetComparison(expected_cached_requests, effective_cache_keys, |
'Cached resources') |
+ return { |
+ 'effective_requests': [url for url in effective_requests], |
+ 'effective_post_requests': [url for url in effective_post_requests], |
+ 'expected_cached_resources': [url for url in expected_cached_requests], |
+ 'successfully_cached_resources': [url for url in effective_cache_keys] |
+ } |
+ |
+ |
+def _ProcessRunOutputDir( |
+ cache_validation_result, benchmark_setup, runner_output_dir): |
+  """Processes the benchmark's run output directory. |
+ |
+ Args: |
+ cache_validation_result: Same as for _RunOutputVerifier |
+ benchmark_setup: Same as for _RunOutputVerifier |
+ runner_output_dir: Same as for SandwichRunner.output_dir |
+ |
+ Returns: |
+    List of dictionaries (one per repeated run), sorted by repeat_id. |
+ """ |
+ run_metrics_list = [] |
+ run_output_verifier = _RunOutputVerifier( |
+ cache_validation_result, benchmark_setup) |
+ for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns( |
+ runner_output_dir): |
+ trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME) |
+ |
+ logging.info('loading trace: %s', trace_path) |
+ trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
+ |
+ logging.info('verifying trace: %s', trace_path) |
+ run_output_verifier.VerifyTrace(trace) |
+ |
+ logging.info('extracting metrics from trace: %s', trace_path) |
+ run_metrics = { |
+ 'url': trace.url, |
+ 'repeat_id': repeat_id, |
+ 'subresource_discoverer': benchmark_setup['subresource_discoverer'], |
+ 'cache_recording.subresource_count': |
+ len(cache_validation_result['effective_requests']), |
+ 'cache_recording.cached_subresource_count_theoretic': |
+ len(cache_validation_result['successfully_cached_resources']), |
+ 'cache_recording.cached_subresource_count': |
+ len(cache_validation_result['expected_cached_resources']), |
+ 'benchmark.subresource_count': len(_ListUrlRequests( |
+ trace, _RequestOutcome.All)), |
+ 'benchmark.served_from_cache_count_theoretic': |
+ len(benchmark_setup['cache_whitelist']), |
+ 'benchmark.served_from_cache_count': len(_ListUrlRequests( |
+ trace, _RequestOutcome.ServedFromCache)), |
+ } |
+ run_metrics.update( |
+ sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory( |
+ repeat_dir, trace)) |
+ run_metrics_list.append(run_metrics) |
+ run_metrics_list.sort(key=lambda e: e['repeat_id']) |
+ |
+ wpr_log_path = os.path.join( |
+ runner_output_dir, sandwich_runner.WPR_LOG_FILENAME) |
+ logging.info('verifying wpr log: %s', wpr_log_path) |
+ run_output_verifier.VerifyWprLog(wpr_log_path) |
+ return run_metrics_list |
+ |
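One property of the metrics produced above, useful when sanity-checking the output (the variables below are assumed to be the same ones passed to the function): the 'cache_recording.*' values come from the cache validation result and are therefore identical across repeats, while the 'benchmark.*' values are recomputed from each trace.

run_metrics_list = _ProcessRunOutputDir(
    cache_validation_result, benchmark_setup, runner_output_dir)
# Constant per site: derived from the cache validation JSON.
assert len(set(m['cache_recording.subresource_count']
               for m in run_metrics_list)) <= 1
# Varies per repeat: recomputed from each run's trace.
served_counts = [m['benchmark.served_from_cache_count']
                 for m in run_metrics_list]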
class PrefetchBenchmarkBuilder(task_manager.Builder): |
"""A builder for a graph of tasks for NoState-Prefetch emulated benchmarks.""" |
@@ -415,10 +469,10 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
common_builder.output_subdirectory) |
self._common_builder = common_builder |
- self._patched_wpr_task = None |
- self._reference_cache_task = None |
+ self._wpr_archive_path = None |
+ self._cache_path = None |
self._trace_from_grabbing_reference_cache = None |
- self._subresources_for_urls_task = None |
+ self._cache_validation_task = None |
self._PopulateCommonPipelines() |
def _PopulateCommonPipelines(self): |
@@ -428,13 +482,11 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
subresources (urls-resources.json). |
Here is the full dependency tree for the returned task: |
- common/patched-cache-validation.log |
+ common/patched-cache-validation.json |
depends on: common/patched-cache.zip |
depends on: common/original-cache.zip |
depends on: common/webpages-patched.wpr |
depends on: common/webpages.wpr |
- depends on: common/urls-resources.json |
- depends on: common/original-cache.zip |
""" |
@self.RegisterTask('common/webpages-patched.wpr', |
dependencies=[self._common_builder.original_wpr_task]) |
@@ -461,29 +513,18 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
_PatchCacheArchive(BuildOriginalCache.path, |
original_cache_trace_path, BuildPatchedCache.path) |
- @self.RegisterTask('common/subresources-for-urls.json', |
- [BuildOriginalCache]) |
- def ListUrlsResources(): |
- url_resources = _ReadSubresourceFromRunnerOutputDir( |
- BuildOriginalCache.run_path) |
- with open(ListUrlsResources.path, 'w') as output: |
- json.dump(url_resources, output) |
- |
- @self.RegisterTask('common/patched-cache-validation.log', |
+ @self.RegisterTask('common/patched-cache-validation.json', |
[BuildPatchedCache]) |
def ValidatePatchedCache(): |
- handler = logging.FileHandler(ValidatePatchedCache.path) |
- logging.getLogger().addHandler(handler) |
- try: |
- _ValidateCacheArchiveContent( |
- original_cache_trace_path, BuildPatchedCache.path) |
- finally: |
- logging.getLogger().removeHandler(handler) |
- |
- self._patched_wpr_task = BuildPatchedWpr |
+ cache_validation_result = _ValidateCacheArchiveContent( |
+ original_cache_trace_path, BuildPatchedCache.path) |
+ with open(ValidatePatchedCache.path, 'w') as output: |
+ json.dump(cache_validation_result, output) |
+ |
+ self._wpr_archive_path = BuildPatchedWpr.path |
self._trace_from_grabbing_reference_cache = original_cache_trace_path |
- self._reference_cache_task = BuildPatchedCache |
- self._subresources_for_urls_task = ListUrlsResources |
+ self._cache_path = BuildPatchedCache.path |
+ self._cache_validation_task = ValidatePatchedCache |
self._common_builder.default_final_tasks.append(ValidatePatchedCache) |
@@ -503,21 +544,19 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
<transformer_list_name>/<subresource_discoverer>-metrics.csv |
depends on: <transformer_list_name>/<subresource_discoverer>-run/ |
depends on: common/<subresource_discoverer>-cache.zip |
- depends on: some tasks saved by PopulateCommonPipelines() |
depends on: common/<subresource_discoverer>-setup.json |
- depends on: some tasks saved by PopulateCommonPipelines() |
+ depends on: common/patched-cache-validation.json |
""" |
additional_column_names = [ |
'url', |
'repeat_id', |
'subresource_discoverer', |
- 'subresource_count', |
- # The amount of subresources detected at SetupBenchmark step. |
- 'subresource_count_theoretic', |
- # Amount of subresources for caching as suggested by the subresource |
- # discoverer. |
- 'cached_subresource_count_theoretic', |
- 'cached_subresource_count'] |
+ 'cache_recording.subresource_count', |
+ 'cache_recording.cached_subresource_count_theoretic', |
+ 'cache_recording.cached_subresource_count', |
+ 'benchmark.subresource_count', |
+ 'benchmark.served_from_cache_count_theoretic', |
+ 'benchmark.served_from_cache_count'] |
assert subresource_discoverer in SUBRESOURCE_DISCOVERERS |
assert 'common' not in SUBRESOURCE_DISCOVERERS |
@@ -525,28 +564,25 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
task_prefix = os.path.join(transformer_list_name, subresource_discoverer) |
@self.RegisterTask(shared_task_prefix + '-setup.json', merge=True, |
- dependencies=[self._subresources_for_urls_task]) |
+ dependencies=[self._cache_validation_task]) |
def SetupBenchmark(): |
whitelisted_urls = _ExtractDiscoverableUrls( |
self._trace_from_grabbing_reference_cache, subresource_discoverer) |
- url_resources = json.load(open(self._subresources_for_urls_task.path)) |
common_util.EnsureParentDirectoryExists(SetupBenchmark.path) |
with open(SetupBenchmark.path, 'w') as output: |
json.dump({ |
'cache_whitelist': [url for url in whitelisted_urls], |
'subresource_discoverer': subresource_discoverer, |
- 'url_resources': url_resources, |
}, output) |
@self.RegisterTask(shared_task_prefix + '-cache.zip', merge=True, |
- dependencies=[ |
- SetupBenchmark, self._reference_cache_task]) |
+ dependencies=[SetupBenchmark]) |
def BuildBenchmarkCacheArchive(): |
- setup = json.load(open(SetupBenchmark.path)) |
+ benchmark_setup = json.load(open(SetupBenchmark.path)) |
chrome_cache.ApplyUrlWhitelistToCacheArchive( |
- cache_archive_path=self._reference_cache_task.path, |
- whitelisted_urls=setup['cache_whitelist'], |
+ cache_archive_path=self._cache_path, |
+ whitelisted_urls=benchmark_setup['cache_whitelist'], |
output_cache_archive_path=BuildBenchmarkCacheArchive.path) |
@self.RegisterTask(task_prefix + '-run/', |
@@ -555,7 +591,7 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
runner = self._common_builder.CreateSandwichRunner() |
for transformer in transformer_list: |
transformer(runner) |
- runner.wpr_archive_path = self._patched_wpr_task.path |
+ runner.wpr_archive_path = self._wpr_archive_path |
runner.wpr_out_log_path = os.path.join( |
RunBenchmark.path, sandwich_runner.WPR_LOG_FILENAME) |
runner.cache_archive_path = BuildBenchmarkCacheArchive.path |
@@ -565,42 +601,18 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
@self.RegisterTask(task_prefix + '-metrics.csv', |
dependencies=[RunBenchmark]) |
- def ExtractMetrics(): |
- # TODO(gabadie): Performance improvement: load each trace only once and |
- # use it for validation and extraction of metrics later. |
- _VerifyBenchmarkOutputDirectory(SetupBenchmark.path, RunBenchmark.path) |
- |
+ def ProcessRunOutputDir(): |
benchmark_setup = json.load(open(SetupBenchmark.path)) |
- run_metrics_list = [] |
- for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns( |
- RunBenchmark.path): |
- trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME) |
- logging.info('processing trace: %s', trace_path) |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- run_metrics = { |
- 'url': trace.url, |
- 'repeat_id': repeat_id, |
- 'subresource_discoverer': benchmark_setup['subresource_discoverer'], |
- 'subresource_count': len(_ListUrlRequests( |
- trace, _RequestOutcome.All)), |
- 'subresource_count_theoretic': |
- len(benchmark_setup['url_resources']), |
- 'cached_subresource_count': len(_ListUrlRequests( |
- trace, _RequestOutcome.ServedFromCache)), |
- 'cached_subresource_count_theoretic': |
- len(benchmark_setup['cache_whitelist']), |
- } |
- run_metrics.update( |
- sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory( |
- repeat_dir, trace)) |
- run_metrics_list.append(run_metrics) |
- |
- run_metrics_list.sort(key=lambda e: e['repeat_id']) |
- with open(ExtractMetrics.path, 'w') as csv_file: |
+ cache_validation_result = json.load( |
+ open(self._cache_validation_task.path)) |
+ |
+ run_metrics_list = _ProcessRunOutputDir( |
+ cache_validation_result, benchmark_setup, RunBenchmark.path) |
+ with open(ProcessRunOutputDir.path, 'w') as csv_file: |
writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
writer.writeheader() |
for trace_metrics in run_metrics_list: |
writer.writerow(trace_metrics) |
- self._common_builder.default_final_tasks.append(ExtractMetrics) |
+ self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |
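Since the final artifact per discoverer is now the metrics CSV written above, here is a small, hypothetical post-processing sketch using only the standard library and the column names introduced in this patch (the file name is illustrative; the real path is <transformer_list_name>/<subresource_discoverer>-metrics.csv):

import csv

with open('metrics.csv') as csv_file:  # Hypothetical path.
  rows = list(csv.DictReader(csv_file))
served = [int(r['benchmark.served_from_cache_count']) for r in rows]
total = [int(r['benchmark.subresource_count']) for r in rows]
# Fraction of subresources served from the cache, per repeat.
cache_hit_ratios = [float(s) / t for s, t in zip(served, total) if t]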