Index: tools/android/loading/sandwich_prefetch.py |
diff --git a/tools/android/loading/sandwich_prefetch.py b/tools/android/loading/sandwich_prefetch.py |
index 173d0bffbb1d820583bc2f808fe2b453ee634c7f..ab093c2c280cbf1731a1ce038b200124e2968bac 100644 |
--- a/tools/android/loading/sandwich_prefetch.py |
+++ b/tools/android/loading/sandwich_prefetch.py |
@@ -2,6 +2,19 @@ |
# Use of this source code is governed by a BSD-style license that can be |
# found in the LICENSE file. |
+""" |
+Implements a task builder for benchmarking the effects of NoState Prefetch. |
+Notable steps of the task pipeline: |
+ * Save a WPR archive |
+ * Process the WPR archive to make all resources cacheable |
+ * Process the cache archive to patch response headers back to their original |
+   values. |
+ * Find out which resources are discoverable by NoState Prefetch |
+   (HTMLPreloadScanner) |
+ * Load pages with an empty, full, or prefetched cache |
+ * Extract the most important metrics to a CSV |
+""" |
+ |
import csv |
import logging |
import json |
@@ -198,10 +211,10 @@ def _ExtractDiscoverableUrls(loading_trace_path, subresource_discoverer): |
assert False |
whitelisted_urls = set() |
- logging.info('white-listing %s' % first_resource_request.url) |
for request in _FilterOutDataAndIncompleteRequests(discovered_requests): |
- logging.info('white-listing %s' % request.url) |
+ logging.debug('white-listing %s', request.url) |
whitelisted_urls.add(request.url) |
+ logging.info('number of white-listed resources: %d', len(whitelisted_urls)) |
return whitelisted_urls |
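The white-listing messages above now pass the URL as a logging argument instead of pre-formatting it with '%'. A minimal sketch of the difference, using only the standard logging module (the URL is made up for illustration):

import logging

url = 'http://example.com/script.js'  # Hypothetical URL for illustration.
# Eager: the message string is built even if DEBUG records are discarded.
logging.debug('white-listing %s' % url)
# Lazy: formatting is deferred until a handler actually emits the record.
logging.debug('white-listing %s', url)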
@@ -261,37 +274,32 @@ def _ListUrlRequests(trace, request_kind): |
return urls |
-def _VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
- benchmark_output_directory_path): |
- """Verifies that all run inside the run_output_directory worked as expected. |
- |
- Args: |
- benchmark_setup_path: Path of the JSON of the benchmark setup. |
- benchmark_output_directory_path: Path of the benchmark output directory to |
- verify. |
+class _RunOutputVerifier(object): |
+  """Verifies a benchmark run using the traces and the WPR log stored in the |
+  runner output directory. |
""" |
- # TODO(gabadie): What's the best way of propagating errors happening in here? |
- benchmark_setup = json.load(open(benchmark_setup_path)) |
- cache_whitelist = set(benchmark_setup['cache_whitelist']) |
- original_requests = set(benchmark_setup['url_resources']) |
- original_cached_requests = original_requests.intersection(cache_whitelist) |
- original_uncached_requests = original_requests.difference(cache_whitelist) |
- all_sent_url_requests = set() |
- |
- # Verify requests from traces. |
- run_id = -1 |
- while True: |
- run_id += 1 |
- run_path = os.path.join(benchmark_output_directory_path, str(run_id)) |
- if not os.path.isdir(run_path): |
- break |
- trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) |
- if not os.path.isfile(trace_path): |
- logging.error('missing trace %s' % trace_path) |
- continue |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- logging.info('verifying %s from %s' % (trace.url, trace_path)) |
+ def __init__(self, cache_validation_result, benchmark_setup): |
+ """Constructor. |
+ |
+ Args: |
+ cache_validation_result: JSON of the cache validation task. |
+ benchmark_setup: JSON of the benchmark setup. |
+ """ |
+ self._cache_whitelist = set(benchmark_setup['cache_whitelist']) |
+ self._original_requests = set(cache_validation_result['effective_requests']) |
+ self._original_post_requests = set( |
+ cache_validation_result['effective_post_requests']) |
+ self._original_cached_requests = self._original_requests.intersection( |
+ self._cache_whitelist) |
+ self._original_uncached_requests = self._original_requests.difference( |
+ self._cache_whitelist) |
+ self._all_sent_url_requests = set() |
+ |
+ def VerifyTrace(self, trace): |
+ """Verifies a trace with the cache validation result and the benchmark |
+ setup. |
+ """ |
effective_requests = _ListUrlRequests(trace, _RequestOutcome.All) |
effective_post_requests = _ListUrlRequests(trace, _RequestOutcome.Post) |
effective_cached_requests = \ |
@@ -299,74 +307,49 @@ def _VerifyBenchmarkOutputDirectory(benchmark_setup_path, |
effective_uncached_requests = \ |
_ListUrlRequests(trace, _RequestOutcome.NotServedFromCache) |
- missing_requests = original_requests.difference(effective_requests) |
- unexpected_requests = effective_requests.difference(original_requests) |
+ missing_requests = self._original_requests.difference(effective_requests) |
+ unexpected_requests = effective_requests.difference(self._original_requests) |
expected_cached_requests = \ |
- original_cached_requests.difference(missing_requests) |
- missing_cached_requests = \ |
- expected_cached_requests.difference(effective_cached_requests) |
- expected_uncached_requests = original_uncached_requests.union( |
- unexpected_requests).union(missing_cached_requests) |
- all_sent_url_requests.update(effective_uncached_requests) |
+ self._original_cached_requests.difference(missing_requests) |
+ expected_uncached_requests = self._original_uncached_requests.union( |
+ unexpected_requests).difference(missing_requests) |
# POST requests are known to be unable to use the cache. |
expected_cached_requests.difference_update(effective_post_requests) |
expected_uncached_requests.update(effective_post_requests) |
- _PrintUrlSetComparison(original_requests, effective_requests, |
+ _PrintUrlSetComparison(self._original_requests, effective_requests, |
'All resources') |
- _PrintUrlSetComparison(set(), effective_post_requests, |
- 'POST resources') |
+ _PrintUrlSetComparison(set(), effective_post_requests, 'POST resources') |
_PrintUrlSetComparison(expected_cached_requests, effective_cached_requests, |
'Cached resources') |
_PrintUrlSetComparison(expected_uncached_requests, |
effective_uncached_requests, 'Non cached resources') |
- # Verify requests from WPR. |
- wpr_log_path = os.path.join( |
- benchmark_output_directory_path, sandwich_runner.WPR_LOG_FILENAME) |
- logging.info('verifying requests from %s' % wpr_log_path) |
- all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
- all_wpr_urls = set() |
- unserved_wpr_urls = set() |
- wpr_command_colliding_urls = set() |
- |
- for request in all_wpr_requests: |
- if request.is_wpr_host: |
- continue |
- if urlparse(request.url).path.startswith('/web-page-replay'): |
- wpr_command_colliding_urls.add(request.url) |
- elif request.is_served is False: |
- unserved_wpr_urls.add(request.url) |
- all_wpr_urls.add(request.url) |
- |
- _PrintUrlSetComparison(set(), unserved_wpr_urls, |
- 'Distinct unserved resources from WPR') |
- _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
- 'Distinct resources colliding to WPR commands') |
- _PrintUrlSetComparison(all_wpr_urls, all_sent_url_requests, |
- 'Distinct resource requests to WPR') |
- |
+ self._all_sent_url_requests.update(effective_uncached_requests) |
-def _ReadSubresourceFromRunnerOutputDir(runner_output_dir): |
- """Extracts a list of subresources in runner output directory. |
+ def VerifyWprLog(self, wpr_log_path): |
+    """Verifies the WPR log against the previously verified traces.""" |
+ all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
+ all_wpr_urls = set() |
+ unserved_wpr_urls = set() |
+ wpr_command_colliding_urls = set() |
- Args: |
- runner_output_dir: Path of the runner's output directory. |
+ for request in all_wpr_requests: |
+ if request.is_wpr_host: |
+ continue |
+ if urlparse(request.url).path.startswith('/web-page-replay'): |
+ wpr_command_colliding_urls.add(request.url) |
+ elif request.is_served is False: |
+ unserved_wpr_urls.add(request.url) |
+ all_wpr_urls.add(request.url) |
- Returns: |
- [URLs of sub-resources] |
- """ |
- trace_path = os.path.join( |
- runner_output_dir, '0', sandwich_runner.TRACE_FILENAME) |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- url_set = set() |
- for request_event in _FilterOutDataAndIncompleteRequests( |
- trace.request_track.GetEvents()): |
- url_set.add(request_event.url) |
- logging.info('lists %s resources of %s from %s' % \ |
- (len(url_set), trace.url, trace_path)) |
- return [url for url in url_set] |
+ _PrintUrlSetComparison(set(), unserved_wpr_urls, |
+ 'Distinct unserved resources from WPR') |
+ _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
+ 'Distinct resources colliding to WPR commands') |
+ _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
+ 'Distinct resource requests to WPR') |
def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
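The verification logic removed above is reassembled into _RunOutputVerifier. A rough sketch of the intended calling sequence (it mirrors _ProcessRunOutputDir later in this patch; benchmark_setup_path, validation_json_path and run_output_dir are assumed inputs provided by the surrounding task code):

import json
import os

benchmark_setup = json.load(open(benchmark_setup_path))
cache_validation_result = json.load(open(validation_json_path))
verifier = _RunOutputVerifier(cache_validation_result, benchmark_setup)
for _, repeat_dir in sandwich_runner.WalkRepeatedRuns(run_output_dir):
  trace = loading_trace.LoadingTrace.FromJsonFile(
      os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME))
  verifier.VerifyTrace(trace)  # Per-repeat checks against the setup.
# The WPR log is verified once, after all traces have been seen.
verifier.VerifyWprLog(
    os.path.join(run_output_dir, sandwich_runner.WPR_LOG_FILENAME))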
@@ -375,6 +358,14 @@ def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
Args: |
cache_build_trace_path: Path of the generated trace at the cache build time. |
cache_archive_path: Cache archive's path to validate. |
+ |
+ Returns: |
+ { |
+ 'effective_requests': [URLs of all requests], |
+ 'effective_post_requests': [URLs of POST requests], |
+ 'expected_cached_resources': [URLs of resources expected to be cached], |
+    'successfully_cached_resources': [URLs of cached sub-resources] |
+ } |
""" |
# TODO(gabadie): What's the best way of propagating errors happening in here? |
logging.info('lists cached urls from %s' % cache_archive_path) |
@@ -405,6 +396,69 @@ def _ValidateCacheArchiveContent(cache_build_trace_path, cache_archive_path): |
_PrintUrlSetComparison(expected_cached_requests, effective_cache_keys, |
'Cached resources') |
+ return { |
+ 'effective_requests': [url for url in effective_requests], |
+ 'effective_post_requests': [url for url in effective_post_requests], |
+ 'expected_cached_resources': [url for url in expected_cached_requests], |
+ 'successfully_cached_resources': [url for url in effective_cache_keys] |
+ } |
+ |
+ |
+def _ProcessRunOutputDir( |
+ cache_validation_result, benchmark_setup, runner_output_dir): |
+  """Processes the benchmark's run output directory. |
+ |
+ Args: |
+ cache_validation_result: Same as for _RunOutputVerifier |
+ benchmark_setup: Same as for _RunOutputVerifier |
+ runner_output_dir: Same as for SandwichRunner.output_dir |
+ |
+ Returns: |
+    List of dictionaries (one per repeated run), sorted by repeat_id. |
+ """ |
+ run_metrics_list = [] |
+ run_output_verifier = _RunOutputVerifier( |
+ cache_validation_result, benchmark_setup) |
+ for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns( |
+ runner_output_dir): |
+ trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME) |
+ |
+ logging.info('loading trace: %s', trace_path) |
+ trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
+ |
+ logging.info('verifying trace: %s', trace_path) |
+ run_output_verifier.VerifyTrace(trace) |
+ |
+ logging.info('extracting metrics from trace: %s', trace_path) |
+ run_metrics = { |
+ 'url': trace.url, |
+ 'repeat_id': repeat_id, |
+ 'subresource_discoverer': benchmark_setup['subresource_discoverer'], |
+ 'cache_recording.subresource_count': |
+ len(cache_validation_result['effective_requests']), |
+ 'cache_recording.cached_subresource_count_theoretic': |
+ len(cache_validation_result['successfully_cached_resources']), |
+ 'cache_recording.cached_subresource_count': |
+ len(cache_validation_result['expected_cached_resources']), |
+ 'benchmark.subresource_count': len(_ListUrlRequests( |
+ trace, _RequestOutcome.All)), |
+ 'benchmark.served_from_cache_count_theoretic': |
+ len(benchmark_setup['cache_whitelist']), |
+ 'benchmark.served_from_cache_count': len(_ListUrlRequests( |
+ trace, _RequestOutcome.ServedFromCache)), |
+ } |
+ run_metrics.update( |
+ sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory( |
+ repeat_dir, trace)) |
+ run_metrics_list.append(run_metrics) |
+ run_metrics_list.sort(key=lambda e: e['repeat_id']) |
+ |
+ wpr_log_path = os.path.join( |
+ runner_output_dir, sandwich_runner.WPR_LOG_FILENAME) |
+ logging.info('verifying wpr log: %s', wpr_log_path) |
+ run_output_verifier.VerifyWprLog(wpr_log_path) |
+ return run_metrics_list |
+ |
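One property of the metrics produced above, useful when sanity-checking the output (the variables below are assumed to be the same ones passed to the function): the 'cache_recording.*' values come from the cache validation result and are therefore identical across repeats, while the 'benchmark.*' values are recomputed from each trace.

run_metrics_list = _ProcessRunOutputDir(
    cache_validation_result, benchmark_setup, runner_output_dir)
# Constant per site: derived from the cache validation JSON.
assert len(set(m['cache_recording.subresource_count']
               for m in run_metrics_list)) <= 1
# Varies per repeat: recomputed from each run's trace.
served_counts = [m['benchmark.served_from_cache_count']
                 for m in run_metrics_list]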
class PrefetchBenchmarkBuilder(task_manager.Builder): |
"""A builder for a graph of tasks for NoState-Prefetch emulated benchmarks.""" |
@@ -415,10 +469,10 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
common_builder.output_subdirectory) |
self._common_builder = common_builder |
- self._patched_wpr_task = None |
- self._reference_cache_task = None |
+ self._wpr_archive_path = None |
+ self._cache_path = None |
self._trace_from_grabbing_reference_cache = None |
- self._subresources_for_urls_task = None |
+ self._cache_validation_task = None |
self._PopulateCommonPipelines() |
def _PopulateCommonPipelines(self): |
@@ -428,13 +482,11 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
subresources (urls-resources.json). |
Here is the full dependency tree for the returned task: |
- common/patched-cache-validation.log |
+ common/patched-cache-validation.json |
depends on: common/patched-cache.zip |
depends on: common/original-cache.zip |
depends on: common/webpages-patched.wpr |
depends on: common/webpages.wpr |
- depends on: common/urls-resources.json |
- depends on: common/original-cache.zip |
""" |
@self.RegisterTask('common/webpages-patched.wpr', |
dependencies=[self._common_builder.original_wpr_task]) |
@@ -461,29 +513,18 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
_PatchCacheArchive(BuildOriginalCache.path, |
original_cache_trace_path, BuildPatchedCache.path) |
- @self.RegisterTask('common/subresources-for-urls.json', |
- [BuildOriginalCache]) |
- def ListUrlsResources(): |
- url_resources = _ReadSubresourceFromRunnerOutputDir( |
- BuildOriginalCache.run_path) |
- with open(ListUrlsResources.path, 'w') as output: |
- json.dump(url_resources, output) |
- |
- @self.RegisterTask('common/patched-cache-validation.log', |
+ @self.RegisterTask('common/patched-cache-validation.json', |
[BuildPatchedCache]) |
def ValidatePatchedCache(): |
- handler = logging.FileHandler(ValidatePatchedCache.path) |
- logging.getLogger().addHandler(handler) |
- try: |
- _ValidateCacheArchiveContent( |
- original_cache_trace_path, BuildPatchedCache.path) |
- finally: |
- logging.getLogger().removeHandler(handler) |
- |
- self._patched_wpr_task = BuildPatchedWpr |
+ cache_validation_result = _ValidateCacheArchiveContent( |
+ original_cache_trace_path, BuildPatchedCache.path) |
+ with open(ValidatePatchedCache.path, 'w') as output: |
+ json.dump(cache_validation_result, output) |
+ |
+ self._wpr_archive_path = BuildPatchedWpr.path |
self._trace_from_grabbing_reference_cache = original_cache_trace_path |
- self._reference_cache_task = BuildPatchedCache |
- self._subresources_for_urls_task = ListUrlsResources |
+ self._cache_path = BuildPatchedCache.path |
+ self._cache_validation_task = ValidatePatchedCache |
self._common_builder.default_final_tasks.append(ValidatePatchedCache) |
@@ -503,21 +544,19 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
<transformer_list_name>/<subresource_discoverer>-metrics.csv |
depends on: <transformer_list_name>/<subresource_discoverer>-run/ |
depends on: common/<subresource_discoverer>-cache.zip |
- depends on: some tasks saved by PopulateCommonPipelines() |
depends on: common/<subresource_discoverer>-setup.json |
- depends on: some tasks saved by PopulateCommonPipelines() |
+ depends on: common/patched-cache-validation.json |
""" |
additional_column_names = [ |
'url', |
'repeat_id', |
'subresource_discoverer', |
- 'subresource_count', |
- # The amount of subresources detected at SetupBenchmark step. |
- 'subresource_count_theoretic', |
- # Amount of subresources for caching as suggested by the subresource |
- # discoverer. |
- 'cached_subresource_count_theoretic', |
- 'cached_subresource_count'] |
+ 'cache_recording.subresource_count', |
+ 'cache_recording.cached_subresource_count_theoretic', |
+ 'cache_recording.cached_subresource_count', |
+ 'benchmark.subresource_count', |
+ 'benchmark.served_from_cache_count_theoretic', |
+ 'benchmark.served_from_cache_count'] |
assert subresource_discoverer in SUBRESOURCE_DISCOVERERS |
assert 'common' not in SUBRESOURCE_DISCOVERERS |
@@ -525,28 +564,25 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
task_prefix = os.path.join(transformer_list_name, subresource_discoverer) |
@self.RegisterTask(shared_task_prefix + '-setup.json', merge=True, |
- dependencies=[self._subresources_for_urls_task]) |
+ dependencies=[self._cache_validation_task]) |
def SetupBenchmark(): |
whitelisted_urls = _ExtractDiscoverableUrls( |
self._trace_from_grabbing_reference_cache, subresource_discoverer) |
- url_resources = json.load(open(self._subresources_for_urls_task.path)) |
common_util.EnsureParentDirectoryExists(SetupBenchmark.path) |
with open(SetupBenchmark.path, 'w') as output: |
json.dump({ |
'cache_whitelist': [url for url in whitelisted_urls], |
'subresource_discoverer': subresource_discoverer, |
- 'url_resources': url_resources, |
}, output) |
@self.RegisterTask(shared_task_prefix + '-cache.zip', merge=True, |
- dependencies=[ |
- SetupBenchmark, self._reference_cache_task]) |
+ dependencies=[SetupBenchmark]) |
def BuildBenchmarkCacheArchive(): |
- setup = json.load(open(SetupBenchmark.path)) |
+ benchmark_setup = json.load(open(SetupBenchmark.path)) |
chrome_cache.ApplyUrlWhitelistToCacheArchive( |
- cache_archive_path=self._reference_cache_task.path, |
- whitelisted_urls=setup['cache_whitelist'], |
+ cache_archive_path=self._cache_path, |
+ whitelisted_urls=benchmark_setup['cache_whitelist'], |
output_cache_archive_path=BuildBenchmarkCacheArchive.path) |
@self.RegisterTask(task_prefix + '-run/', |
@@ -555,7 +591,7 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
runner = self._common_builder.CreateSandwichRunner() |
for transformer in transformer_list: |
transformer(runner) |
- runner.wpr_archive_path = self._patched_wpr_task.path |
+ runner.wpr_archive_path = self._wpr_archive_path |
runner.wpr_out_log_path = os.path.join( |
RunBenchmark.path, sandwich_runner.WPR_LOG_FILENAME) |
runner.cache_archive_path = BuildBenchmarkCacheArchive.path |
@@ -565,42 +601,18 @@ class PrefetchBenchmarkBuilder(task_manager.Builder): |
@self.RegisterTask(task_prefix + '-metrics.csv', |
dependencies=[RunBenchmark]) |
- def ExtractMetrics(): |
- # TODO(gabadie): Performance improvement: load each trace only once and |
- # use it for validation and extraction of metrics later. |
- _VerifyBenchmarkOutputDirectory(SetupBenchmark.path, RunBenchmark.path) |
- |
+ def ProcessRunOutputDir(): |
benchmark_setup = json.load(open(SetupBenchmark.path)) |
- run_metrics_list = [] |
- for repeat_id, repeat_dir in sandwich_runner.WalkRepeatedRuns( |
- RunBenchmark.path): |
- trace_path = os.path.join(repeat_dir, sandwich_runner.TRACE_FILENAME) |
- logging.info('processing trace: %s', trace_path) |
- trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
- run_metrics = { |
- 'url': trace.url, |
- 'repeat_id': repeat_id, |
- 'subresource_discoverer': benchmark_setup['subresource_discoverer'], |
- 'subresource_count': len(_ListUrlRequests( |
- trace, _RequestOutcome.All)), |
- 'subresource_count_theoretic': |
- len(benchmark_setup['url_resources']), |
- 'cached_subresource_count': len(_ListUrlRequests( |
- trace, _RequestOutcome.ServedFromCache)), |
- 'cached_subresource_count_theoretic': |
- len(benchmark_setup['cache_whitelist']), |
- } |
- run_metrics.update( |
- sandwich_metrics.ExtractCommonMetricsFromRepeatDirectory( |
- repeat_dir, trace)) |
- run_metrics_list.append(run_metrics) |
- |
- run_metrics_list.sort(key=lambda e: e['repeat_id']) |
- with open(ExtractMetrics.path, 'w') as csv_file: |
+ cache_validation_result = json.load( |
+ open(self._cache_validation_task.path)) |
+ |
+ run_metrics_list = _ProcessRunOutputDir( |
+ cache_validation_result, benchmark_setup, RunBenchmark.path) |
+ with open(ProcessRunOutputDir.path, 'w') as csv_file: |
writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
writer.writeheader() |
for trace_metrics in run_metrics_list: |
writer.writerow(trace_metrics) |
- self._common_builder.default_final_tasks.append(ExtractMetrics) |
+ self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |
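Since the final artifact per discoverer is now the metrics CSV written above, here is a small, hypothetical post-processing sketch using only the standard library and the column names introduced in this patch (the file name is illustrative; the real path is <transformer_list_name>/<subresource_discoverer>-metrics.csv):

import csv

with open('metrics.csv') as csv_file:  # Hypothetical path.
  rows = list(csv.DictReader(csv_file))
served = [int(r['benchmark.served_from_cache_count']) for r in rows]
total = [int(r['benchmark.subresource_count']) for r in rows]
# Fraction of subresources served from the cache, per repeat.
cache_hit_ratios = [float(s) / t for s, t in zip(served, total) if t]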