OLD | NEW |
---|---|
1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 import logging | 5 import logging |
6 import json | |
7 import os | |
6 | 8 |
9 import chrome_cache | |
10 import common_util | |
7 from loading_trace import LoadingTrace | 11 from loading_trace import LoadingTrace |
8 from prefetch_view import PrefetchSimulationView | 12 from prefetch_view import PrefetchSimulationView |
9 from request_dependencies_lens import RequestDependencyLens | 13 from request_dependencies_lens import RequestDependencyLens |
10 from user_satisfied_lens import FirstContentfulPaintLens | 14 import sandwich_runner |
11 import wpr_backend | 15 import wpr_backend |
12 | 16 |
13 | 17 |
18 # Do not prefetch anything. | |
19 EMPTY_CACHE_DISCOVERER = 'empty-cache' | |
20 | |
21 # Prefetches everything to load fully from cache (impossible in practice). | |
22 FULL_CACHE_DISCOVERER = 'full-cache' | |
23 | |
14 # Prefetches the first resource following the redirection chain. | 24 # Prefetches the first resource following the redirection chain. |
15 REDIRECTED_MAIN_DISCOVERER = 'redirected-main' | 25 REDIRECTED_MAIN_DISCOVERER = 'redirected-main' |
16 | 26 |
17 # All resources which are fetched from the main document and their redirections. | 27 # All resources which are fetched from the main document and their redirections. |
18 PARSER_DISCOVERER = 'parser' | 28 PARSER_DISCOVERER = 'parser' |
19 | 29 |
20 # Simulation of HTMLPreloadScanner on the main document and their redirections. | 30 # Simulation of HTMLPreloadScanner on the main document and their redirections. |
21 HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner' | 31 HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner' |
22 | 32 |
23 SUBRESOURCE_DISCOVERERS = set([ | 33 SUBRESOURCE_DISCOVERERS = set([ |
34 EMPTY_CACHE_DISCOVERER, | |
35 FULL_CACHE_DISCOVERER, | |
24 REDIRECTED_MAIN_DISCOVERER, | 36 REDIRECTED_MAIN_DISCOVERER, |
25 PARSER_DISCOVERER, | 37 PARSER_DISCOVERER, |
26 HTML_PRELOAD_SCANNER_DISCOVERER | 38 HTML_PRELOAD_SCANNER_DISCOVERER |
27 ]) | 39 ]) |
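The set above is what a requested discoverer name is validated against before a simulation runs; a minimal illustration of that check (the `chosen_discoverer` variable is hypothetical, mirroring the assert further down in this file):

```python
# Hypothetical caller-side check; chosen_discoverer is illustrative only.
chosen_discoverer = HTML_PRELOAD_SCANNER_DISCOVERER
assert chosen_discoverer in SUBRESOURCE_DISCOVERERS, \
    'unknown prefetch simulation {}'.format(chosen_discoverer)
```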
28 | 40 |
29 | 41 |
30 def PatchWpr(wpr_archive_path): | 42 def PatchWpr(wpr_archive_path): |
31 """Patches a WPR archive to get all resources into the HTTP cache and avoid | 43 """Patches a WPR archive to get all resources into the HTTP cache and avoid |
32 invalidation and revalidations. | 44 invalidation and revalidations. |
33 | 45 |
(...skipping 44 matching lines...) |
78 'unknown prefetch simulation {}'.format(subresource_discoverer) | 90 'unknown prefetch simulation {}'.format(subresource_discoverer) |
79 | 91 |
80 # Load trace and related infos. | 92 # Load trace and related infos. |
81 logging.info('loading %s' % loading_trace_path) | 93 logging.info('loading %s' % loading_trace_path) |
82 trace = LoadingTrace.FromJsonFile(loading_trace_path) | 94 trace = LoadingTrace.FromJsonFile(loading_trace_path) |
83 dependencies_lens = RequestDependencyLens(trace) | 95 dependencies_lens = RequestDependencyLens(trace) |
84 first_resource_request = trace.request_track.GetFirstResourceRequest() | 96 first_resource_request = trace.request_track.GetFirstResourceRequest() |
85 | 97 |
86 # Build the list of discovered requests according to the desired simulation. | 98 # Build the list of discovered requests according to the desired simulation. |
87 discovered_requests = [] | 99 discovered_requests = [] |
88 if subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: | 100 if subresource_discoverer == EMPTY_CACHE_DISCOVERER: |
101 pass | |
102 elif subresource_discoverer == FULL_CACHE_DISCOVERER: | |
103 discovered_requests = trace.request_track.GetEvents() | |
104 elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: | |
89 discovered_requests = \ | 105 discovered_requests = \ |
90 [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] | 106 [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] |
91 elif subresource_discoverer == PARSER_DISCOVERER: | 107 elif subresource_discoverer == PARSER_DISCOVERER: |
92 discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( | 108 discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( |
93 first_resource_request, dependencies_lens) | 109 first_resource_request, dependencies_lens) |
94 elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: | 110 elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: |
95 discovered_requests = PrefetchSimulationView.PreloadedRequests( | 111 discovered_requests = PrefetchSimulationView.PreloadedRequests( |
96 first_resource_request, dependencies_lens, trace) | 112 first_resource_request, dependencies_lens, trace) |
97 else: | 113 else: |
98 assert False | 114 assert False |
99 | 115 |
100 # Prune out data:// requests. | 116 # Prune out data:// requests. |
101 whitelisted_urls = set() | 117 whitelisted_urls = set() |
102 logging.info('white-listing %s' % first_resource_request.url) | 118 logging.info('white-listing %s' % first_resource_request.url) |
103 whitelisted_urls.add(first_resource_request.url) | |
104 for request in discovered_requests: | 119 for request in discovered_requests: |
105 # Work-around where the protocol may be none for an unclear reason yet. | 120 # Work-around where the protocol may be none for an unclear reason yet. |
106 # TODO(gabadie): Follow up on this with Clovis guys and possibly remove | 121 # TODO(gabadie): Follow up on this with Clovis guys and possibly remove |
107 # this work-around. | 122 # this work-around. |
108 if not request.protocol: | 123 if not request.protocol: |
109 logging.warning('ignoring %s (no protocol)' % request.url) | 124 logging.warning('ignoring %s (no protocol)' % request.url) |
110 continue | 125 continue |
111 # Ignore data protocols. | 126 # Ignore data protocols. |
112 if not request.protocol.startswith('http'): | 127 if not request.protocol.startswith('http'): |
113 continue | 128 continue |
114 logging.info('white-listing %s' % request.url) | 129 logging.info('white-listing %s' % request.url) |
115 whitelisted_urls.add(request.url) | 130 whitelisted_urls.add(request.url) |
116 return whitelisted_urls | 131 return whitelisted_urls |
132 | |
133 | |
134 def _PrintUrlSetComparison(ref_url_set, url_set, url_set_name): | |
135 """Compare URL sets and log the diffs. | |
136 | |
137 Args: | |
138 ref_url_set: Set of reference urls. | |
139 url_set: Set of urls to compare to the reference. | |
140 url_set_name: The set name for logging purposes. | |
141 """ | |
142 assert type(ref_url_set) == set | |
143 assert type(url_set) == set | |
144 if ref_url_set == url_set: | |
145 logging.info(' %d %s are matching.' % (len(ref_url_set), url_set_name)) | |
146 return | |
147 logging.error(' %s are not matching.' % url_set_name) | |
148 logging.error(' List of missing resources:') | |
149 for url in ref_url_set.difference(url_set): | |
150 logging.error('- ' + url) | |
151 logging.error(' List of unexpected resources:') | |
152 for url in url_set.difference(ref_url_set): | |
153 logging.error('+ ' + url) | |
154 | |
155 | |
156 def _ListUrlRequests(trace, from_cache): | |
157 """Lists requested URLs from a trace. | |
158 | |
159 Args: | |
160 trace: The trace. | |
161 from_cache: | |
162 None to list all requested urls; | |
pasko 2016/04/21 18:21:44: these rules are hard to remember, so the reader wo
gabadie 2016/04/22 14:16:42: Done.
| |
163 True to list all requested urls served from cache; | |
164 False to list all requested urls not served from cache. |
165 | |
166 Returns: | |
167 set([str]) | |
168 """ | |
169 urls = set() | |
170 for request_event in trace.request_track.GetEvents(): | |
171 if request_event.protocol == None: | |
172 continue | |
173 if not request_event.protocol.startswith('http'): | |
pasko 2016/04/21 18:21:44: This was not mentioned in the docstring. Does this
gabadie 2016/04/22 14:16:42: This is what this is for.
| |
174 continue | |
175 if from_cache is not None and request_event.from_disk_cache != from_cache: | |
176 continue | |
177 urls.add(request_event.url) | |
178 return urls | |
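Since the three-way `from_cache` switch drew a reviewer question above, here is a minimal usage sketch of the three modes (the benchmark output path is hypothetical):

```python
# Sketch only: the output directory path below is made up.
trace = LoadingTrace.FromJsonFile(
    os.path.join('/tmp/benchmark-output', '0', sandwich_runner.TRACE_FILENAME))
all_urls = _ListUrlRequests(trace, from_cache=None)   # every HTTP(S) request
cached = _ListUrlRequests(trace, from_cache=True)     # served from the disk cache
uncached = _ListUrlRequests(trace, from_cache=False)  # not served from the disk cache
```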
179 | |
180 | |
181 def VerifyBenchmarkOutputDirectory(benchmark_setup_path, | |
pasko 2016/04/21 18:21:44: Need to apply the same action as for ValidateCache
gabadie 2016/04/22 14:16:42: Acknowledged. But I don't want to block sandwich w
| |
182 benchmark_output_directory_path): | |
183 """Verifies that all runs inside the run_output_directory worked as expected. |
184 | |
185 Args: | |
186 benchmark_setup_path: Path of the JSON of the benchmark setup. | |
187 benchmark_output_directory_path: Path of the benchmark output directory to | |
188 verify. | |
189 """ | |
190 benchmark_setup = json.load(open(benchmark_setup_path)) | |
191 cache_whitelist = set(benchmark_setup['cache_whitelist']) | |
192 url_resources = set(benchmark_setup['url_resources']) | |
193 | |
194 # Verify requests from traces. | |
195 run_id = -1 | |
196 while True: | |
197 run_id += 1 | |
198 run_path = os.path.join(benchmark_output_directory_path, str(run_id)) | |
199 if not os.path.isdir(run_path): | |
200 break | |
201 trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) | |
202 if not os.path.isfile(trace_path): | |
203 logging.error('missing trace %s' % trace_path) | |
204 continue | |
205 trace = LoadingTrace.FromJsonFile(trace_path) | |
206 logging.info('verifying %s from %s' % (trace.url, trace_path)) | |
207 _PrintUrlSetComparison(url_resources, _ListUrlRequests(trace, None), | |
208 'All resources') | |
209 _PrintUrlSetComparison(url_resources.intersection(cache_whitelist), | |
210 _ListUrlRequests(trace, True), 'Cached resources') | |
211 _PrintUrlSetComparison(url_resources.difference(cache_whitelist), | |
212 _ListUrlRequests(trace, False), | |
213 'Non cached resources') | |
214 | |
215 | |
216 def ListResourceUrls(benchmark_output_directory_path): | |
pasko 2016/04/21 18:21:44: ReadSubresourceMapFromBenchmarkOutput(...)
gabadie 2016/04/22 14:16:42: Second time you ask me for modification! Done.
pasko 2016/04/25 13:29:06: Acknowledged.
| |
217 """Lists all requested URLs per navigated URL. |
pasko 2016/04/21 18:21:44: """Extracts a map URL-to-subresources for each nav
gabadie 2016/04/22 14:16:42: Second time you ask me for modification! Done.
pasko 2016/04/25 13:29:06: I will keep asking for modifications as many times
| |
218 | |
219 Args: | |
220 benchmark_output_directory_path: Path of the benchmark output directory to | |
221 verify. | |
222 | |
223 Returns: | |
224 {url -> [URLs of sub-resources]} | |
225 """ | |
226 url_subresources = {} | |
227 run_id = -1 | |
228 while True: | |
229 run_id += 1 | |
230 run_path = os.path.join(benchmark_output_directory_path, str(run_id)) | |
231 if not os.path.isdir(run_path): | |
232 break | |
233 trace_path = os.path.join(run_path, sandwich_runner.TRACE_FILENAME) | |
234 if not os.path.isfile(trace_path): | |
235 continue | |
236 trace = LoadingTrace.FromJsonFile(trace_path) | |
237 if trace.url in url_subresources: | |
238 continue | |
239 logging.info('lists resources of %s from %s' % (trace.url, trace_path)) | |
240 urls_set = set() | |
241 for request_event in trace.request_track.GetEvents(): | |
242 if not request_event.protocol.startswith('http'): | |
243 continue | |
244 if request_event.url not in urls_set: | |
245 logging.info(' %s' % request_event.url) | |
246 urls_set.add(request_event.url) | |
247 url_subresources[trace.url] = [url for url in urls_set] | |
248 return url_subresources | |
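For reference, the map returned above keys each navigated URL to its deduplicated subresource URLs; an illustrative (made-up) example of the shape:

```python
# Illustrative return shape only; these URLs are invented.
url_subresources = {
    'https://example.com/': [
        'https://example.com/style.css',
        'https://static.example.com/app.js',
    ],
}
```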
249 | |
250 | |
251 def ValidateCacheArchiveContent(ref_urls, cache_archive_path): | |
pasko 2016/04/21 18:21:44: Producing log messages on error is insufficient -
gabadie 2016/04/22 14:16:42: I don't want to block sandwich workflow because of
pasko 2016/04/25 13:29:06: nit: In browser development the term XMLHttpReques
gabadie 2016/04/27 08:32:16: Acknowledged.
| |
252 """Validates a cache archive content. | |
253 | |
254 Args: | |
255 ref_urls: Reference list of urls. | |
256 cache_archive_path: Cache archive's path to validate. | |
257 """ | |
258 logging.info('lists cached urls from %s' % cache_archive_path) | |
259 with common_util.TemporaryDirectory() as cache_directory: | |
260 chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory) | |
261 cached_urls = \ | |
262 chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys() | |
263 _PrintUrlSetComparison(set(ref_urls), set(cached_urls), 'cached resources') | |
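Picking up pasko's point that log-only validation is easy to overlook, one possible follow-up (a sketch under the assumption that failing hard is acceptable, not something this CL implements) would be a strict variant that surfaces mismatches to the caller:

```python
# Hypothetical strict validation; _UrlSetsMatch and
# ValidateCacheArchiveContentStrict are not part of this CL.
def _UrlSetsMatch(ref_url_set, url_set, url_set_name):
  """Logs the diff like _PrintUrlSetComparison, but also returns the verdict."""
  missing = ref_url_set - url_set
  unexpected = url_set - ref_url_set
  if not missing and not unexpected:
    logging.info('  %d %s are matching.', len(ref_url_set), url_set_name)
    return True
  logging.error('  %s are not matching.', url_set_name)
  for url in missing:
    logging.error('- %s', url)
  for url in unexpected:
    logging.error('+ %s', url)
  return False


def ValidateCacheArchiveContentStrict(ref_urls, cache_archive_path):
  """Raises on mismatch instead of only logging (hypothetical variant)."""
  with common_util.TemporaryDirectory() as cache_directory:
    chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory)
    cached_urls = chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys()
  if not _UrlSetsMatch(set(ref_urls), set(cached_urls), 'cached resources'):
    raise RuntimeError('cache archive mismatch: {}'.format(cache_archive_path))
```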