Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 import logging | 5 import logging |
| 6 import json | |
| 7 import os | |
| 6 | 8 |
| 9 import chrome_cache | |
| 10 import common_util | |
| 7 from loading_trace import LoadingTrace | 11 from loading_trace import LoadingTrace |
| 8 from prefetch_view import PrefetchSimulationView | 12 from prefetch_view import PrefetchSimulationView |
| 9 from request_dependencies_lens import RequestDependencyLens | 13 from request_dependencies_lens import RequestDependencyLens |
| 10 from user_satisfied_lens import FirstContentfulPaintLens | |
| 11 import wpr_backend | 14 import wpr_backend |
| 12 | 15 |
| 13 | 16 |
| 17 # Do not prefetch anything. | |
| 18 NO_DISCOVERER = 'no-discoverer' | |
| 19 | |
| 20 # Prefetches everything to load fully from cache (impossible in practice). | |
| 21 FULL_CACHE_DISCOVERER = 'full-cache' | |
| 22 | |
| 14 # Prefetches the first resource following the redirection chain. | 23 # Prefetches the first resource following the redirection chain. |
| 15 REDIRECTED_MAIN_DISCOVERER = 'redirected-main' | 24 REDIRECTED_MAIN_DISCOVERER = 'redirected-main' |
| 16 | 25 |
| 17 # All resources which are fetched from the main document and their redirections. | 26 # All resources which are fetched from the main document and their redirections. |
| 18 PARSER_DISCOVERER = 'parser', | 27 PARSER_DISCOVERER = 'parser' |
| 19 | 28 |
| 20 # Simulation of HTMLPreloadScanner on the main document and their redirections. | 29 # Simulation of HTMLPreloadScanner on the main document and their redirections. |
| 21 HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner', | 30 HTML_PRELOAD_SCANNER_DISCOVERER = 'html-scanner' |
| 22 | 31 |
| 23 SUBRESOURCE_DISCOVERERS = set([ | 32 SUBRESOURCE_DISCOVERERS = set([ |
| 33 NO_DISCOVERER, | |
| 34 FULL_CACHE_DISCOVERER, | |
| 24 REDIRECTED_MAIN_DISCOVERER, | 35 REDIRECTED_MAIN_DISCOVERER, |
| 25 PARSER_DISCOVERER, | 36 PARSER_DISCOVERER, |
| 26 HTML_PRELOAD_SCANNER_DISCOVERER | 37 HTML_PRELOAD_SCANNER_DISCOVERER |
| 27 ]) | 38 ]) |
| 28 | 39 |
| 29 | 40 |
| 30 def PatchWpr(wpr_archive_path): | 41 def PatchWpr(wpr_archive_path): |
| 31 """Patches a WPR archive to get all resources into the HTTP cache and avoid | 42 """Patches a WPR archive to get all resources into the HTTP cache and avoid |
| 32 invalidation and revalidations. | 43 invalidation and revalidations. |
| 33 | 44 |
| (...skipping 44 matching lines...) | |
| 78 'unknown prefetch simulation {}'.format(subresource_discoverer) | 89 'unknown prefetch simulation {}'.format(subresource_discoverer) |
| 79 | 90 |
| 80 # Load trace and related infos. | 91 # Load trace and related infos. |
| 81 logging.info('loading %s' % loading_trace_path) | 92 logging.info('loading %s' % loading_trace_path) |
| 82 trace = LoadingTrace.FromJsonFile(loading_trace_path) | 93 trace = LoadingTrace.FromJsonFile(loading_trace_path) |
| 83 dependencies_lens = RequestDependencyLens(trace) | 94 dependencies_lens = RequestDependencyLens(trace) |
| 84 first_resource_request = trace.request_track.GetFirstResourceRequest() | 95 first_resource_request = trace.request_track.GetFirstResourceRequest() |
| 85 | 96 |
| 86 # Build the list of discovered requests according to the desired simulation. | 97 # Build the list of discovered requests according to the desired simulation. |
| 87 discovered_requests = [] | 98 discovered_requests = [] |
| 88 if subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: | 99 if subresource_discoverer == NO_DISCOVERER: |
| 100 pass | |
| 101 elif subresource_discoverer == FULL_CACHE_DISCOVERER: | |
| 102 discovered_requests = trace.request_track.GetEvents() | |
| 103 elif subresource_discoverer == REDIRECTED_MAIN_DISCOVERER: | |
| 89 discovered_requests = \ | 104 discovered_requests = \ |
| 90 [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] | 105 [dependencies_lens.GetRedirectChain(first_resource_request)[-1]] |
| 91 elif subresource_discoverer == PARSER_DISCOVERER: | 106 elif subresource_discoverer == PARSER_DISCOVERER: |
| 92 discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( | 107 discovered_requests = PrefetchSimulationView.ParserDiscoverableRequests( |
| 93 first_resource_request, dependencies_lens) | 108 first_resource_request, dependencies_lens) |
| 94 elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: | 109 elif subresource_discoverer == HTML_PRELOAD_SCANNER_DISCOVERER: |
| 95 discovered_requests = PrefetchSimulationView.PreloadedRequests( | 110 discovered_requests = PrefetchSimulationView.PreloadedRequests( |
| 96 first_resource_request, dependencies_lens, trace) | 111 first_resource_request, dependencies_lens, trace) |
| 97 else: | 112 else: |
| 98 assert False | 113 assert False |
| 99 | 114 |
| 100 # Prune out data:// requests. | 115 # Prune out data:// requests. |
| 101 whitelisted_urls = set() | 116 whitelisted_urls = set() |
| 102 logging.info('white-listing %s' % first_resource_request.url) | 117 logging.info('white-listing %s' % first_resource_request.url) |
| 103 whitelisted_urls.add(first_resource_request.url) | |
| 104 for request in discovered_requests: | 118 for request in discovered_requests: |
| 105 # Work-around where the protocol may be none for an unclear reason yet. | 119 # Work-around where the protocol may be none for an unclear reason yet. |
| 106 # TODO(gabadie): Follow up on this with Clovis guys and possibly remove | 120 # TODO(gabadie): Follow up on this with Clovis guys and possibly remove |
| 107 # this work-around. | 121 # this work-around. |
| 108 if not request.protocol: | 122 if not request.protocol: |
| 109 logging.warning('ignoring %s (no protocol)' % request.url) | 123 logging.warning('ignoring %s (no protocol)' % request.url) |
| 110 continue | 124 continue |
| 111 # Ignore data protocols. | 125 # Ignore data protocols. |
| 112 if not request.protocol.startswith('http'): | 126 if not request.protocol.startswith('http'): |
| 113 continue | 127 continue |
| 114 logging.info('white-listing %s' % request.url) | 128 logging.info('white-listing %s' % request.url) |
| 115 whitelisted_urls.add(request.url) | 129 whitelisted_urls.add(request.url) |
| 116 return whitelisted_urls | 130 return whitelisted_urls |
| 131 | |
| 132 | |
| 133 def CompareUrlSet(ref_url_set, url_set, url_set_name, debug_hint='Good luck!'): | |
|
pasko (2016/04/18 09:36:12): I find this debug hint to be slightly offensive :)
pasko (2016/04/18 09:36:13): why is this function in sandwich_misc? is it going...
pasko (2016/04/18 09:36:13): more intuitive name: PrintUrlSetComparison. A func...
pasko (2016/04/18 09:36:13): not used outside sandwich_misc -> _Compare...
gabadie (2016/04/19 17:39:48): Done.
gabadie (2016/04/19 17:39:48): Not sure to understand. Used only in this file.
gabadie (2016/04/19 17:39:48): Done.
gabadie (2016/04/19 17:39:49): Done.
| 134 """Compare URL sets | |
| 135 | |
|
pasko (2016/04/18 09:36:12): need to explain what the function prints
gabadie (2016/04/19 17:39:49): Done.
| 136 Args: | |
| 137 ref_url_set: Set of reference urls. | |
| 138 url_set: Set of urls to compare to the reference. | |
| 139 url_set_name: The set name for logging purposes. | |
| 140 debug_hint: A debug hint to help debugging in any case the sets are | |
| 141 different. | |
| 142 """ | |
| 143 assert type(ref_url_set) == set | |
| 144 assert type(url_set) == set | |
| 145 if ref_url_set == url_set: | |
| 146 logging.info(' %d %s are matching.' % (len(ref_url_set), url_set_name)) | |
|
pasko (2016/04/18 09:36:13): why % formatting here and {} formatting in other p...
gabadie (2016/04/19 17:39:49): There is an annoying check in pylint to force the...
| 147 return | |
| 148 logging.error(' %s are not matching.' % url_set_name) | |
| 149 logging.error(' Hint: ' + debug_hint) | |
| 150 logging.error(' List of missing resources:') | |
| 151 for url in ref_url_set.difference(url_set): | |
| 152 logging.error('- ' + url) | |
| 153 logging.error(' List of unexpected resources:') | |
| 154 for url in url_set.difference(ref_url_set): | |
| 155 logging.error('+ ' + url) | |
| 156 | |
| 157 | |
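A side note on the % vs {} formatting thread above: gabadie is most likely referring to pylint's logging checks. logging-format-interpolation flags str.format() inside logging calls, while logging-not-lazy flags eager % interpolation; only the lazy form passes both. A minimal standalone sketch (the path value is made up for illustration, not taken from the CL):

    import logging

    trace_path = '/tmp/trace.json'  # illustrative value only

    # Flagged by logging-format-interpolation: formatting done with str.format().
    logging.info('loading {}'.format(trace_path))

    # Flagged by logging-not-lazy: the string is built even if the record is
    # filtered out before being emitted.
    logging.info('loading %s' % trace_path)

    # Preferred: pass arguments and let the logging module interpolate lazily.
    logging.info('loading %s', trace_path)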
| 158 def _ListUrlRequests(trace, from_cache=None): | |
|
pasko (2016/04/18 09:36:13): default arguments that can be True, False and None...
gabadie (2016/04/19 17:39:48): Done.
| 159 urls = set() | |
| 160 for request_event in trace.request_track.GetEvents(): | |
| 161 if request_event.protocol == None: | |
| 162 continue | |
| 163 if not request_event.protocol.startswith('http'): | |
| 164 continue | |
| 165 if from_cache is not None and request_event.from_disk_cache != from_cache: | |
| 166 continue | |
| 167 urls.add(request_event.url) | |
| 168 return urls | |
| 169 | |
| 170 | |
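On pasko's comment above that arguments which can be True, False and None are hard to read: one conventional fix is to give the three states names and pass the argument by keyword, which also addresses the later request for an explicit second argument to _ListUrlRequests. A hypothetical sketch of that pattern, not code from this CL:

    # Hypothetical names for the tri-state from_cache filter.
    CACHED_ONLY = True       # keep only requests served from the disk cache
    UNCACHED_ONLY = False    # keep only requests that went to the network
    ALL_REQUESTS = None      # do not filter on cache state

    # Call sites then document themselves instead of passing a bare literal:
    #   _ListUrlRequests(trace, from_cache=ALL_REQUESTS)
    #   _ListUrlRequests(trace, from_cache=CACHED_ONLY)
    #   _ListUrlRequests(trace, from_cache=UNCACHED_ONLY)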
| 171 def VerifyBenchmarkOutputDirectory(benchmark_setup_path, | |
| 172 benchmark_output_directory_path): | |
| 173 """Verifies that all run inside the run_output_directory worked as expected. | |
| 174 | |
| 175 Args: | |
| 176 benchmark_setup_path: Path of the JSON of the benchmark setup. | |
|
pasko (2016/04/18 09:36:13): what is a 'benchmark setup'?
gabadie (2016/04/19 17:39:48): See SetupBenchmark in sandwich_task_builder.py
| 177 benchmark_output_directory_path: Path of the benchmark output directory to | |
| 178 verify. | |
| 179 """ | |
| 180 benchmark_setup = json.load(open(benchmark_setup_path)) | |
| 181 cache_whitelist = set(benchmark_setup['cache_whitelist']) | |
| 182 url_resources = set(benchmark_setup['url_resources']) | |
| 183 | |
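To illustrate pasko's question above about the 'benchmark setup': the authoritative format is produced by SetupBenchmark in sandwich_task_builder.py; the sketch below shows only the two keys this function reads, with made-up URLs:

    import json

    # Made-up content covering only the keys consumed here
    # ('cache_whitelist' and 'url_resources').
    example_setup = {
        'cache_whitelist': [
            'https://example.com/',
            'https://example.com/style.css',
        ],
        'url_resources': [
            'https://example.com/',
            'https://example.com/style.css',
            'https://example.com/app.js',
        ],
    }

    with open('benchmark_setup.json', 'w') as setup_file:
        json.dump(example_setup, setup_file, indent=2)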
| 184 # Verify requests from traces. | |
| 185 run_id = -1 | |
| 186 while True: | |
| 187 run_id += 1 | |
| 188 run_path = os.path.join(benchmark_output_directory_path, str(run_id)) | |
| 189 if not os.path.isdir(run_path): | |
| 190 break | |
| 191 trace_path = os.path.join(run_path, 'trace.json') | |
|
pasko (2016/04/18 09:36:13): 'trace.json' exists in many files, consider making...
gabadie (2016/04/19 17:39:48): Nop. trace.json is sandwich specific. I don't see...
| 192 if not os.path.isfile(trace_path): | |
| 193 logging.error('missing trace %s' % trace_path) | |
| 194 continue | |
| 195 trace = LoadingTrace.FromJsonFile(trace_path) | |
| 196 logging.info('verifying %s from %s' % (trace.url, trace_path)) | |
| 197 CompareUrlSet(url_resources, _ListUrlRequests(trace), 'All resources', | |
|
pasko (2016/04/18 09:36:13): explicit second arg for _ListUrlRequests is needed
gabadie (2016/04/19 17:39:48): Done.
| 198 'You may have an issue with an AJAX requests.') | |
|
pasko (2016/04/18 09:36:13): I think one can easily tell this by looking at URL...
gabadie (2016/04/19 17:39:48): Done.
| 199 CompareUrlSet(url_resources.intersection(cache_whitelist), | |
| 200 _ListUrlRequests(trace, True), 'Cached resources', | |
| 201 'The WPR archive patcher may have an invalidation issue.') | |
|
pasko (2016/04/18 09:36:13): It's slightly funny to see gabadie@ providing hint...
gabadie (2016/04/19 17:39:48): The point has to give hint to user who wanted to r...
| 202 CompareUrlSet(url_resources.difference(cache_whitelist), | |
| 203 _ListUrlRequests(trace, False), 'Non cached resources') | |
| 204 | |
| 205 | |
| 206 def ListResourcesUrls(benchmark_output_directory_path): | |
|
pasko (2016/04/18 09:36:13): ListResourceUrls
gabadie (2016/04/19 17:39:48): Done.
| 207 """Lists all requested urls per navigated urls | |
|
pasko (2016/04/18 09:36:12): s/urls/URLs/
gabadie (2016/04/19 17:39:48): Done.
| 208 | |
| 209 Args: | |
| 210 benchmark_output_directory_path: Path of the benchmark output directory to | |
| 211 verify. | |
| 212 | |
| 213 Returns: | |
| 214 {url -> [urls of sub-resources]} | |
| 215 """ | |
| 216 url_subresources = {} | |
| 217 run_id = -1 | |
| 218 while True: | |
| 219 run_id += 1 | |
| 220 run_path = os.path.join(benchmark_output_directory_path, str(run_id)) | |
| 221 if not os.path.isdir(run_path): | |
| 222 break | |
| 223 trace_path = os.path.join(run_path, 'trace.json') | |
| 224 if not os.path.isfile(trace_path): | |
| 225 continue | |
| 226 trace = LoadingTrace.FromJsonFile(trace_path) | |
| 227 if trace.url in url_subresources: | |
| 228 continue | |
| 229 logging.info('lists resources of %s from %s' % (trace.url, trace_path)) | |
| 230 urls_set = set() | |
| 231 for request_event in trace.request_track.GetEvents(): | |
| 232 if not request_event.protocol.startswith('http'): | |
| 233 continue | |
| 234 if request_event.url not in urls_set: | |
| 235 logging.info(' %s' % request_event.url) | |
| 236 urls_set.add(request_event.url) | |
| 237 url_subresources[trace.url] = [url for url in urls_set] | |
| 238 return url_subresources | |
| 239 | |
| 240 | |
| 241 def ValidateCacheArchiveContent(ref_urls, cache_archive_path): | |
| 242 """Validates a cache archive content. | |
| 243 | |
| 244 Args: | |
| 245 ref_urls: Reference list of urls. | |
| 246 cache_archive_path: Cache archive's path to validate. | |
| 247 """ | |
| 248 logging.info('lists cached urls from %s' % cache_archive_path) | |
| 249 with common_util.TemporaryDirectory() as cache_directory: | |
| 250 chrome_cache.UnzipDirectoryContent(cache_archive_path, cache_directory) | |
| 251 cached_urls = \ | |
| 252 chrome_cache.CacheBackend(cache_directory, 'simple').ListKeys() | |
| 253 CompareUrlSet(set(ref_urls), set(cached_urls), 'cached resources', | |
| 254 debug_hint='Looks like a response header needs to be patched.') | |
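For orientation, a hypothetical way the verification helpers above could be driven; the paths and URL list are illustrative only and do not come from this CL:

    # Illustrative inputs; real values come from the sandwich benchmark tasks.
    VerifyBenchmarkOutputDirectory('out/sandwich/benchmark_setup.json',
                                   'out/sandwich/benchmark_output')

    reference_urls = ['https://example.com/', 'https://example.com/app.js']
    ValidateCacheArchiveContent(reference_urls, 'out/sandwich/cache.zip')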