Chromium Code Reviews

Unified diff: tools/android/loading/sandwich_prefetch.py

Issue 2112483002: sandwich: Fixes two sources of KeyError task failures (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Addresses Egor's comments Created 4 years, 5 months ago
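
Both task failures fixed here are unguarded dict lookups keyed by URL: (1) the original response headers dumped from the WPR archive can miss failed requests and URLs whose paths differ only by duplicated slashes, and (2) the per-URL encoded data lengths from the cache validation run can miss a URL that hits the disk cache more than once in a page load. A minimal sketch of the two guarded-lookup shapes the patch applies; the helper names and arguments below are illustrative, not part of the CL:

import logging

def _LookupOrSkip(url, original_headers):
  # Shape (1): log and skip URLs that were never recorded, instead of
  # raising KeyError.
  if url not in original_headers:
    logging.warning('could not find original headers for: %s', url)
    return None
  return original_headers[url]

def _LookupWithFallback(url, recorded_sizes, seen_sizes):
  # Shape (2): prefer the recorded size; otherwise reuse a size already
  # observed during this page load, instead of raising KeyError.
  if url in recorded_sizes:
    return recorded_sizes[url]
  logging.warning('no recorded size for %s, reusing an earlier one', url)
  return seen_sizes[url]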
 # Copyright 2016 The Chromium Authors. All rights reserved.
 # Use of this source code is governed by a BSD-style license that can be
 # found in the LICENSE file.
 
 """
 Implements a task builder for benchmarking effects of NoState Prefetch.
 Noticeable steps of the task pipeline:
 * Save a WPR archive
 * Process the WPR archive to make all resources cacheable
 * Process cache archive to patch response headers back to their original
   values.
 * Find out which resources are discoverable by NoState Prefetch
   (HTMLPreloadScanner)
 * Load pages with empty/full/prefetched cache
 * Extract most important metrics to a CSV
 """
 
 import csv
 import logging
 import json
 import os
 import re
 import shutil
-from urlparse import urlparse
+import urlparse
 
 import chrome_cache
 import common_util
 import loading_trace
 from prefetch_view import PrefetchSimulationView
 from request_dependencies_lens import RequestDependencyLens
 import sandwich_metrics
 import sandwich_runner
+import sandwich_utils
 import task_manager
 import wpr_backend
 
 
 class Discoverer(object):
   # Do not prefetch anything.
   EmptyCache = 'empty-cache'
 
   # Prefetches everything to load fully from cache (impossible in practice).
   FullCache = 'full-cache'
(...skipping 19 matching lines...)
   Discoverer.MainDocument,
   Discoverer.Parser,
   Discoverer.HTMLPreloadScanner,
   Discoverer.HTMLPreloadScannerStore,
 ])
 
 
 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$')
 
 
+def _NormalizeUrl(url):
+  """Returns a normalized URL (collapses duplicated slashes in the path)."""
+  parsed_url = list(urlparse.urlparse(url))
+  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
+  return urlparse.urlunparse(parsed_url)
+
+
 def _PatchWpr(wpr_archive):
   """Patches a WPR archive to get all resources into the HTTP cache and avoid
   invalidation and revalidations.
 
   Args:
     wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch.
   """
   # Sets the resources cache max-age to 10 years.
   MAX_AGE = 10 * 365 * 24 * 60 * 60
   CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE)
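
For reference, a standalone demo of what the new _NormalizeUrl helper does: it collapses duplicated slashes in the path component so that trace URLs and WPR-recorded URLs key the same headers entry (Python 2, matching the module's urlparse import; the URL is made up):

import re
import urlparse

def _NormalizeUrl(url):
  # Same shape as the helper above: collapse runs of '/' in the path only.
  parsed_url = list(urlparse.urlparse(url))
  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
  return urlparse.urlunparse(parsed_url)

print _NormalizeUrl('http://example.com/a//b///c.js?next=//x')
# -> http://example.com/a/b/c.js?next=//x (host and query are untouched)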
(...skipping 112 matching lines...)
   logging.info('number of requests discovered by %s: %d',
                subresource_discoverer, len(requests))
   return requests
 
 
 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests):
   with open(original_headers_path) as file_input:
     original_headers = json.load(file_input)
   pruned_requests = set()
   for request in requests:
-    request_original_headers = original_headers[request.url]
+    url = _NormalizeUrl(request.url)
+    if url not in original_headers:
+      # TODO(gabadie): Investigate why these requests were not in WPR.
+      assert request.failed
+      logging.warning(
+          'could not find original headers for: %s (failure: %s)',
+          url, request.error_text)
+      continue
+    request_original_headers = original_headers[url]
     if ('cache-control' in request_original_headers and
         'no-store' in request_original_headers['cache-control'].lower()):
       pruned_requests.add(request)
   return [r for r in requests if r not in pruned_requests]
 
 
 def _ExtractDiscoverableUrls(
     original_headers_path, loading_trace_path, subresource_discoverer):
   """Extracts discoverable resource urls from a loading trace according to a
   sub-resource discoverer.
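
A self-contained sketch of the pruning above, reusing the _NormalizeUrl demo and a hypothetical namedtuple in place of the trace's request objects; it shows how normalization plus a membership test keeps a failed request (never recorded by WPR) from raising KeyError. This is a simplified shape, not the module's _PruneOutOriginalNoStoreRequests:

import collections
import json
import logging

# Hypothetical stand-in for the loading trace's request objects.
FakeRequest = collections.namedtuple('FakeRequest', 'url failed error_text')

def PruneOutNoStoreRequests(original_headers_path, requests):
  with open(original_headers_path) as file_input:
    original_headers = json.load(file_input)
  pruned_requests = set()
  for request in requests:
    url = _NormalizeUrl(request.url)
    if url not in original_headers:
      # Failed requests never make it into the WPR recording; skip them.
      logging.warning('no original headers for: %s (failure: %s)',
                      url, request.error_text)
      continue
    if 'no-store' in original_headers[url].get('cache-control', '').lower():
      pruned_requests.add(request)
  return [r for r in requests if r not in pruned_requests]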
(...skipping 147 matching lines...)
   def VerifyWprLog(self, wpr_log_path):
     """Verifies WPR log with previously verified traces."""
     all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path)
     all_wpr_urls = set()
     unserved_wpr_urls = set()
     wpr_command_colliding_urls = set()
 
     for request in all_wpr_requests:
       if request.is_wpr_host:
         continue
-      if urlparse(request.url).path.startswith('/web-page-replay'):
+      if urlparse.urlparse(request.url).path.startswith('/web-page-replay'):
         wpr_command_colliding_urls.add(request.url)
       elif request.is_served is False:
         unserved_wpr_urls.add(request.url)
       all_wpr_urls.add(request.url)
 
     _PrintUrlSetComparison(set(), unserved_wpr_urls,
                            'Distinct unserved resources from WPR')
     _PrintUrlSetComparison(set(), wpr_command_colliding_urls,
                            'Distinct resources colliding to WPR commands')
     _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests,
(...skipping 91 matching lines...)
     logging.info('loading trace: %s', trace_path)
     trace = loading_trace.LoadingTrace.FromJsonFile(trace_path)
 
     logging.info('verifying trace: %s', trace_path)
     run_output_verifier.VerifyTrace(trace)
 
     logging.info('extracting metrics from trace: %s', trace_path)
     served_from_network_bytes = 0
     served_from_cache_bytes = 0
     urls_hitting_network = set()
+    response_sizes = {}
     for request in _FilterOutDataAndIncompleteRequests(
         trace.request_track.GetEvents()):
       # Ignore requests served from the blink's cache.
       if request.served_from_cache:
         continue
       urls_hitting_network.add(request.url)
       if request.from_disk_cache:
-        served_from_cache_bytes += cached_encoded_data_lengths[request.url]
+        if request.url in cached_encoded_data_lengths:
+          response_size = cached_encoded_data_lengths[request.url]
+        else:
+          # Some heavy pages overflow the memory cache, so a request may be
+          # served from the disk cache more than once per page load.
+          logging.warning(
+              'Looks like %s could have been served from the memory cache.',
+              request.url)
+          response_size = response_sizes[request.url]
+        served_from_cache_bytes += response_size
       else:
-        served_from_network_bytes += request.GetEncodedDataLength()
+        response_size = request.GetEncodedDataLength()
+        served_from_network_bytes += response_size
+      response_sizes[request.url] = response_size
 
     # Make sure the served from blink's cache requests have at least one
     # corresponding request that was not served from the blink's cache.
     for request in _FilterOutDataAndIncompleteRequests(
         trace.request_track.GetEvents()):
       assert (request.url in urls_hitting_network or
               not request.served_from_cache)
 
     run_metrics = {
         'url': trace.url,
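
The second KeyError source: when the renderer's memory cache overflows, the same URL can be served from the disk cache more than once per page load, while cached_encoded_data_lengths only holds the size observed during cache validation. The loop above therefore records every response size it has already attributed and falls back to that record. A reduced sketch of the accounting, with a hypothetical request tuple standing in for trace events:

import collections

# Hypothetical stand-in; the real request objects expose GetEncodedDataLength().
FakeEvent = collections.namedtuple(
    'FakeEvent', 'url from_disk_cache encoded_data_length')

def TallyTransferSizes(requests, cached_encoded_data_lengths):
  served_from_cache_bytes = 0
  served_from_network_bytes = 0
  response_sizes = {}  # URL -> last response size seen in this page load.
  for request in requests:
    if request.from_disk_cache:
      if request.url in cached_encoded_data_lengths:
        response_size = cached_encoded_data_lengths[request.url]
      else:
        # Repeated disk-cache hit: reuse the size attributed earlier in this
        # page load instead of raising KeyError.
        response_size = response_sizes[request.url]
      served_from_cache_bytes += response_size
    else:
      response_size = request.encoded_data_length
      served_from_network_bytes += response_size
    response_sizes[request.url] = response_size
  return served_from_cache_bytes, served_from_network_bytes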
(...skipping 63 matching lines...)
                       dependencies=[self._common_builder.original_wpr_task])
     def BuildPatchedWpr():
       common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path)
       shutil.copyfile(
           self._common_builder.original_wpr_task.path, BuildPatchedWpr.path)
       wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path)
 
       # Save up original response headers.
       original_response_headers = {e.url: e.GetResponseHeadersDict() \
           for e in wpr_archive.ListUrlEntries()}
+      logging.info('collected original response headers for %d resources',
+                   len(original_response_headers))
+      if not original_response_headers:
+        # TODO(gabadie): How is it possible to not even have the main resource
+        # in the WPR archive? Example URL can be found in:
+        # http://crbug.com/623966#c5
+        raise Exception(
+            'Looks like no resources were recorded in WPR during: {}'.format(
+                self._common_builder.original_wpr_task.name))
       with open(self._original_headers_path, 'w') as file_output:
         json.dump(original_response_headers, file_output)
 
       # Patch WPR.
       _PatchWpr(wpr_archive)
       wpr_archive.Persist()
 
     @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr])
     def BuildOriginalCache():
       runner = self._common_builder.CreateSandwichRunner()
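
BuildPatchedWpr now fails fast when the recording captured nothing, rather than writing an empty headers dump for later tasks to stumble over. The guard reduced to its shape; the function name and arguments are hypothetical, only the entry interface (url, GetResponseHeadersDict) comes from the CL:

import json

def DumpOriginalResponseHeaders(url_entries, output_path):
  # url_entries stands in for wpr_archive.ListUrlEntries().
  original_response_headers = {e.url: e.GetResponseHeadersDict()
                               for e in url_entries}
  if not original_response_headers:
    # An empty mapping means the WPR recording silently captured nothing;
    # stop with a descriptive error now rather than a KeyError later.
    raise Exception('no resources were recorded in WPR')
  with open(output_path, 'w') as file_output:
    json.dump(original_response_headers, file_output)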
(...skipping 111 matching lines...)
       run_metrics_list = _ProcessRunOutputDir(
           cache_validation_result, benchmark_setup, RunBenchmark.path)
       with open(ProcessRunOutputDir.path, 'w') as csv_file:
         writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names +
             sandwich_metrics.COMMON_CSV_COLUMN_NAMES))
         writer.writeheader()
         for trace_metrics in run_metrics_list:
           writer.writerow(trace_metrics)
 
     self._common_builder.default_final_tasks.append(ProcessRunOutputDir)
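
For completeness, the CSV export at the end is plain csv.DictWriter usage: the header row comes from the declared column names, and writerow raises ValueError if a metrics dict contains a key missing from fieldnames (absent keys are written as empty values). A toy example with made-up columns:

import csv

# Made-up columns standing in for additional_column_names +
# sandwich_metrics.COMMON_CSV_COLUMN_NAMES.
fieldnames = ['repeat_id', 'url', 'served_from_cache_bytes']
run_metrics_list = [
    {'repeat_id': 0, 'url': 'http://example.com/', 'served_from_cache_bytes': 1024},
    {'repeat_id': 1, 'url': 'http://example.com/', 'served_from_cache_bytes': 2048},
]

with open('run_metrics.csv', 'w') as csv_file:
  writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
  writer.writeheader()
  for trace_metrics in run_metrics_list:
    writer.writerow(trace_metrics)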