OLD | NEW |
1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
4 | 4 |
5 """ | 5 """ |
6 Implements a task builder for benchmarking effects of NoState Prefetch. | 6 Implements a task builder for benchmarking effects of NoState Prefetch. |
7 Notable steps of the task pipeline: | 7 Notable steps of the task pipeline: |
8 * Save a WPR archive | 8 * Save a WPR archive |
9 * Process the WPR archive to make all resources cacheable | 9 * Process the WPR archive to make all resources cacheable |
10 * Process cache archive to patch response headers back to their original | 10 * Process cache archive to patch response headers back to their original |
11 values. | 11 values. |
12 * Find out which resources are discoverable by NoState Prefetch | 12 * Find out which resources are discoverable by NoState Prefetch |
13 (HTMLPreloadScanner) | 13 (HTMLPreloadScanner) |
14 * Load pages with empty/full/prefetched cache | 14 * Load pages with empty/full/prefetched cache |
15 * Extract most important metrics to a CSV | 15 * Extract most important metrics to a CSV |
16 """ | 16 """ |
17 | 17 |
18 import csv | 18 import csv |
19 import logging | 19 import logging |
20 import json | 20 import json |
21 import os | 21 import os |
22 import re | 22 import re |
23 import shutil | 23 import shutil |
24 from urlparse import urlparse | 24 import urlparse |
25 | 25 |
26 import chrome_cache | 26 import chrome_cache |
27 import common_util | 27 import common_util |
28 import loading_trace | 28 import loading_trace |
29 from prefetch_view import PrefetchSimulationView | 29 from prefetch_view import PrefetchSimulationView |
30 from request_dependencies_lens import RequestDependencyLens | 30 from request_dependencies_lens import RequestDependencyLens |
31 import sandwich_metrics | 31 import sandwich_metrics |
32 import sandwich_runner | 32 import sandwich_runner |
| 33 import sandwich_utils |
33 import task_manager | 34 import task_manager |
34 import wpr_backend | 35 import wpr_backend |
35 | 36 |
36 | 37 |
37 class Discoverer(object): | 38 class Discoverer(object): |
38 # Do not prefetch anything. | 39 # Do not prefetch anything. |
39 EmptyCache = 'empty-cache' | 40 EmptyCache = 'empty-cache' |
40 | 41 |
41 # Prefetches everything to load fully from cache (impossible in practice). | 42 # Prefetches everything to load fully from cache (impossible in practice). |
42 FullCache = 'full-cache' | 43 FullCache = 'full-cache' |
(...skipping 19 matching lines...) |
62 Discoverer.MainDocument, | 63 Discoverer.MainDocument, |
63 Discoverer.Parser, | 64 Discoverer.Parser, |
64 Discoverer.HTMLPreloadScanner, | 65 Discoverer.HTMLPreloadScanner, |
65 Discoverer.HTMLPreloadScannerStore, | 66 Discoverer.HTMLPreloadScannerStore, |
66 ]) | 67 ]) |
67 | 68 |
68 | 69 |
69 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') | 70 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
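Judging from the pattern, upload-data-stream cache keys look like `<number>/<url>`; a quick illustrative check (the key below is made up):

    # Illustration: recover the URL part from a '<number>/<url>' cache key.
    match = _UPLOAD_DATA_STREAM_REQUESTS_REGEX.match('1/https://example.com/form')
    assert match and match.group('url') == 'https://example.com/form'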
70 | 71 |
71 | 72 |
| 73 def _NormalizeUrl(url): |
| 74 """Returns normalized URL such as removing trailing slashes.""" |
| 75 parsed_url = list(urlparse.urlparse(url)) |
| 76 parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2]) |
| 77 return urlparse.urlunparse(parsed_url) |
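A quick illustration of the helper's effect (hypothetical URL): only runs of slashes in the path component are collapsed; scheme, host, and query are left untouched:

    # Illustration (made-up URL): duplicated slashes in the path collapse.
    assert (_NormalizeUrl('http://example.com//a///b?q=1')
            == 'http://example.com/a/b?q=1')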
| 78 |
| 79 |
72 def _PatchWpr(wpr_archive): | 80 def _PatchWpr(wpr_archive): |
73 """Patches a WPR archive to get all resources into the HTTP cache and avoid | 81 """Patches a WPR archive to get all resources into the HTTP cache and avoid |
74 invalidations and revalidations. | 82 invalidations and revalidations. |
75 | 83 |
76 Args: | 84 Args: |
77 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. | 85 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. |
78 """ | 86 """ |
79 # Sets the resources cache max-age to 10 years. | 87 # Sets the resources cache max-age to 10 years. |
80 MAX_AGE = 10 * 365 * 24 * 60 * 60 | 88 MAX_AGE = 10 * 365 * 24 * 60 * 60 |
81 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) | 89 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
(...skipping 112 matching lines...) |
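The body of `_PatchWpr` is elided above; purely as a hypothetical sketch of the kind of rewrite its constants suggest (`SetResponseHeader` is an assumed `wpr_backend` entry method, not confirmed by this diff):

    # Hypothetical sketch only -- the real patching code is elided above.
    for entry in wpr_archive.ListUrlEntries():
      # Make every recorded response cacheable for ten years.
      entry.SetResponseHeader('cache-control', CACHE_CONTROL)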
194 logging.info('number of requests discovered by %s: %d', | 202 logging.info('number of requests discovered by %s: %d', |
195 subresource_discoverer, len(requests)) | 203 subresource_discoverer, len(requests)) |
196 return requests | 204 return requests |
197 | 205 |
198 | 206 |
199 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): | 207 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): |
200 with open(original_headers_path) as file_input: | 208 with open(original_headers_path) as file_input: |
201 original_headers = json.load(file_input) | 209 original_headers = json.load(file_input) |
202 pruned_requests = set() | 210 pruned_requests = set() |
203 for request in requests: | 211 for request in requests: |
204 request_original_headers = original_headers[request.url] | 212 url = _NormalizeUrl(request.url) |
| 213 if url not in original_headers: |
| 214 # TODO(gabadie): Investigate why these requests were not in WPR. |
| 215 assert request.failed |
| 216 logging.warning( |
| 217 'could not find original headers for: %s (failure: %s)', |
| 218 url, request.error_text) |
| 219 continue |
| 220 request_original_headers = original_headers[url] |
205 if ('cache-control' in request_original_headers and | 221 if ('cache-control' in request_original_headers and |
206 'no-store' in request_original_headers['cache-control'].lower()): | 222 'no-store' in request_original_headers['cache-control'].lower()): |
207 pruned_requests.add(request) | 223 pruned_requests.add(request) |
208 return [r for r in requests if r not in pruned_requests] | 224 return [r for r in requests if r not in pruned_requests] |
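The shape of the headers JSON follows from `BuildPatchedWpr` further down, which dumps a `{url: headers_dict}` mapping; a tiny illustrative input (URLs are made up):

    # Illustration: a request whose original response carried
    # 'cache-control: no-store' is pruned; the other survives.
    original_headers = {
        'http://a.test/': {'cache-control': 'private, no-store'},
        'http://a.test/app.js': {'cache-control': 'max-age=3600'},
    }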
209 | 225 |
210 | 226 |
211 def _ExtractDiscoverableUrls( | 227 def _ExtractDiscoverableUrls( |
212 original_headers_path, loading_trace_path, subresource_discoverer): | 228 original_headers_path, loading_trace_path, subresource_discoverer): |
213 """Extracts discoverable resource urls from a loading trace according to a | 229 """Extracts discoverable resource urls from a loading trace according to a |
214 sub-resource discoverer. | 230 sub-resource discoverer. |
(...skipping 147 matching lines...) |
362 def VerifyWprLog(self, wpr_log_path): | 378 def VerifyWprLog(self, wpr_log_path): |
363 """Verifies WPR log with previously verified traces.""" | 379 """Verifies WPR log with previously verified traces.""" |
364 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) | 380 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
365 all_wpr_urls = set() | 381 all_wpr_urls = set() |
366 unserved_wpr_urls = set() | 382 unserved_wpr_urls = set() |
367 wpr_command_colliding_urls = set() | 383 wpr_command_colliding_urls = set() |
368 | 384 |
369 for request in all_wpr_requests: | 385 for request in all_wpr_requests: |
370 if request.is_wpr_host: | 386 if request.is_wpr_host: |
371 continue | 387 continue |
372 if urlparse(request.url).path.startswith('/web-page-replay'): | 388 if urlparse.urlparse(request.url).path.startswith('/web-page-replay'): |
373 wpr_command_colliding_urls.add(request.url) | 389 wpr_command_colliding_urls.add(request.url) |
374 elif request.is_served is False: | 390 elif request.is_served is False: |
375 unserved_wpr_urls.add(request.url) | 391 unserved_wpr_urls.add(request.url) |
376 all_wpr_urls.add(request.url) | 392 all_wpr_urls.add(request.url) |
377 | 393 |
378 _PrintUrlSetComparison(set(), unserved_wpr_urls, | 394 _PrintUrlSetComparison(set(), unserved_wpr_urls, |
379 'Distinct unserved resources from WPR') | 395 'Distinct unserved resources from WPR') |
380 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, | 396 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
381 'Distinct resources colliding to WPR commands') | 397 'Distinct resources colliding to WPR commands') |
382 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, | 398 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
(...skipping 91 matching lines...) |
474 logging.info('loading trace: %s', trace_path) | 490 logging.info('loading trace: %s', trace_path) |
475 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) | 491 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
476 | 492 |
477 logging.info('verifying trace: %s', trace_path) | 493 logging.info('verifying trace: %s', trace_path) |
478 run_output_verifier.VerifyTrace(trace) | 494 run_output_verifier.VerifyTrace(trace) |
479 | 495 |
480 logging.info('extracting metrics from trace: %s', trace_path) | 496 logging.info('extracting metrics from trace: %s', trace_path) |
481 served_from_network_bytes = 0 | 497 served_from_network_bytes = 0 |
482 served_from_cache_bytes = 0 | 498 served_from_cache_bytes = 0 |
483 urls_hitting_network = set() | 499 urls_hitting_network = set() |
| 500 response_sizes = {} |
484 for request in _FilterOutDataAndIncompleteRequests( | 501 for request in _FilterOutDataAndIncompleteRequests( |
485 trace.request_track.GetEvents()): | 502 trace.request_track.GetEvents()): |
486 # Ignore requests served from Blink's cache. | 503 # Ignore requests served from Blink's cache. |
487 if request.served_from_cache: | 504 if request.served_from_cache: |
488 continue | 505 continue |
489 urls_hitting_network.add(request.url) | 506 urls_hitting_network.add(request.url) |
490 if request.from_disk_cache: | 507 if request.from_disk_cache: |
491 served_from_cache_bytes += cached_encoded_data_lengths[request.url] | 508 if request.url in cached_encoded_data_lengths: |
| 509 response_size = cached_encoded_data_lengths[request.url] |
| 510 else: |
| 511 # Some heavy webpages may overflow the memory cache, so a request |
| 512 # might be served from the disk cache more than once per page |
| 513 # load. |
| 514 logging.warning('Looks like it could be served from memory cache: %s', |
| 515 request.url) |
| 516 response_size = response_sizes[request.url] |
| 517 served_from_cache_bytes += response_size |
492 else: | 518 else: |
493 served_from_network_bytes += request.GetEncodedDataLength() | 519 response_size = request.GetEncodedDataLength() |
| 520 served_from_network_bytes += response_size |
| 521 response_sizes[request.url] = response_size |
494 | 522 |
495 # Make sure each request served from Blink's cache has at least one | 523 # Make sure each request served from Blink's cache has at least one |
496 # corresponding request that was not served from Blink's cache. | 524 # corresponding request that was not served from Blink's cache. |
497 for request in _FilterOutDataAndIncompleteRequests( | 525 for request in _FilterOutDataAndIncompleteRequests( |
498 trace.request_track.GetEvents()): | 526 trace.request_track.GetEvents()): |
499 assert (request.url in urls_hitting_network or | 527 assert (request.url in urls_hitting_network or |
500 not request.served_from_cache) | 528 not request.served_from_cache) |
501 | 529 |
502 run_metrics = { | 530 run_metrics = { |
503 'url': trace.url, | 531 'url': trace.url, |
(...skipping 63 matching lines...) |
567 dependencies=[self._common_builder.original_wpr_task]) | 595 dependencies=[self._common_builder.original_wpr_task]) |
568 def BuildPatchedWpr(): | 596 def BuildPatchedWpr(): |
569 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) | 597 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) |
570 shutil.copyfile( | 598 shutil.copyfile( |
571 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) | 599 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) |
572 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) | 600 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) |
573 | 601 |
574 # Save the original response headers. | 602 # Save the original response headers. |
575 original_response_headers = {e.url: e.GetResponseHeadersDict() \ | 603 original_response_headers = {e.url: e.GetResponseHeadersDict() \ |
576 for e in wpr_archive.ListUrlEntries()} | 604 for e in wpr_archive.ListUrlEntries()} |
| 605 logging.info('saving response headers for %d resources', |
| 606 len(original_response_headers)) |
| 607 if not original_response_headers: |
| 608 # TODO(gabadie): How is it possible to not even have the main resource |
| 609 # in the WPR archive? Example URL can be found in: |
| 610 # http://crbug.com/623966#c5 |
| 611 raise Exception( |
| 612 'Looks like no resources were recorded in WPR during: {}'.format( |
| 613 self._common_builder.original_wpr_task.name)) |
577 with open(self._original_headers_path, 'w') as file_output: | 614 with open(self._original_headers_path, 'w') as file_output: |
578 json.dump(original_response_headers, file_output) | 615 json.dump(original_response_headers, file_output) |
579 | 616 |
580 # Patch WPR. | 617 # Patch WPR. |
581 _PatchWpr(wpr_archive) | 618 _PatchWpr(wpr_archive) |
582 wpr_archive.Persist() | 619 wpr_archive.Persist() |
583 | 620 |
584 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) | 621 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) |
585 def BuildOriginalCache(): | 622 def BuildOriginalCache(): |
586 runner = self._common_builder.CreateSandwichRunner() | 623 runner = self._common_builder.CreateSandwichRunner() |
(...skipping 111 matching lines...) |
698 run_metrics_list = _ProcessRunOutputDir( | 735 run_metrics_list = _ProcessRunOutputDir( |
699 cache_validation_result, benchmark_setup, RunBenchmark.path) | 736 cache_validation_result, benchmark_setup, RunBenchmark.path) |
700 with open(ProcessRunOutputDir.path, 'w') as csv_file: | 737 with open(ProcessRunOutputDir.path, 'w') as csv_file: |
701 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + | 738 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
702 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) | 739 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
703 writer.writeheader() | 740 writer.writeheader() |
704 for trace_metrics in run_metrics_list: | 741 for trace_metrics in run_metrics_list: |
705 writer.writerow(trace_metrics) | 742 writer.writerow(trace_metrics) |
706 | 743 |
707 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) | 744 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |