| OLD | NEW |
| 1 # Copyright 2016 The Chromium Authors. All rights reserved. | 1 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 # Use of this source code is governed by a BSD-style license that can be | 2 # Use of this source code is governed by a BSD-style license that can be |
| 3 # found in the LICENSE file. | 3 # found in the LICENSE file. |
| 4 | 4 |
| 5 """ | 5 """ |
| 6 Implements a task builder for benchmarking effects of NoState Prefetch. | 6 Implements a task builder for benchmarking effects of NoState Prefetch. |
| 7 Noticeable steps of the task pipeline: | 7 Noticeable steps of the task pipeline: |
| 8 * Save a WPR archive | 8 * Save a WPR archive |
| 9 * Process the WPR archive to make all resources cacheable | 9 * Process the WPR archive to make all resources cacheable |
| 10 * Process cache archive to patch response headers back to their original | 10 * Process cache archive to patch response headers back to their original |
| 11 values. | 11 values. |
| 12 * Find out which resources are discoverable by NoState Prefetch | 12 * Find out which resources are discoverable by NoState Prefetch |
| 13 (HTMLPreloadScanner) | 13 (HTMLPreloadScanner) |
| 14 * Load pages with empty/full/prefetched cache | 14 * Load pages with empty/full/prefetched cache |
| 15 * Extract most important metrics to a CSV | 15 * Extract most important metrics to a CSV |
| 16 """ | 16 """ |
| 17 | 17 |
| 18 import csv | 18 import csv |
| 19 import logging | 19 import logging |
| 20 import json | 20 import json |
| 21 import os | 21 import os |
| 22 import re | 22 import re |
| 23 import shutil | 23 import shutil |
| 24 from urlparse import urlparse | 24 import urlparse |
| 25 | 25 |
| 26 import chrome_cache | 26 import chrome_cache |
| 27 import common_util | 27 import common_util |
| 28 import loading_trace | 28 import loading_trace |
| 29 from prefetch_view import PrefetchSimulationView | 29 from prefetch_view import PrefetchSimulationView |
| 30 from request_dependencies_lens import RequestDependencyLens | 30 from request_dependencies_lens import RequestDependencyLens |
| 31 import sandwich_metrics | 31 import sandwich_metrics |
| 32 import sandwich_runner | 32 import sandwich_runner |
| 33 import sandwich_utils |
| 33 import task_manager | 34 import task_manager |
| 34 import wpr_backend | 35 import wpr_backend |
| 35 | 36 |
| 36 | 37 |
| 37 class Discoverer(object): | 38 class Discoverer(object): |
| 38 # Do not prefetch anything. | 39 # Do not prefetch anything. |
| 39 EmptyCache = 'empty-cache' | 40 EmptyCache = 'empty-cache' |
| 40 | 41 |
| 41 # Prefetches everything to load fully from cache (impossible in practice). | 42 # Prefetches everything to load fully from cache (impossible in practice). |
| 42 FullCache = 'full-cache' | 43 FullCache = 'full-cache' |
| (...skipping 19 matching lines...) Expand all Loading... |
| 62 Discoverer.MainDocument, | 63 Discoverer.MainDocument, |
| 63 Discoverer.Parser, | 64 Discoverer.Parser, |
| 64 Discoverer.HTMLPreloadScanner, | 65 Discoverer.HTMLPreloadScanner, |
| 65 Discoverer.HTMLPreloadScannerStore, | 66 Discoverer.HTMLPreloadScannerStore, |
| 66 ]) | 67 ]) |
| 67 | 68 |
| 68 | 69 |
| 69 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') | 70 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$') |
| 70 | 71 |
| 71 | 72 |
| 73 def _NormalizeUrl(url): |
| 74 """Returns the URL with repeated slashes in its path collapsed into one.""" |
| 75 parsed_url = list(urlparse.urlparse(url)) |
| 76 parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2]) |
| 77 return urlparse.urlunparse(parsed_url) |
| 78 |
| 79 |
| 72 def _PatchWpr(wpr_archive): | 80 def _PatchWpr(wpr_archive): |
| 73 """Patches a WPR archive to get all resources into the HTTP cache and avoid | 81 """Patches a WPR archive to get all resources into the HTTP cache and avoid |
| 74 invalidation and revalidations. | 82 invalidation and revalidations. |
| 75 | 83 |
| 76 Args: | 84 Args: |
| 77 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. | 85 wpr_archive: wpr_backend.WprArchiveBackend WPR archive to patch. |
| 78 """ | 86 """ |
| 79 # Sets the resources cache max-age to 10 years. | 87 # Sets the resources cache max-age to 10 years. |
| 80 MAX_AGE = 10 * 365 * 24 * 60 * 60 | 88 MAX_AGE = 10 * 365 * 24 * 60 * 60 |
| 81 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) | 89 CACHE_CONTROL = 'public, max-age={}'.format(MAX_AGE) |
| (...skipping 112 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 194 logging.info('number of requests discovered by %s: %d', | 202 logging.info('number of requests discovered by %s: %d', |
| 195 subresource_discoverer, len(requests)) | 203 subresource_discoverer, len(requests)) |
| 196 return requests | 204 return requests |
| 197 | 205 |
| 198 | 206 |
| 199 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): | 207 def _PruneOutOriginalNoStoreRequests(original_headers_path, requests): |
| 200 with open(original_headers_path) as file_input: | 208 with open(original_headers_path) as file_input: |
| 201 original_headers = json.load(file_input) | 209 original_headers = json.load(file_input) |
| 202 pruned_requests = set() | 210 pruned_requests = set() |
| 203 for request in requests: | 211 for request in requests: |
| 204 request_original_headers = original_headers[request.url] | 212 url = _NormalizeUrl(request.url) |
| 213 if url not in original_headers: |
| 214 # TODO(gabadie): Investigate why these requests were not in WPR. |
| 215 assert request.failed |
| 216 logging.warning( |
| 217 'could not find original headers for: %s (failure: %s)', |
| 218 url, request.error_text) |
| 219 continue |
| 220 request_original_headers = original_headers[url] |
| 205 if ('cache-control' in request_original_headers and | 221 if ('cache-control' in request_original_headers and |
| 206 'no-store' in request_original_headers['cache-control'].lower()): | 222 'no-store' in request_original_headers['cache-control'].lower()): |
| 207 pruned_requests.add(request) | 223 pruned_requests.add(request) |
| 208 return [r for r in requests if r not in pruned_requests] | 224 return [r for r in requests if r not in pruned_requests] |
| 209 | 225 |
| 210 | 226 |
| 211 def _ExtractDiscoverableUrls( | 227 def _ExtractDiscoverableUrls( |
| 212 original_headers_path, loading_trace_path, subresource_discoverer): | 228 original_headers_path, loading_trace_path, subresource_discoverer): |
| 213 """Extracts discoverable resource urls from a loading trace according to a | 229 """Extracts discoverable resource urls from a loading trace according to a |
| 214 sub-resource discoverer. | 230 sub-resource discoverer. |
| (...skipping 147 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 362 def VerifyWprLog(self, wpr_log_path): | 378 def VerifyWprLog(self, wpr_log_path): |
| 363 """Verifies WPR log with previously verified traces.""" | 379 """Verifies WPR log with previously verified traces.""" |
| 364 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) | 380 all_wpr_requests = wpr_backend.ExtractRequestsFromLog(wpr_log_path) |
| 365 all_wpr_urls = set() | 381 all_wpr_urls = set() |
| 366 unserved_wpr_urls = set() | 382 unserved_wpr_urls = set() |
| 367 wpr_command_colliding_urls = set() | 383 wpr_command_colliding_urls = set() |
| 368 | 384 |
| 369 for request in all_wpr_requests: | 385 for request in all_wpr_requests: |
| 370 if request.is_wpr_host: | 386 if request.is_wpr_host: |
| 371 continue | 387 continue |
| 372 if urlparse(request.url).path.startswith('/web-page-replay'): | 388 if urlparse.urlparse(request.url).path.startswith('/web-page-replay'): |
| 373 wpr_command_colliding_urls.add(request.url) | 389 wpr_command_colliding_urls.add(request.url) |
| 374 elif request.is_served is False: | 390 elif request.is_served is False: |
| 375 unserved_wpr_urls.add(request.url) | 391 unserved_wpr_urls.add(request.url) |
| 376 all_wpr_urls.add(request.url) | 392 all_wpr_urls.add(request.url) |
| 377 | 393 |
| 378 _PrintUrlSetComparison(set(), unserved_wpr_urls, | 394 _PrintUrlSetComparison(set(), unserved_wpr_urls, |
| 379 'Distinct unserved resources from WPR') | 395 'Distinct unserved resources from WPR') |
| 380 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, | 396 _PrintUrlSetComparison(set(), wpr_command_colliding_urls, |
| 381 'Distinct resources colliding to WPR commands') | 397 'Distinct resources colliding to WPR commands') |
| 382 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, | 398 _PrintUrlSetComparison(all_wpr_urls, self._all_sent_url_requests, |
| (...skipping 91 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 474 logging.info('loading trace: %s', trace_path) | 490 logging.info('loading trace: %s', trace_path) |
| 475 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) | 491 trace = loading_trace.LoadingTrace.FromJsonFile(trace_path) |
| 476 | 492 |
| 477 logging.info('verifying trace: %s', trace_path) | 493 logging.info('verifying trace: %s', trace_path) |
| 478 run_output_verifier.VerifyTrace(trace) | 494 run_output_verifier.VerifyTrace(trace) |
| 479 | 495 |
| 480 logging.info('extracting metrics from trace: %s', trace_path) | 496 logging.info('extracting metrics from trace: %s', trace_path) |
| 481 served_from_network_bytes = 0 | 497 served_from_network_bytes = 0 |
| 482 served_from_cache_bytes = 0 | 498 served_from_cache_bytes = 0 |
| 483 urls_hitting_network = set() | 499 urls_hitting_network = set() |
| 500 response_sizes = {} |
| 484 for request in _FilterOutDataAndIncompleteRequests( | 501 for request in _FilterOutDataAndIncompleteRequests( |
| 485 trace.request_track.GetEvents()): | 502 trace.request_track.GetEvents()): |
| 486 # Ignore requests served from the blink's cache. | 503 # Ignore requests served from the blink's cache. |
| 487 if request.served_from_cache: | 504 if request.served_from_cache: |
| 488 continue | 505 continue |
| 489 urls_hitting_network.add(request.url) | 506 urls_hitting_network.add(request.url) |
| 490 if request.from_disk_cache: | 507 if request.from_disk_cache: |
| 491 served_from_cache_bytes += cached_encoded_data_lengths[request.url] | 508 if request.url in cached_encoded_data_lengths: |
| 509 response_size = cached_encoded_data_lengths[request.url] |
| 510 else: |
| 511 # Some fat webpages may overflow the memory cache, and so some |
| 512 # requests might be served from the disk cache a couple of times per |
| 513 # page load. |
| 514 logging.warning('Looks like could be served from memory cache: %s', |
| 515 request.url) |
| 516 response_size = response_sizes[request.url] |
| 517 served_from_cache_bytes += response_size |
| 492 else: | 518 else: |
| 493 served_from_network_bytes += request.GetEncodedDataLength() | 519 response_size = request.GetEncodedDataLength() |
| 520 served_from_network_bytes += response_size |
| 521 response_sizes[request.url] = response_size |
| 494 | 522 |
| 495 # Make sure the served from blink's cache requests have at least one | 523 # Make sure the served from blink's cache requests have at least one |
| 496 # corresponding request that was not served from the blink's cache. | 524 # corresponding request that was not served from the blink's cache. |
| 497 for request in _FilterOutDataAndIncompleteRequests( | 525 for request in _FilterOutDataAndIncompleteRequests( |
| 498 trace.request_track.GetEvents()): | 526 trace.request_track.GetEvents()): |
| 499 assert (request.url in urls_hitting_network or | 527 assert (request.url in urls_hitting_network or |
| 500 not request.served_from_cache) | 528 not request.served_from_cache) |
| 501 | 529 |
| 502 run_metrics = { | 530 run_metrics = { |
| 503 'url': trace.url, | 531 'url': trace.url, |
| (...skipping 63 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 567 dependencies=[self._common_builder.original_wpr_task]) | 595 dependencies=[self._common_builder.original_wpr_task]) |
| 568 def BuildPatchedWpr(): | 596 def BuildPatchedWpr(): |
| 569 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) | 597 common_util.EnsureParentDirectoryExists(BuildPatchedWpr.path) |
| 570 shutil.copyfile( | 598 shutil.copyfile( |
| 571 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) | 599 self._common_builder.original_wpr_task.path, BuildPatchedWpr.path) |
| 572 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) | 600 wpr_archive = wpr_backend.WprArchiveBackend(BuildPatchedWpr.path) |
| 573 | 601 |
| 574 # Save up original response headers. | 602 # Save up original response headers. |
| 575 original_response_headers = {e.url: e.GetResponseHeadersDict() \ | 603 original_response_headers = {e.url: e.GetResponseHeadersDict() \ |
| 576 for e in wpr_archive.ListUrlEntries()} | 604 for e in wpr_archive.ListUrlEntries()} |
| 605 logging.info('save up response headers for %d resources', |
| 606 len(original_response_headers)) |
| 607 if not original_response_headers: |
| 608 # TODO(gabadie): How is it possible to not even have the main resource |
| 609 # in the WPR archive? Example URL can be found in: |
| 610 # http://crbug.com/623966#c5 |
| 611 raise Exception( |
| 612 'Looks like no resources were recorded in WPR during: {}'.format( |
| 613 self._common_builder.original_wpr_task.name)) |
| 577 with open(self._original_headers_path, 'w') as file_output: | 614 with open(self._original_headers_path, 'w') as file_output: |
| 578 json.dump(original_response_headers, file_output) | 615 json.dump(original_response_headers, file_output) |
| 579 | 616 |
| 580 # Patch WPR. | 617 # Patch WPR. |
| 581 _PatchWpr(wpr_archive) | 618 _PatchWpr(wpr_archive) |
| 582 wpr_archive.Persist() | 619 wpr_archive.Persist() |
| 583 | 620 |
| 584 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) | 621 @self.RegisterTask('common/original-cache.zip', [BuildPatchedWpr]) |
| 585 def BuildOriginalCache(): | 622 def BuildOriginalCache(): |
| 586 runner = self._common_builder.CreateSandwichRunner() | 623 runner = self._common_builder.CreateSandwichRunner() |
| (...skipping 111 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 698 run_metrics_list = _ProcessRunOutputDir( | 735 run_metrics_list = _ProcessRunOutputDir( |
| 699 cache_validation_result, benchmark_setup, RunBenchmark.path) | 736 cache_validation_result, benchmark_setup, RunBenchmark.path) |
| 700 with open(ProcessRunOutputDir.path, 'w') as csv_file: | 737 with open(ProcessRunOutputDir.path, 'w') as csv_file: |
| 701 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + | 738 writer = csv.DictWriter(csv_file, fieldnames=(additional_column_names + |
| 702 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) | 739 sandwich_metrics.COMMON_CSV_COLUMN_NAMES)) |
| 703 writer.writeheader() | 740 writer.writeheader() |
| 704 for trace_metrics in run_metrics_list: | 741 for trace_metrics in run_metrics_list: |
| 705 writer.writerow(trace_metrics) | 742 writer.writerow(trace_metrics) |
| 706 | 743 |
| 707 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) | 744 self._common_builder.default_final_tasks.append(ProcessRunOutputDir) |
| OLD | NEW |