Index: tools/android/loading/sandwich_prefetch.py
diff --git a/tools/android/loading/sandwich_prefetch.py b/tools/android/loading/sandwich_prefetch.py
index be8f6098136ac8f08da0cab25a1ef291f43d23c1..0d261b801d2a720585fe803e06c9bc5b0de551c1 100644
--- a/tools/android/loading/sandwich_prefetch.py
+++ b/tools/android/loading/sandwich_prefetch.py
@@ -21,7 +21,7 @@ import json
 import os
 import re
 import shutil
-from urlparse import urlparse
+import urlparse
 
 import chrome_cache
 import common_util
@@ -30,6 +30,7 @@ from prefetch_view import PrefetchSimulationView
 from request_dependencies_lens import RequestDependencyLens
 import sandwich_metrics
 import sandwich_runner
+import sandwich_utils
 import task_manager
 import wpr_backend
 
@@ -69,6 +70,13 @@ SUBRESOURCE_DISCOVERERS = set([
 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$')
 
 
+def _NormalizeUrl(url):
+  """Returns a normalized URL with repeated slashes in the path collapsed."""
+  parsed_url = list(urlparse.urlparse(url))
+  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
+  return urlparse.urlunparse(parsed_url)
+
+
 def _PatchWpr(wpr_archive):
   """Patches a WPR archive to get all resources into the HTTP cache and avoid
   invalidation and revalidations.
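
For illustration, a standalone sketch of the new _NormalizeUrl() (Python 2's
urlparse module, as imported above). Only runs of slashes in the path are
collapsed; the query string and fragment are left untouched:

import re
import urlparse

def _NormalizeUrl(url):
  # Index 2 of the parse result is the path component.
  parsed_url = list(urlparse.urlparse(url))
  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
  return urlparse.urlunparse(parsed_url)

assert _NormalizeUrl('http://a.com//x///y?q=//z') == 'http://a.com/x/y?q=//z'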
@@ -201,7 +209,15 @@ def _PruneOutOriginalNoStoreRequests(original_headers_path, requests):
     original_headers = json.load(file_input)
   pruned_requests = set()
   for request in requests:
-    request_original_headers = original_headers[request.url]
+    url = _NormalizeUrl(request.url)
+    if url not in original_headers:
+      # TODO(gabadie): Investigate why these requests were not in WPR.
+      assert request.failed
+      logging.warning(
+          'could not find original headers for: %s (failure: %s)',
+          url, request.error_text)
+      continue
+    request_original_headers = original_headers[url]
     if ('cache-control' in request_original_headers and
         'no-store' in request_original_headers['cache-control'].lower()):
       pruned_requests.add(request)
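
The no-store test above works on the header dicts saved by
PrefetchBenchmarkBuilder (last hunk of this patch). A minimal sketch of the
lookup, with hypothetical URLs and header values:

original_headers = {
    'http://a.com/': {'cache-control': 'private, NO-STORE'},
    'http://a.com/img.png': {'cache-control': 'max-age=3600'}}

def _IsNoStore(url):
  headers = original_headers.get(url, {})
  # Cache-Control values are case-insensitive, hence the lower().
  return 'no-store' in headers.get('cache-control', '').lower()

assert _IsNoStore('http://a.com/')
assert not _IsNoStore('http://a.com/img.png')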
@@ -369,7 +385,7 @@ class _RunOutputVerifier(object):
     for request in all_wpr_requests:
       if request.is_wpr_host:
         continue
-      if urlparse(request.url).path.startswith('/web-page-replay'):
+      if urlparse.urlparse(request.url).path.startswith('/web-page-replay'):
         wpr_command_colliding_urls.add(request.url)
       elif request.is_served is False:
         unserved_wpr_urls.add(request.url)
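
A quick illustration of the module-qualified call after the import change
(the command URL below is a hypothetical example of WPR's reserved paths):

import urlparse

url = 'http://127.0.0.1:8080/web-page-replay-generate-200'
assert urlparse.urlparse(url).path.startswith('/web-page-replay')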
@@ -481,6 +497,7 @@ def _ProcessRunOutputDir(
     served_from_network_bytes = 0
     served_from_cache_bytes = 0
     urls_hitting_network = set()
+    response_sizes = {}
     for request in _FilterOutDataAndIncompleteRequests(
         trace.request_track.GetEvents()):
       # Ignore requests served from the blink's cache.
@@ -488,9 +505,20 @@
         continue
       urls_hitting_network.add(request.url)
       if request.from_disk_cache:
-        served_from_cache_bytes += cached_encoded_data_lengths[request.url]
+        if request.url in cached_encoded_data_lengths:
+          response_size = cached_encoded_data_lengths[request.url]
+        else:
+          # Some fat web pages may overflow the memory cache, and so some
+          # requests might be served from the disk cache a couple of times
+          # per page load.
+          logging.warning('Looks like this could have been served from the '
+                          'memory cache: %s', request.url)
+          response_size = response_sizes[request.url]
+        served_from_cache_bytes += response_size
       else:
-        served_from_network_bytes += request.GetEncodedDataLength()
+        response_size = request.GetEncodedDataLength()
+        served_from_network_bytes += response_size
+        response_sizes[request.url] = response_size
 
     # Make sure the served from blink's cache requests have at least one
     # corresponding request that was not served from the blink's cache.
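
Condensed, the new accounting logic amounts to the sketch below, with
hypothetical request objects exposing url, from_disk_cache and
GetEncodedDataLength() standing in for the trace events:

def _AccountBytes(requests, cached_encoded_data_lengths):
  served_from_network_bytes = 0
  served_from_cache_bytes = 0
  response_sizes = {}
  for request in requests:
    if request.from_disk_cache:
      if request.url in cached_encoded_data_lengths:
        response_size = cached_encoded_data_lengths[request.url]
      else:
        # Fall back to the size recorded when this URL first hit the network;
        # intentionally raises KeyError if the URL was never fetched at all.
        response_size = response_sizes[request.url]
      served_from_cache_bytes += response_size
    else:
      response_size = request.GetEncodedDataLength()
      served_from_network_bytes += response_size
      response_sizes[request.url] = response_size
  return served_from_network_bytes, served_from_cache_bytes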
@@ -574,6 +602,15 @@ class PrefetchBenchmarkBuilder(task_manager.Builder):
       # Save up original response headers.
       original_response_headers = {e.url: e.GetResponseHeadersDict() \
           for e in wpr_archive.ListUrlEntries()}
+      logging.info('saved original response headers for %d resources',
+                   len(original_response_headers))
+      if not original_response_headers:
+        # TODO(gabadie): How is it possible to not even have the main resource
+        # in the WPR archive? Example URL can be found in:
+        # http://crbug.com/623966#c5
+        raise Exception(
+            'Looks like no resources were recorded in WPR during: {}'.format(
+                self._common_builder.original_wpr_task.name))
       with open(self._original_headers_path, 'w') as file_output:
         json.dump(original_response_headers, file_output)
 
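
The saved file is plain JSON mapping each archived URL to its response-header
dict, which _PruneOutOriginalNoStoreRequests() reloads later. A round-trip
sketch with hypothetical data:

import json

headers = {'http://a.com/': {'cache-control': 'no-store'}}
with open('original_headers.json', 'w') as file_output:
  json.dump(headers, file_output)
with open('original_headers.json') as file_input:
  assert json.load(file_input) == headers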