Index: tools/android/loading/sandwich_prefetch.py
diff --git a/tools/android/loading/sandwich_prefetch.py b/tools/android/loading/sandwich_prefetch.py
index be8f6098136ac8f08da0cab25a1ef291f43d23c1..0d261b801d2a720585fe803e06c9bc5b0de551c1 100644
--- a/tools/android/loading/sandwich_prefetch.py
+++ b/tools/android/loading/sandwich_prefetch.py
@@ -21,7 +21,7 @@ import json
 import os
 import re
 import shutil
-from urlparse import urlparse
+import urlparse
 
 import chrome_cache
 import common_util
@@ -30,6 +30,7 @@ from prefetch_view import PrefetchSimulationView
 from request_dependencies_lens import RequestDependencyLens
 import sandwich_metrics
 import sandwich_runner
+import sandwich_utils
 import task_manager
 import wpr_backend
 
@@ -69,6 +70,13 @@ SUBRESOURCE_DISCOVERERS = set([
 _UPLOAD_DATA_STREAM_REQUESTS_REGEX = re.compile(r'^\d+/(?P<url>.*)$')
 
 
+def _NormalizeUrl(url):
+  """Returns a normalized URL with repeated slashes in the path collapsed."""
+  parsed_url = list(urlparse.urlparse(url))
+  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
+  return urlparse.urlunparse(parsed_url)
+
+
 def _PatchWpr(wpr_archive):
   """Patches a WPR archive to get all resources into the HTTP cache and avoid
   invalidation and revalidations.
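
For illustration, a standalone sketch of the new _NormalizeUrl() (Python 2's
urlparse module, as imported above). Only runs of slashes in the path are
collapsed; the query string and fragment are left untouched:

import re
import urlparse

def _NormalizeUrl(url):
  # Index 2 of the parse result is the path component.
  parsed_url = list(urlparse.urlparse(url))
  parsed_url[2] = re.sub(r'/{2,}', r'/', parsed_url[2])
  return urlparse.urlunparse(parsed_url)

assert _NormalizeUrl('http://a.com//x///y?q=//z') == 'http://a.com/x/y?q=//z'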
@@ -201,7 +209,15 @@ def _PruneOutOriginalNoStoreRequests(original_headers_path, requests):
     original_headers = json.load(file_input)
   pruned_requests = set()
   for request in requests:
-    request_original_headers = original_headers[request.url]
+    url = _NormalizeUrl(request.url)
+    if url not in original_headers:
+      # TODO(gabadie): Investigate why these requests were not in WPR.
+      assert request.failed
+      logging.warning(
+          'could not find original headers for: %s (failure: %s)',
+          url, request.error_text)
+      continue
+    request_original_headers = original_headers[url]
     if ('cache-control' in request_original_headers and
         'no-store' in request_original_headers['cache-control'].lower()):
       pruned_requests.add(request)
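
The no-store test above works on the header dicts saved by
PrefetchBenchmarkBuilder (last hunk of this patch). A minimal sketch of the
lookup, with hypothetical URLs and header values:

original_headers = {
    'http://a.com/': {'cache-control': 'private, NO-STORE'},
    'http://a.com/img.png': {'cache-control': 'max-age=3600'}}

def _IsNoStore(url):
  headers = original_headers.get(url, {})
  # Cache-Control values are case-insensitive, hence the lower().
  return 'no-store' in headers.get('cache-control', '').lower()

assert _IsNoStore('http://a.com/')
assert not _IsNoStore('http://a.com/img.png')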
@@ -369,7 +385,7 @@ class _RunOutputVerifier(object):
     for request in all_wpr_requests:
       if request.is_wpr_host:
         continue
-      if urlparse(request.url).path.startswith('/web-page-replay'):
+      if urlparse.urlparse(request.url).path.startswith('/web-page-replay'):
         wpr_command_colliding_urls.add(request.url)
       elif request.is_served is False:
         unserved_wpr_urls.add(request.url)
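
A quick illustration of the module-qualified call after the import change
(the command URL below is a hypothetical example of WPR's reserved paths):

import urlparse

url = 'http://127.0.0.1:8080/web-page-replay-generate-200'
assert urlparse.urlparse(url).path.startswith('/web-page-replay')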
@@ -481,6 +497,7 @@ def _ProcessRunOutputDir(
     served_from_network_bytes = 0
     served_from_cache_bytes = 0
     urls_hitting_network = set()
+    response_sizes = {}
     for request in _FilterOutDataAndIncompleteRequests(
         trace.request_track.GetEvents()):
       # Ignore requests served from the blink's cache.
@@ -488,9 +505,20 @@
         continue
       urls_hitting_network.add(request.url)
       if request.from_disk_cache:
-        served_from_cache_bytes += cached_encoded_data_lengths[request.url]
+        if request.url in cached_encoded_data_lengths:
+          response_size = cached_encoded_data_lengths[request.url]
+        else:
+          # Some fat web pages may overflow the memory cache, and so some
+          # requests might be served from the disk cache a couple of times
+          # per page load.
+          logging.warning('Looks like this could have been served from the '
+                          'memory cache: %s', request.url)
+          response_size = response_sizes[request.url]
+        served_from_cache_bytes += response_size
       else:
-        served_from_network_bytes += request.GetEncodedDataLength()
+        response_size = request.GetEncodedDataLength()
+        served_from_network_bytes += response_size
+        response_sizes[request.url] = response_size
 
     # Make sure the served from blink's cache requests have at least one
     # corresponding request that was not served from the blink's cache.
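
Condensed, the new accounting logic amounts to the sketch below, with
hypothetical request objects exposing url, from_disk_cache and
GetEncodedDataLength() standing in for the trace events:

def _AccountBytes(requests, cached_encoded_data_lengths):
  served_from_network_bytes = 0
  served_from_cache_bytes = 0
  response_sizes = {}
  for request in requests:
    if request.from_disk_cache:
      if request.url in cached_encoded_data_lengths:
        response_size = cached_encoded_data_lengths[request.url]
      else:
        # Fall back to the size recorded when this URL first hit the network;
        # intentionally raises KeyError if the URL was never fetched at all.
        response_size = response_sizes[request.url]
      served_from_cache_bytes += response_size
    else:
      response_size = request.GetEncodedDataLength()
      served_from_network_bytes += response_size
      response_sizes[request.url] = response_size
  return served_from_network_bytes, served_from_cache_bytes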
@@ -574,6 +602,15 @@ class PrefetchBenchmarkBuilder(task_manager.Builder):
       # Save up original response headers.
       original_response_headers = {e.url: e.GetResponseHeadersDict() \
           for e in wpr_archive.ListUrlEntries()}
+      logging.info('saved original response headers for %d resources',
+                   len(original_response_headers))
+      if not original_response_headers:
+        # TODO(gabadie): How is it possible to not even have the main resource
+        # in the WPR archive? Example URL can be found in:
+        # http://crbug.com/623966#c5
+        raise Exception(
+            'Looks like no resources were recorded in WPR during: {}'.format(
+                self._common_builder.original_wpr_task.name))
       with open(self._original_headers_path, 'w') as file_output:
         json.dump(original_response_headers, file_output)
 
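
The saved file is plain JSON mapping each archived URL to its response-header
dict, which _PruneOutOriginalNoStoreRequests() reloads later. A round-trip
sketch with hypothetical data:

import json

headers = {'http://a.com/': {'cache-control': 'no-store'}}
with open('original_headers.json', 'w') as file_output:
  json.dump(headers, file_output)
with open('original_headers.json') as file_input:
  assert json.load(file_input) == headers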