Index: tools/real_world_impact/real_world_impact.py |
diff --git a/tools/real_world_impact/real_world_impact.py b/tools/real_world_impact/real_world_impact.py |
new file mode 100755 |
index 0000000000000000000000000000000000000000..cbfad3bd3b18eff0ed76bef7bcd0d1ff7b333427 |
--- /dev/null |
+++ b/tools/real_world_impact/real_world_impact.py |
@@ -0,0 +1,554 @@ |
+#!/usr/bin/env python |
+# Copyright (c) 2014 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
+# Tool for seeing the real world impact of a patch. |
+# |
+# Layout Tests can tell you whether something has changed, but this can help |
+# you determine whether a subtle/controversial change is beneficial or not. |
+# |
+# It dumps the rendering of a large number of sites, both with and without a |
+# patch being evaluated, then sorts them by greatest difference in rendering, |
+# such that a human reviewer can quickly review the most impacted sites, |
+# rather than having to manually try sites to see if anything changes. |
+# |
+# In future it might be possible to extend this to other kinds of differences, |
+# e.g. page load times. |
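+# |
+# Typical workflow (mirroring the --help epilog below): |
+#   tools/real_world_impact/real_world_impact.py before 100 |
+#   # ...apply the patch under review, rebuild content_shell... |
+#   tools/real_world_impact/real_world_impact.py after 100 |
+#   tools/real_world_impact/real_world_impact.py compare 100 |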
+ |
+import argparse |
+from argparse import RawTextHelpFormatter |
+from contextlib import closing |
+import datetime |
+import errno |
+from distutils.spawn import find_executable |
+from operator import itemgetter |
+import multiprocessing |
+import os |
+import re |
+from cStringIO import StringIO |
+import subprocess |
+import sys |
+import textwrap |
+import time |
+from urllib2 import urlopen |
+from urlparse import urlparse |
+import webbrowser |
+from zipfile import ZipFile |
+ |
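+# Known-NSFW URLs from the sample; the report collapses their screenshots |
+# behind a <details> element (see CompareResults). |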
+from nsfw_urls import nsfw_urls |
+ |
+action = None |
+allow_js = False |
+additional_content_shell_flags = "" |
+chromium_src_root = "" |
+chromium_out_dir = "" |
+image_diff = "" |
+content_shell = "" |
+output_dir = "" |
+num_sites = 100 |
+urls = [] |
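+# Serializes print output from multiprocessing worker processes. |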
+print_lock = multiprocessing.Lock() |
+ |
+ |
+def MakeDirsIfNotExist(path): |
+  try: |
+    os.makedirs(path) |
+ except OSError as e: |
+ if e.errno != errno.EEXIST: |
+ raise |
+ |
+ |
+def SetupPathsAndOut(): |
+ global chromium_src_root, chromium_out_dir, output_dir |
+ global image_diff, content_shell |
+ chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__), |
+ os.pardir, |
+ os.pardir)) |
+  # Find the out directory (it might be out_linux for users of cr). |
+ for out_suffix in ["_linux", ""]: |
+ out_dir = os.path.join(chromium_src_root, "out" + out_suffix) |
+ if os.path.exists(out_dir): |
+ chromium_out_dir = out_dir |
+ break |
+ if not chromium_out_dir: |
+ return False |
+ |
+ this_script_name = "real_world_impact" |
+ output_dir = os.path.join(chromium_out_dir, |
+ "Release", |
+ this_script_name) |
+ MakeDirsIfNotExist(output_dir) |
+ |
+ image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") |
+ |
+ if sys.platform == 'darwin': |
+ content_shell = os.path.join(chromium_out_dir, "Release", |
+ "Content Shell.app/Contents/MacOS/Content Shell") |
+ elif sys.platform.startswith('linux'): |
+ content_shell = os.path.join(chromium_out_dir, "Release", |
+ "content_shell") |
+ elif sys.platform.startswith('win'): |
+ content_shell = os.path.join(chromium_out_dir, "Release", |
+ "content_shell.exe") |
+ return True |
+ |
+ |
+def CheckPrerequisites(): |
+ if not find_executable("wget"): |
+ print "wget not found! Install wget and re-run this." |
+ return False |
+ if not os.path.exists(image_diff): |
+ print "image_diff not found (%s)!" % image_diff |
+ print "Build the image_diff target and re-run this." |
+ return False |
+ if not os.path.exists(content_shell): |
+ print "Content shell not found (%s)!" % content_shell |
+ print "Build Release/content_shell and re-run this." |
+ return False |
+ return True |
+ |
+ |
+def PickSampleUrls(): |
+ global urls |
+ data_dir = os.path.join(output_dir, "data") |
+ MakeDirsIfNotExist(data_dir) |
+ |
+ # Download Alexa top 1,000,000 sites |
+ # TODO(johnme): Should probably update this when it gets too stale... |
+ csv_path = os.path.join(data_dir, "top-1m.csv") |
+ if not os.path.exists(csv_path): |
+ print "Downloading list of top 1,000,000 sites from Alexa..." |
+ csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" |
+ with closing(urlopen(csv_url)) as stream: |
+ ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir) |
+ |
+ bad_urls_path = os.path.join(data_dir, "bad_urls.txt") |
+ if os.path.exists(bad_urls_path): |
+ with open(bad_urls_path) as f: |
+ bad_urls = set(f.read().splitlines()) |
+ else: |
+ bad_urls = set() |
+ |
+ # See if we've already selected a sample of size num_sites (this way, if you |
+ # call this script with arguments "before N" then "after N", where N is the |
+ # same number, we'll use the same sample, as expected!). |
+ urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites) |
+ if not os.path.exists(urls_path): |
+ if action == 'compare': |
+ print ("Error: you must run 'before %d' and 'after %d' before " |
+ "running 'compare %d'") % (num_sites, num_sites, num_sites) |
+ return False |
+ print "Picking %d sample urls..." % num_sites |
+ |
+ # TODO(johnme): For now this just gets the top num_sites entries. In future |
+ # this should pick a weighted random sample. For example, it could fit a |
+ # power-law distribution, which is a good model of website popularity |
+ # (http://www.useit.com/alertbox/9704b.html). |
+ urls = [] |
+ remaining_num_sites = num_sites |
+ with open(csv_path) as f: |
+ for entry in f: |
+ if remaining_num_sites <= 0: |
+ break |
+ remaining_num_sites -= 1 |
+ hostname = entry.strip().split(',')[1] |
+        # Skip Alexa 1,000,000 entries that have paths. |
+        if '/' not in hostname: |
+          url = "http://%s/" % hostname |
+          if url not in bad_urls: |
+ urls.append(url) |
+ # Don't write these to disk yet; we'll do that in SaveWorkingUrls below |
+ # once we have tried to download them and seen which ones fail. |
+ else: |
+ with open(urls_path) as f: |
+      urls = [u for u in f.read().splitlines() if u not in bad_urls] |
+ return True |
+ |
+ |
+def SaveWorkingUrls(): |
+ # TODO(johnme): Update the list if a url that used to work goes offline. |
+ urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites) |
+ if not os.path.exists(urls_path): |
+ with open(urls_path, 'w') as f: |
+ f.writelines(u + '\n' for u in urls) |
+ |
+ |
+def PrintElapsedTime(elapsed, detail=""): |
+ elapsed = round(elapsed * 10) / 10.0 |
+ m = elapsed / 60 |
+ s = elapsed % 60 |
+ print "Took %dm%.1fs" % (m, s), detail |
+ |
+ |
+def DownloadStaticCopyTask(url): |
+ url_parts = urlparse(url) |
+ host_dir = os.path.join(output_dir, "data", url_parts.hostname) |
+  # Use wget for now, as it does a reasonable job of spidering page |
+  # dependencies (e.g. CSS, JS, images). |
+ success = True |
+ try: |
+ subprocess.check_call(["wget", |
+ "--execute", "robots=off", |
+ ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS " |
+ "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C" |
+ "hrome/32.0.1700.14 Safari/537.36"), |
+ "--page-requisites", |
+ "--span-hosts", |
+ "--adjust-extension", |
+ "--convert-links", |
+ "--directory-prefix=" + host_dir, |
+ "--force-directories", |
+ "--default-page=index.html", |
+ "--no-check-certificate", |
+ "--timeout=5", # 5s timeout |
+ "--tries=2", |
+ "--quiet", |
+ url]) |
+ except KeyboardInterrupt: |
+ success = False |
+ except subprocess.CalledProcessError: |
+ # Ignoring these for now, as some sites have issues with their subresources |
+ # yet still produce a renderable index.html |
+    pass  # success = False |
+ if success: |
+ download_path = os.path.join(host_dir, url_parts.hostname, "index.html") |
+ if not os.path.exists(download_path): |
+ success = False |
+ else: |
+ with print_lock: |
+ print "Downloaded:", url |
+ if not success: |
+ with print_lock: |
+ print "Failed to download:", url |
+ return False |
+ return True |
+ |
+ |
+def DownloadStaticCopies(): |
+ global urls |
+ new_urls = [] |
+ for url in urls: |
+ url_parts = urlparse(url) |
+ host_dir = os.path.join(output_dir, "data", url_parts.hostname) |
+ download_path = os.path.join(host_dir, url_parts.hostname, "index.html") |
+ if not os.path.exists(download_path): |
+ new_urls.append(url) |
+ |
+ if new_urls: |
+ print "Downloading static copies of %d sites..." % len(new_urls) |
+ start_time = time.time() |
+ |
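+    # Fetch up to 20 sites in parallel; per-site timeouts are handled by wget. |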
+ results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls) |
+    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret] |
+ if failed_urls: |
+ bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt") |
+ with open(bad_urls_path, 'a') as f: |
+ f.writelines(u + '\n' for u in failed_urls) |
+ failed_urls_set = set(failed_urls) |
+ urls = [u for u in urls if u not in failed_urls_set] |
+ |
+ PrintElapsedTime(time.time() - start_time) |
+ |
+ SaveWorkingUrls() |
+ |
+ |
+def RunDrtTask(url): |
+ url_parts = urlparse(url) |
+ host_dir = os.path.join(output_dir, "data", url_parts.hostname) |
+ html_path = os.path.join(host_dir, url_parts.hostname, "index.html") |
+ |
+ if not allow_js: |
+ nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html") |
+ if not os.path.exists(nojs_path): |
+ with open(html_path) as f: |
+ html = f.read() |
+ if not html: |
+ return False |
+ # These aren't intended to be XSS safe :) |
+ block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)' |
+ r'\b.*?<\s*\/\s*\1\s*>') |
+      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)' |
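+      # Strip <script>/<object>/media/frame elements and inline |
+      # onload/onerror handlers to approximate the page without JavaScript. |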
+ html = re.sub(block_tags, '', html, flags=re.I|re.S) |
+ html = re.sub(block_attrs, '', html, flags=re.I) |
+ with open(nojs_path, 'w') as f: |
+ f.write(html) |
+ html_path = nojs_path |
+ |
+ start_time = time.time() |
+ |
+ with open(os.devnull, "w") as fnull: |
+ p = subprocess.Popen([content_shell, |
+ "--dump-render-tree", |
+ additional_content_shell_flags, |
+ # The single quote is not a typo, it's a separator! |
+ html_path + "'--pixel-test" |
+ ], |
+ shell=False, |
+ stdout=subprocess.PIPE, |
+ stderr=fnull) |
+ result = p.stdout.read() |
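+  # The pixel dump is a PNG embedded in content_shell's stdout; locate it by |
+  # the 8-byte PNG signature and the trailing IEND chunk marker plus CRC. |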
+ PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A" |
+ PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82" |
+ try: |
+ start = result.index(PNG_START) |
+ end = result.rindex(PNG_END) + 8 |
+ except ValueError: |
+ return False |
+ |
+ png_path = os.path.join(output_dir, action, url_parts.hostname + ".png") |
+ MakeDirsIfNotExist(os.path.dirname(png_path)) |
+ with open(png_path, 'wb') as f: |
+ f.write(result[start:end]) |
+ elapsed_time = (time.time() - start_time, url) |
+ return elapsed_time |
+ |
+ |
+def RunDrt(): |
+ print "Taking screenshots of %d pages..." % len(urls) |
+ start_time = time.time() |
+ |
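+  # chunksize=1, so one slow page doesn't hold up a whole batch of URLs. |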
+ results = multiprocessing.Pool().map(RunDrtTask, urls, 1) |
+ |
+ max_time, url = max(t for t in results if t) |
+ elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url) |
+ PrintElapsedTime(time.time() - start_time, elapsed_detail) |
+ |
+ |
+def CompareResultsTask(url): |
+ url_parts = urlparse(url) |
+ before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png") |
+ after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png") |
+ diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png") |
+ MakeDirsIfNotExist(os.path.join(output_dir, "diff")) |
+ |
+ # TODO(johnme): Don't hardcode "real_world_impact". |
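+  # A 1x1 red GIF data URI, shown when a screenshot is missing. |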
+ red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA" |
+ "ABAAEAAAICRAEAOw==") |
+ |
+ before_exists = os.path.exists(before_path) |
+ after_exists = os.path.exists(after_path) |
+ if not before_exists and not after_exists: |
+ # TODO(johnme): Make this more informative. |
+ return (-100, url, red_path) |
+ if before_exists != after_exists: |
+ # TODO(johnme): Make this more informative. |
+ return (200, url, red_path) |
+ |
+ # Get percentage difference. |
+ p = subprocess.Popen([image_diff, "--histogram", |
+ before_path, after_path], |
+ shell=False, |
+ stdout=subprocess.PIPE) |
+  output, _ = p.communicate() |
+ if p.returncode == 0: |
+ return (0, url, before_path) |
+  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n' |
+                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output) |
+ if not diff_match: |
+ raise Exception("image_diff output format changed") |
+ histogram_diff = float(diff_match.group(1)) |
+ exact_diff = float(diff_match.group(2)) |
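+  # Blend the metrics (exact diff down-weighted 8x) and clamp to a small |
+  # positive value so any detected difference sorts above identical pages. |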
+ combined_diff = max(histogram_diff + exact_diff / 8, 0.001) |
+ |
+ # Produce diff PNG. |
+ subprocess.call([image_diff, "--diff", before_path, after_path, diff_path]) |
+ return (combined_diff, url, diff_path) |
+ |
+ |
+def CompareResults(): |
+ print "Running image_diff on %d pages..." % len(urls) |
+ start_time = time.time() |
+ |
+ results = multiprocessing.Pool().map(CompareResultsTask, urls) |
+ results.sort(key=itemgetter(0), reverse=True) |
+ |
+ PrintElapsedTime(time.time() - start_time) |
+ |
+ now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M") |
+ html_start = textwrap.dedent("""\ |
+ <!DOCTYPE html> |
+ <html> |
+ <head> |
+ <title>Real World Impact report %s</title> |
+ <script> |
+ var togglingImg = null; |
+ var toggleTimer = null; |
+ |
+ var before = true; |
+ function toggle() { |
+ var newFolder = before ? "before" : "after"; |
+ togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder); |
+ before = !before; |
+ toggleTimer = setTimeout(toggle, 300); |
+ } |
+ |
+ function startToggle(img) { |
+ before = true; |
+ togglingImg = img; |
+ if (!img.origSrc) |
+ img.origSrc = img.src; |
+ toggle(); |
+ } |
+ function stopToggle(img) { |
+ clearTimeout(toggleTimer); |
+ img.src = img.origSrc; |
+ } |
+ |
+ document.onkeydown = function(e) { |
+ e = e || window.event; |
+ var keyCode = e.keyCode || e.which; |
+ var newFolder; |
+ switch (keyCode) { |
+ case 49: //'1' |
+ newFolder = "before"; break; |
+ case 50: //'2' |
+ newFolder = "after"; break; |
+ case 51: //'3' |
+ newFolder = "diff"; break; |
+ default: |
+ return; |
+ } |
+ var imgs = document.getElementsByTagName("img"); |
+ for (var i = 0; i < imgs.length; i++) { |
+ imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder); |
+ } |
+ }; |
+ </script> |
+ <style> |
+ h1 { |
+          font-family: sans-serif; |
+ } |
+ h2 { |
+ font-family: monospace; |
+ white-space: pre; |
+ } |
+ .nsfw-spacer { |
+ height: 50vh; |
+ } |
+ .nsfw-warning { |
+ background: yellow; |
+ border: 10px solid red; |
+ } |
+ .info { |
+ font-size: 1.2em; |
+ font-style: italic; |
+ } |
+ body:not(.details-supported) details { |
+ display: none; |
+ } |
+ </style> |
+ </head> |
+ <body> |
+ <script> |
+ if ('open' in document.createElement('details')) |
+ document.body.className = "details-supported"; |
+ </script> |
+ <!--<div class="nsfw-spacer"></div>--> |
+ <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d |
+ and may be NSFW.</p> |
+ <!--<div class="nsfw-spacer"></div>--> |
+ <h1>Real World Impact report %s</h1> |
+ <p class="info">Press 1, 2 and 3 to switch between before, after and diff |
+ screenshots respectively; or hover over the images to rapidly alternate |
+ between before and after.</p> |
+ """ % (now, num_sites, now)) |
+ |
+ html_same_row = """\ |
+ <h2>No difference on <a href="%s">%s</a>.</h2> |
+ """ |
+ |
+ html_diff_row = """\ |
+ <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> |
+ <img src="%s" width="800" height="600" |
+ onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> |
+ """ |
+ |
+ html_nsfw_diff_row = """\ |
+ <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> |
+ <details> |
+ <summary>This site may be NSFW. Click to expand/collapse.</summary> |
+ <img src="%s" width="800" height="600" |
+ onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> |
+ </details> |
+ """ |
+ |
+ html_end = textwrap.dedent("""\ |
+ </body> |
+ </html>""") |
+ |
+ html_path = os.path.join(output_dir, "diff.html") |
+ with open(html_path, 'w') as f: |
+ f.write(html_start) |
+ for (diff_float, url, diff_path) in results: |
+ diff_path = os.path.relpath(diff_path, output_dir) |
+ if diff_float == 0: |
+ f.write(html_same_row % (url, url)) |
+ elif url in nsfw_urls: |
+ f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path)) |
+ else: |
+ f.write(html_diff_row % (diff_float, url, url, diff_path)) |
+ f.write(html_end) |
+ |
+ webbrowser.open_new_tab("file://" + html_path) |
+ |
+ |
+def main(argv): |
+ global num_sites, action, allow_js, additional_content_shell_flags |
+ |
+ parser = argparse.ArgumentParser( |
+ formatter_class=RawTextHelpFormatter, |
+ description="Compare the real world impact of a content shell change.", |
+ epilog=textwrap.dedent("""\ |
+ Example usage: |
+ 1. Build content_shell in out/Release without any changes. |
+ 2. Run: %s before [num sites to test (default %d)]. |
+ 3. Either: |
+ a. Apply your controversial patch and rebuild content_shell. |
+ b. Pass --additional_flags="--enable_your_flag" in step 4. |
+ 4. Run: %s after [num sites to test (default %d)]. |
+ 5. Run: %s compare [num sites to test (default %d)]. |
+ This will open the results in your web browser. |
+ """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites))) |
+ parser.add_argument("--allow_js", help="Don't disable Javascript", |
+ action="store_true") |
+ parser.add_argument("--additional_flags", |
+ help="Additional flags to pass to content shell") |
+ parser.add_argument("action", |
+ help=textwrap.dedent("""\ |
+ Action to perform. |
+ download - Just download the sites. |
+ before - Run content shell and record 'before' result. |
+ after - Run content shell and record 'after' result. |
+ compare - Compare before and after results. |
+ """), |
+ choices=["download", "before", "after", "compare"]) |
+ parser.add_argument("num_sites", |
+ help="Number of sites (default %s)" % num_sites, |
+ type=int, default=num_sites, nargs='?') |
+ args = parser.parse_args() |
+ |
+ action = args.action |
+ |
+  if args.num_sites: |
+    num_sites = args.num_sites |
+ |
+  if args.allow_js: |
+    allow_js = args.allow_js |
+ |
+  if args.additional_flags: |
+    additional_content_shell_flags = args.additional_flags |
+ |
+ if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls(): |
+ return 1 |
+ |
+ if action == 'compare': |
+ CompareResults() |
+ else: |
+ DownloadStaticCopies() |
+ if action != 'download': |
+ RunDrt() |
+ return 0 |
+ |
+ |
+if __name__ == '__main__': |
+ sys.exit(main(sys.argv)) |