OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 # Tool for seeing the real world impact of a patch. | |
7 # | |
8 # Layout Tests can tell you whether something has changed, but this can help | |
9 # you determine whether a subtle/controversial change is beneficial or not. | |
10 # | |
11 # It dumps the rendering of a large number of sites, both with and without a | |
12 # patch being evaluated, then sorts them by greatest difference in rendering, | |
13 # such that a human reviewer can quickly review the most impacted sites, | |
14 # rather than having to manually try sites to see if anything changes. | |
15 # | |
16 # In future it might be possible to extend this to other kinds of differences, | |
17 # e.g. page load times. | |
18 | |
19 from contextlib import closing | |
20 import datetime | |
21 import errno | |
22 from distutils.spawn import find_executable | |
23 from operator import itemgetter | |
24 import multiprocessing | |
25 import os | |
26 import re | |
27 from cStringIO import StringIO | |
28 import subprocess | |
29 import sys | |
30 import textwrap | |
31 import time | |
32 from urllib2 import urlopen | |
33 from urlparse import urlparse | |
34 import webbrowser | |
35 from zipfile import ZipFile | |
36 | |
37 from nsfw_urls import nsfw_urls | |
38 | |
39 action = None | |
40 allow_js = False | |
41 chromium_src_root = "" | |
42 chromium_out_dir = "" | |
43 output_dir = "" | |
44 num_sites = 1000 | |
45 urls = [] | |
46 print_lock = multiprocessing.Lock() | |
47 | |
48 | |
49 def PrintUsage(argv0): | |
50 this_script = os.path.basename(argv0) | |
51 print textwrap.dedent("""\ | |
52 Real World Impact usage: | |
53 1. Build content_shell in out/Release without controversial patch. | |
54 2. Run: %s before [num sites to test (default %d)] | |
55 3. Apply the controversial patch, and rebuild content_shell in out/Release. | |
56 4. Run: %s after [num sites to test (default %d)] | |
57 5. Run: %s compare [num sites to test (default %d)] | |
58 Output is stored in: %s | |
59 The compare step will open results in your web browser.""" | |
60 % (this_script, num_sites, this_script, num_sites, this_script, num_sites, | |
61 output_dir)) | |
62 | |
63 | |
64 def MakeDirsIfNotExist(dir): | |
65 try: | |
66 os.makedirs(dir) | |
67 except OSError as e: | |
68 if e.errno != errno.EEXIST: | |
69 raise | |
70 | |
71 def MakeOutDir(): | |
72 global chromium_src_root, chromium_out_dir, output_dir | |
73 chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__), | |
74 os.pardir, | |
75 os.pardir)) | |
76 # Find out directory (might be out_linux for users of cr). | |
77 for out_suffix in ["_linux", ""]: | |
78 out_dir = os.path.join(chromium_src_root, "out" + out_suffix) | |
79 if os.path.exists(out_dir): | |
80 chromium_out_dir = out_dir | |
81 break | |
82 if not chromium_out_dir: | |
83 return False | |
84 | |
85 this_script_name = "real_world_impact" | |
86 output_dir = os.path.join(chromium_out_dir, | |
87 "Release", | |
88 this_script_name) | |
89 MakeDirsIfNotExist(output_dir) | |
90 return True | |
91 | |
92 | |
93 def CheckPrerequisites(): | |
94 if not find_executable("wget"): | |
95 print "Please install wget and re-run this." | |
96 return False | |
97 image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") | |
98 if not os.path.exists(image_diff): | |
99 print "Please build the image_diff target and re-run this." | |
100 return False | |
101 return True | |
102 | |
103 | |
104 def PickSampleUrls(): | |
105 global urls | |
106 data_dir = os.path.join(output_dir, "data") | |
107 MakeDirsIfNotExist(data_dir) | |
108 | |
109 # Download Alexa top 1,000,000 sites | |
110 # TODO(johnme): Should probably update this when it gets too stale... | |
111 csv_path = os.path.join(data_dir, "top-1m.csv") | |
112 if not os.path.exists(csv_path): | |
113 print "Downloading list of top 1,000,000 sites from Alexa..." | |
114 csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" | |
115 with closing(urlopen(csv_url)) as stream: | |
116 ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir) | |
117 | |
118 bad_urls_path = os.path.join(data_dir, "bad_urls.txt") | |
119 if os.path.exists(bad_urls_path): | |
120 with open(bad_urls_path) as f: | |
121 bad_urls = set(f.read().splitlines()) | |
122 else: | |
123 bad_urls = set() | |
124 | |
125 # See if we've already selected a sample of size num_sites (this way, if you | |
126 # call this script with arguments "before N" then "after N", where N is the | |
127 # same number, we'll use the same sample, as expected!). | |
128 urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites) | |
129 if not os.path.exists(urls_path): | |
130 if action == 'compare': | |
131 print ("Error: you must run 'before %d' and 'after %d' before " | |
132 "running 'compare %d'") % (num_sites, num_sites, num_sites) | |
133 return False | |
134 print "Picking %d sample urls..." % num_sites | |
135 | |
136 # TODO(johnme): For now this just gets the top num_sites entries. In future | |
137 # this should pick a weighted random sample (a sketch follows this | |
138 # function). For example, it could fit a power-law distribution, which is a | |
139 # good model of website popularity (http://www.useit.com/alertbox/9704b.html). | |
140 urls = [] | |
141 remaining_num_sites = num_sites | |
142 with open(csv_path) as f: | |
143 for entry in f: | |
144 if remaining_num_sites <= 0: | |
145 break | |
146 remaining_num_sites -= 1 | |
147 hostname = entry.strip().split(',')[1] | |
148 if '/' not in hostname: # Skip Alexa 1,000,000 entries that have paths. | |
149 url = "http://%s/" % hostname | |
150 if url not in bad_urls: | |
151 urls.append(url) | |
152 # Don't write these to disk yet; we'll do that in SaveWorkingUrls below | |
153 # once we have tried to download them and seen which ones fail. | |
154 else: | |
155 with open(urls_path) as f: | |
156 urls = [u for u in f.read().splitlines() if u not in bad_urls] | |
157 return True | |
158 | |
159 | |
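A minimal sketch of the weighted sampling the TODO in PickSampleUrls describes,
assuming the rank-ordered hostnames are already loaded. The helper name
PickWeightedSample, the 0.8 exponent, and the Efraimidis-Spirakis keying are
illustrative choices, not part of this patch:

    import random

    def PickWeightedSample(hostnames, sample_size, exponent=0.8):
      # Weight the entry at 1-based rank r by 1/r^exponent, a rough power-law
      # model of site popularity (see the useit.com link above), then sample
      # without replacement by keeping the sample_size largest keys
      # u ** (1/weight) for uniform u in [0, 1).
      keyed = []
      for rank, hostname in enumerate(hostnames, 1):
        weight = 1.0 / (rank ** exponent)
        keyed.append((random.random() ** (1.0 / weight), hostname))
      keyed.sort(reverse=True)
      return [hostname for _, hostname in keyed[:sample_size]]

    # E.g.: urls = ["http://%s/" % h
    #               for h in PickWeightedSample(hostnames, num_sites)]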
160 def SaveWorkingUrls(): | |
161 # TODO(johnme): Update the list if a url that used to work goes offline. | |
162 urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites) | |
163 if not os.path.exists(urls_path): | |
164 with open(urls_path, 'w') as f: | |
165 f.writelines(u + '\n' for u in urls) | |
166 | |
167 | |
168 def PrintElapsedTime(elapsed, detail=""): | |
169 elapsed = round(elapsed * 10) / 10.0 | |
170 m = elapsed / 60 | |
171 s = elapsed % 60 | |
172 print "Took %dm%.1fs" % (m, s), detail | |
173 | |
174 | |
175 def DownloadStaticCopyTask(url): | |
176 url_parts = urlparse(url) | |
177 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
178 # Use wget for now, as it does a reasonable job of spidering page | |
179 # dependencies (e.g. CSS, JS, images). | |
180 success = True | |
181 try: | |
182 subprocess.check_call(["wget", | |
183 "--execute", "robots=off", | |
184 ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS " | |
185 "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C" | |
186 "hrome/32.0.1700.14 Safari/537.36"), | |
187 "--page-requisites", | |
188 "--span-hosts", | |
189 "--adjust-extension", | |
190 "--convert-links", | |
191 "--directory-prefix=" + host_dir, | |
192 "--force-directories", | |
193 "--default-page=index.html", | |
194 "--no-check-certificate", | |
195 "--timeout 20", # 20s timeout | |
skobes
2014/03/22 06:32:54
I'd probably use a shorter timeout, like 5s.
| |
196 "--tries 2", # 20s timeout | |
197 "--quiet", | |
198 url]) | |
199 except KeyboardInterrupt: | |
200 success = False | |
201 except subprocess.CalledProcessError: | |
202 # Ignoring these for now, as some sites have issues with their subresources | |
203 # yet still produce a renderable index.html | |
204 pass #success = False | |
205 if success: | |
206 download_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
207 if not os.path.exists(download_path): | |
208 success = False | |
209 else: | |
210 with print_lock: | |
211 print "Downloaded:", url | |
212 if not success: | |
213 with print_lock: | |
214 print "Failed to download:", url | |
215 return False | |
216 return True | |
217 | |
218 | |
219 def DownloadStaticCopies(): | |
220 global urls | |
221 new_urls = [] | |
222 for url in urls: | |
223 url_parts = urlparse(url) | |
224 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
225 download_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
226 if not os.path.exists(download_path): | |
227 new_urls.append(url) | |
228 | |
229 if new_urls: | |
230 print "Downloading static copies of %d sites..." % len(new_urls) | |
231 start_time = time.time() | |
232 | |
233 results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls) | |
234 failed_urls = [new_urls[i] for i,ret in enumerate(results) if not ret] | |
235 if failed_urls: | |
236 bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt") | |
237 with open(bad_urls_path, 'a') as f: | |
238 f.writelines(u + '\n' for u in failed_urls) | |
239 failed_urls_set = set(failed_urls) | |
240 urls = [u for u in urls if u not in failed_urls_set] | |
241 | |
242 PrintElapsedTime(time.time() - start_time) | |
243 | |
244 SaveWorkingUrls() | |
245 | |
246 | |
247 def RunDrtTask(url): | |
248 url_parts = urlparse(url) | |
249 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
250 html_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
251 | |
252 if not allow_js: | |
253 nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html") | |
254 if not os.path.exists(nojs_path): | |
255 with open(html_path) as f: | |
256 html = f.read() | |
257 if not html: | |
258 return False | |
259 # These aren't intended to be XSS safe :) (a worked example follows below) | |
260 block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)' | |
261 r'\b.*?<\s*\/\s*\1\s*>') | |
262 block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)' | |
263 html = re.sub(block_tags, '', html, flags=re.I|re.S) | |
264 html = re.sub(block_attrs, '', html, flags=re.I) | |
265 with open(nojs_path, 'w') as f: | |
266 f.write(html) | |
267 html_path = nojs_path | |
268 | |
269 if sys.platform == 'darwin': | |
270 content_shell = os.path.join(chromium_out_dir, "Release", | |
271 "Content Shell.app/Contents/MacOS/Content Shell") | |
272 elif sys.platform.startswith('linux'): | |
273 content_shell = os.path.join(chromium_out_dir, "Release", | |
274 "content_shell") | |
275 elif sys.platform.startswith('win'): | |
276 content_shell = os.path.join(chromium_out_dir, "Release", | |
277 "content_shell.exe") | |
278 start_time = time.time() | |
279 | |
280 with open(os.devnull, "w") as fnull: | |
281 p = subprocess.Popen([content_shell, | |
282 "--dump-render-tree", | |
283 # The single quote is not a typo, it's a separator! | |
284 html_path + "'--pixel-test" | |
285 ], | |
286 shell=False, | |
287 stdout=subprocess.PIPE, | |
288 stderr=fnull) | |
289 result = p.stdout.read() | |
290 PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A" | |
291 PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82" | |
292 try: | |
293 start = result.index(PNG_START) | |
294 end = result.rindex(PNG_END) + 8 | |
295 except ValueError: | |
296 return False | |
297 | |
298 png_path = os.path.join(output_dir, action, url_parts.hostname + ".png") | |
299 MakeDirsIfNotExist(os.path.dirname(png_path)) | |
300 with open(png_path, 'wb') as f: | |
301 f.write(result[start:end]) | |
302 elapsed_time = (time.time() - start_time, url) | |
303 return elapsed_time | |
304 | |
305 | |
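A stand-alone check of the script- and handler-stripping regexes in RunDrtTask
above; the sample HTML is invented for illustration:

    import re

    block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                  r'\b.*?<\s*\/\s*\1\s*>')
    block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'

    sample = '<body onload="init()"><script>alert(1)</script><p>hi</p></body>'
    stripped = re.sub(block_tags, '', sample, flags=re.I | re.S)
    stripped = re.sub(block_attrs, '', stripped, flags=re.I)
    print stripped  # <body><p>hi</p></body>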
306 def RunDrt(): | |
307 print "Taking screenshots of %d pages..." % len(urls) | |
308 start_time = time.time() | |
309 | |
310 results = multiprocessing.Pool().map(RunDrtTask, urls, 1) | |
311 | |
312 max_time, url = max(t for t in results if t) | |
313 elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url) | |
314 PrintElapsedTime(time.time() - start_time, elapsed_detail) | |
315 | |
316 | |
317 def CompareResultsTask(url): | |
318 url_parts = urlparse(url) | |
319 before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png") | |
320 after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png") | |
321 diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png") | |
322 MakeDirsIfNotExist(os.path.join(output_dir, "diff")) | |
323 | |
324 # TODO(johnme): Don't hardcode "real_world_impact". | |
325 red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA" | |
326 "ABAAEAAAICRAEAOw==") | |
327 | |
328 before_exists = os.path.exists(before_path) | |
329 after_exists = os.path.exists(after_path) | |
330 if not before_exists and not after_exists: | |
331 # TODO(johnme): Make this more informative. | |
332 return (-100, url, red_path) | |
333 if before_exists != after_exists: | |
334 # TODO(johnme): Make this more informative. | |
335 return (200, url, red_path) | |
336 | |
337 image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") | |
338 | |
339 # Get percentage difference (a worked example follows this function). | |
340 p = subprocess.Popen([image_diff, "--histogram", | |
341 before_path, after_path], | |
342 shell=False, | |
343 stdout=subprocess.PIPE) | |
344 output,_ = p.communicate() | |
345 if p.returncode == 0: | |
346 return (0, url, before_path) | |
347 diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n' | |
348 r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output) | |
349 if not diff_match: | |
350 raise Exception("image_diff output format changed") | |
351 histogram_diff = float(diff_match.group(1)) | |
352 exact_diff = float(diff_match.group(2)) | |
353 combined_diff = max(histogram_diff + exact_diff / 8, 0.001) | |
354 | |
355 # Produce diff PNG. | |
356 subprocess.call([image_diff, "--diff", before_path, after_path, diff_path]) | |
357 return (combined_diff, url, diff_path) | |
358 | |
359 | |
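A worked instance of the parsing and ranking in CompareResultsTask above; the
image_diff output here is invented to match the format the regex expects:

    import re

    sample_output = ("histogram diff: 3.20% failed\n"
                     "exact diff: 12.80% failed")
    m = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                 r'exact diff: (\d+\.\d{2})% (?:passed|failed)', sample_output)
    histogram_diff = float(m.group(1))  # 3.20
    exact_diff = float(m.group(2))      # 12.80
    # Histogram diff dominates the ranking; exact diff is down-weighted by 8,
    # and the 0.001 floor keeps any changed page above the zero-diff bucket.
    combined_diff = max(histogram_diff + exact_diff / 8, 0.001)  # 4.8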
360 def CompareResults(): | |
361 print "Running image_diff on %d pages..." % len(urls) | |
362 start_time = time.time() | |
363 | |
364 results = multiprocessing.Pool().map(CompareResultsTask, urls) | |
365 results.sort(key=itemgetter(0), reverse=True) | |
366 | |
367 PrintElapsedTime(time.time() - start_time) | |
368 | |
369 now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M") | |
370 html_start = textwrap.dedent("""\ | |
371 <!DOCTYPE html> | |
372 <html> | |
373 <head> | |
374 <title>Real World Impact report %s</title> | |
375 <script> | |
376 var togglingImg = null; | |
377 var toggleTimer = null; | |
378 | |
379 var before = true; | |
380 function toggle() { | |
381 var newFolder = before ? "before" : "after"; | |
382 togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder); | |
383 before = !before; | |
384 toggleTimer = setTimeout(toggle, 300); | |
385 } | |
386 | |
387 function startToggle(img) { | |
388 before = true; | |
389 togglingImg = img; | |
390 if (!img.origSrc) | |
391 img.origSrc = img.src; | |
392 toggle(); | |
393 } | |
394 function stopToggle(img) { | |
395 clearTimeout(toggleTimer); | |
396 img.src = img.origSrc; | |
397 } | |
398 | |
399 document.onkeydown = function(e) { | |
400 e = e || window.event; | |
401 var keyCode = e.keyCode || e.which; | |
402 var newFolder; | |
403 switch (keyCode) { | |
404 case 49: //'1' | |
405 newFolder = "before"; break; | |
406 case 50: //'2' | |
407 newFolder = "after"; break; | |
408 case 51: //'3' | |
409 newFolder = "diff"; break; | |
410 default: | |
411 return; | |
412 } | |
413 var imgs = document.getElementsByTagName("img"); | |
414 for (var i = 0; i < imgs.length; i++) { | |
415 imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder); | |
416 } | |
417 }; | |
418 </script> | |
419 <style> | |
420 h1 { | |
421 font-family: sans-serif; | |
422 } | |
423 h2 { | |
424 font-family: monospace; | |
425 white-space: pre; | |
426 } | |
427 .nsfw-spacer { | |
428 height: 50vh; | |
429 } | |
430 .nsfw-warning { | |
431 background: yellow; | |
432 border: 10px solid red; | |
433 } | |
434 .info { | |
435 font-size: 1.2em; | |
436 font-style: italic; | |
437 } | |
438 body:not(.details-supported) details { | |
439 display: none; | |
440 } | |
441 </style> | |
442 </head> | |
443 <body> | |
444 <script> | |
445 if ('open' in document.createElement('details')) | |
446 document.body.className = "details-supported"; | |
447 </script> | |
448 <!--<div class="nsfw-spacer"></div>--> | |
449 <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d | |
450 and may be NSFW.</p> | |
451 <!--<div class="nsfw-spacer"></div>--> | |
452 <h1>Real World Impact report %s</h1> | |
453 <p class="info">Press 1, 2 and 3 to switch between before, after and diff | |
454 screenshots respectively; or hover over the images to rapidly alternate | |
455 between before and after.</p> | |
456 """ % (now, num_sites, now)) | |
457 | |
458 html_same_row = """\ | |
459 <h2>No difference on <a href="%s">%s</a>.</h2> | |
460 """ | |
461 | |
462 html_diff_row = """\ | |
463 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> | |
464 <img src="%s" width="800" height="600" | |
465 onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> | |
466 """ | |
467 | |
468 html_nsfw_diff_row = """\ | |
469 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> | |
470 <details> | |
471 <summary>This site may be NSFW. Click to expand/collapse.</summary> | |
472 <img src="%s" width="800" height="600" | |
473 onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> | |
474 </details> | |
475 """ | |
476 | |
477 html_end = textwrap.dedent("""\ | |
478 </body> | |
479 </html>""") | |
480 | |
481 html_path = os.path.join(output_dir, "diff.html") | |
482 with open(html_path, 'w') as f: | |
483 f.write(html_start) | |
484 for (diff_float, url, diff_path) in results: | |
485 diff_path = os.path.relpath(diff_path, output_dir) | |
486 if diff_float == 0: | |
487 f.write(html_same_row % (url, url)) | |
488 elif url in nsfw_urls: | |
489 f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path)) | |
490 else: | |
491 f.write(html_diff_row % (diff_float, url, url, diff_path)) | |
492 f.write(html_end) | |
493 | |
494 webbrowser.open_new_tab("file://" + html_path) | |
495 | |
496 | |
497 def main(argv): | |
498 global num_sites, action, allow_js | |
499 | |
500 for arg in argv[1:]: | |
501 try: | |
502 num_sites = int(arg) | |
503 except ValueError: | |
504 if arg == '--allow-js': | |
505 allow_js = True | |
506 elif arg in ['download', 'before', 'after', 'compare']: | |
skobes
2014/03/22 06:32:54
Usage text doesn't mention 'download'...
| |
507 action = arg | |
508 else: | |
509 # Unrecognized argument. Show usage. | |
510 action = None | |
511 break | |
512 | |
513 if not action or (action in ['download', 'compare'] and allow_js): | |
514 PrintUsage(argv[0]) | |
515 return 2 | |
516 | |
517 if not MakeOutDir() or not CheckPrerequisites() or not PickSampleUrls(): | |
518 return 1 | |
519 | |
520 if action == 'compare': | |
521 CompareResults() | |
522 else: | |
523 DownloadStaticCopies() | |
524 if action != 'download': | |
525 RunDrt() | |
526 return 0 | |
527 | |
528 | |
529 if __name__ == '__main__': | |
530 sys.exit(main(sys.argv)) | |