#!/usr/bin/env python
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Tool for seeing the real-world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In future it might be possible to extend this to other kinds of differences,
# e.g. page load times.
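#
# Typical workflow (this mirrors the --help epilog printed by main() below):
#   ./real_world_impact.py before     # screenshot sites with the stock build
#   <apply patch, rebuild content_shell>
#   ./real_world_impact.py after      # screenshot sites with the patched build
#   ./real_world_impact.py compare    # rank and report the differences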

import argparse
from argparse import RawTextHelpFormatter
from contextlib import closing
import datetime
import errno
from distutils.spawn import find_executable
from operator import itemgetter
import multiprocessing
import os
import re
from cStringIO import StringIO
import subprocess
import sys
import textwrap
import time
from urllib2 import urlopen
from urlparse import urlparse
import webbrowser
from zipfile import ZipFile

from nsfw_urls import nsfw_urls

action = None
allow_js = False
additional_content_shell_flags = ""
chromium_src_root = ""
chromium_out_dir = ""
image_diff = ""
content_shell = ""
output_dir = ""
num_sites = 100
urls = []
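# Serializes console output from the parallel worker processes below.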
print_lock = multiprocessing.Lock()


def MakeDirsIfNotExist(dir):
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise


def SetupPathsAndOut():
  global chromium_src_root, chromium_out_dir, output_dir
  global image_diff, content_shell
  chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                   os.pardir,
                                                   os.pardir))
  # Find out directory (might be out_linux for users of cr).
  for out_suffix in ["_linux", ""]:
    out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
    if os.path.exists(out_dir):
      chromium_out_dir = out_dir
      break
  if not chromium_out_dir:
    return False

  this_script_name = "real_world_impact"
  output_dir = os.path.join(chromium_out_dir,
                            "Release",
                            this_script_name)
  MakeDirsIfNotExist(output_dir)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  if sys.platform == 'darwin':
    content_shell = os.path.join(chromium_out_dir, "Release",
        "Content Shell.app/Contents/MacOS/Content Shell")
  elif sys.platform.startswith('linux'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                                 "content_shell")
  elif sys.platform.startswith('win'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                                 "content_shell.exe")
  return True


def CheckPrerequisites():
  if not find_executable("wget"):
    print "wget not found! Install wget and re-run this."
    return False
  if not os.path.exists(image_diff):
    print "image_diff not found (%s)!" % image_diff
    print "Build the image_diff target and re-run this."
    return False
  if not os.path.exists(content_shell):
    print "Content shell not found (%s)!" % content_shell
    print "Build Release/content_shell and re-run this."
    return False
  return True


def PickSampleUrls():
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download Alexa top 1,000,000 sites
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if '/' not in hostname: # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if url not in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if u not in bad_urls]
  return True


def SaveWorkingUrls():
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)


def PrintElapsedTime(elapsed, detail=""):
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print "Took %dm%.1fs" % (m, s), detail


def DownloadStaticCopyTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
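    # Key wget flags: --page-requisites fetches the page's subresources,
    # --span-hosts lets those come from other hosts (e.g. CDNs), and
    # --convert-links rewrites references so the local copy renders offline.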
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--no-check-certificate",
                           "--timeout=5",  # 5s timeout
                           "--tries=2",
                           "--quiet",
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html
    pass  # success = False
  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print "Downloaded:", url
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True


def DownloadStaticCopies():
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print "Downloading static copies of %d sites..." % len(new_urls)
    start_time = time.time()

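    # 20 parallel fetches; wget is network-bound rather than CPU-bound, so
    # this deliberately exceeds the usual one-worker-per-core rule of thumb.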
    results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]

    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls()


def RunDrtTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
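    # Strip scripts and other dynamic content from the page so that the
    # snapshot doesn't depend on JS execution (which could differ run to run).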
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          additional_content_shell_flags,
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                          ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
  result = p.stdout.read()
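  # content_shell interleaves the text dump and the pixel dump on stdout, so
  # carve out the PNG between its 8-byte file signature and the IEND chunk
  # type + CRC that form the last 8 bytes of every well-formed PNG.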
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time


def RunDrt():
  print "Taking screenshots of %d pages..." % len(urls)
  start_time = time.time()

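  # chunksize=1 so that one slow page doesn't hold up a batch of other pages
  # queued behind it on the same worker.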
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)

  max_time, url = max(t for t in results if t)
  elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)


def CompareResultsTask(url):
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
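  # Tiny all-red data URI image, used as a placeholder when a before/after
  # screenshot is missing.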
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
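  # image_diff exits with status 0 only when the two images are identical.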
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
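  # Weight exact per-pixel differences at 1/8 of histogram differences, and
  # floor at 0.001 so that every differing page sorts above the identical
  # ones (which returned 0 above).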
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)


def CompareResults():
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  html_start = textwrap.dedent("""\
    <!DOCTYPE html>
    <html>
    <head>
    <title>Real World Impact report %s</title>
    <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
    </script>
    <style>
    h1 {
      font-family: sans-serif;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
    </style>
    </head>
    <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
    """ % (now, num_sites, now))

  html_same_row = """\
<h2>No difference on <a href="%s">%s</a>.</h2>
"""

  html_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<img src="%s" width="800" height="600"
    onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""

  html_nsfw_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<details>
  <summary>This site may be NSFW. Click to expand/collapse.</summary>
  <img src="%s" width="800" height="600"
      onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
</details>
"""

  html_end = textwrap.dedent("""\
    </body>
    </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)


def main(argv):
  global num_sites, action, allow_js, additional_content_shell_flags

  parser = argparse.ArgumentParser(
      formatter_class=RawTextHelpFormatter,
      description="Compare the real-world impact of a content shell change.",
      epilog=textwrap.dedent("""\
        Example usage:
          1. Build content_shell in out/Release without any changes.
          2. Run: %s before [num sites to test (default %d)].
          3. Either:
            a. Apply your controversial patch and rebuild content_shell.
            b. Pass --additional_flags="--enable_your_flag" in step 4.
          4. Run: %s after [num sites to test (default %d)].
          5. Run: %s compare [num sites to test (default %d)].
            This will open the results in your web browser.
        """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites)))
  parser.add_argument("--allow_js", help="Don't disable JavaScript",
                      action="store_true")
  parser.add_argument("--additional_flags",
                      help="Additional flags to pass to content shell")
  parser.add_argument("action",
                      help=textwrap.dedent("""\
                        Action to perform.
                        download - Just download the sites.
                        before - Run content shell and record 'before' result.
                        after - Run content shell and record 'after' result.
                        compare - Compare before and after results.
                        """),
                      choices=["download", "before", "after", "compare"])
  parser.add_argument("num_sites",
                      help="Number of sites (default %s)" % num_sites,
                      type=int, default=num_sites, nargs='?')
  args = parser.parse_args()

  action = args.action

  if args.num_sites:
    num_sites = args.num_sites

  if args.allow_js:
    allow_js = args.allow_js

  if args.additional_flags:
    additional_content_shell_flags = args.additional_flags

  if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))