#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Tool for seeing the real world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In future it might be possible to extend this to other kinds of differences,
# e.g. page load times.

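# Example workflow (assuming content_shell and image_diff have already been
# built in out/Release; see PrintUsage below for the full option list):
#   ./real_world_impact.py before 500
#   (apply the patch under test, rebuild content_shell)
#   ./real_world_impact.py after 500
#   ./real_world_impact.py compare 500
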
from contextlib import closing
from cStringIO import StringIO
import datetime
from distutils.spawn import find_executable
import errno
import multiprocessing
from operator import itemgetter
import os
import re
import subprocess
import sys
import textwrap
import time
from urllib2 import urlopen
from urlparse import urlparse
import webbrowser
from zipfile import ZipFile

from nsfw_urls import nsfw_urls

action = None
allow_js = False
chromium_src_root = ""
chromium_out_dir = ""
output_dir = ""
num_sites = 1000
urls = []
print_lock = multiprocessing.Lock()


def PrintUsage(argv0):
  this_script = os.path.basename(argv0)
  print textwrap.dedent("""\
    USAGE: %s <download|before|after|compare> [num sites (default %d)]
           [--allow-js]

    ACTIONS
      download  Only fetch static copies of the sample sites.
      before    Fetch (if needed) and screenshot the sites, pre-patch.
      after     Screenshot the sites again, post-patch.
      compare   Diff the before/after screenshots and open a report.

    OPTIONS
      --allow-js  Don't strip JavaScript from the downloaded pages (only
                  valid with the before and after actions).

    1. Build content_shell in out/Release, without the controversial patch.
    2. Run: %s before [num sites to test (default %d)]
    3. Apply the controversial patch, and rebuild content_shell in out/Release.
    4. Run: %s after [num sites to test (default %d)]
    5. Run: %s compare [num sites to test (default %d)]
    Output is stored in: %s
    The compare step will open results in your web browser."""
    % (this_script, num_sites, this_script, num_sites, this_script, num_sites,
       this_script, num_sites, output_dir))


def MakeDirsIfNotExist(dir):
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise


def MakeOutDir():
  global chromium_src_root, chromium_out_dir, output_dir
  chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                   os.pardir,
                                                   os.pardir))
  # Find the out directory (it might be out_linux for users of cr).
  for out_suffix in ["_linux", ""]:
    out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
    if os.path.exists(out_dir):
      chromium_out_dir = out_dir
      break
  if not chromium_out_dir:
    return False

  this_script_name = "real_world_impact"
  output_dir = os.path.join(chromium_out_dir,
                            "Release",
                            this_script_name)
  MakeDirsIfNotExist(output_dir)
  return True
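
# The output layout under <out dir>/Release/real_world_impact/ is roughly:
#   data/      downloaded site copies, top-1m.csv, url samples, bad_urls.txt
#   before/    screenshots taken without the patch
#   after/     screenshots taken with the patch
#   diff/      image_diff output images
#   diff.html  the report opened by the compare step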


def CheckPrerequisites():
  if not find_executable("wget"):
    print "Please install wget and re-run this."
    return False
  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")
  if not os.path.exists(image_diff):
    print "Please build the image_diff target and re-run this."
    return False
  return True
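
# Note: image_diff lives in the Chromium tree; with a ninja build, something
# like `ninja -C out/Release image_diff` should produce the binary used here.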


def PickSampleUrls():
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download the Alexa top 1,000,000 sites.
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()  # e.g. set(["jrj.com.cn"])

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if not '/' in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if not url in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if not u in bad_urls]
  return True


def SaveWorkingUrls():
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)


def PrintElapsedTime(elapsed, detail=""):
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print "Took %dm%.1fs" % (m, s), detail


def DownloadStaticCopyTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--quiet",
                           url])
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their
    # subresources yet still produce a renderable index.html.
    pass  # success = False
  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True
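
# Note: with --force-directories, wget recreates each site's directory
# structure under --directory-prefix, so a page lands at
# <host_dir>/<hostname>/index.html; that is the path checked above and later
# rendered by RunDrtTask.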


def DownloadStaticCopies():
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print "Downloading static copies of %d sites..." % len(new_urls)
    start_time = time.time()

    results = multiprocessing.Pool(8).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]

    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls()
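
# Urls that fail to download are appended to data/bad_urls.txt, so subsequent
# runs (via PickSampleUrls above) skip them automatically.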


def RunDrtTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  content_shell = os.path.join(chromium_out_dir, "Release", "content_shell")
  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                          ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)

  result = p.stdout.read()
  # content_shell writes a text dump followed by the raw PNG to stdout; scan
  # for the 8-byte PNG signature and the IEND chunk trailer to extract it.
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time


def RunDrt():
  print "Taking screenshots of %d pages..." % len(urls)
  start_time = time.time()

  # chunksize=1, as individual sites can take very different times to render.
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)

  max_time, url = max(t for t in results if t)
  elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)


def CompareResultsTask(url):
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
  # A 1x1 red GIF, used as a placeholder when a screenshot is missing.
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  # Weight the position-insensitive histogram diff most heavily, and clamp to
  # a small positive value so any real difference sorts above "no difference".
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
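
# The tuples above are (score, url, image path). CompareResults sorts scores
# in descending order, so 200 (a screenshot missing on one side) floats to the
# top, genuine diffs follow, and -100 (missing on both sides) sinks to the
# bottom.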


def CompareResults():
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  html_start = textwrap.dedent("""\
    <!DOCTYPE html>
    <html>
    <head>
    <title>Real World Impact report %s</title>
    <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
    </script>
    <style>
    h1 {
      font-family: sans-serif;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
    </style>
    </head>
    <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
    """ % (now, num_sites, now))

  html_same_row = """\
<h2>No difference on <a href="%s">%s</a>.</h2>
"""

  html_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<img src="%s" width="800" height="600"
 onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""

  html_nsfw_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<details>
  <summary>This site may be NSFW. Click to expand/collapse.</summary>
  <img src="%s" width="800" height="600"
   onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
</details>
"""

  html_end = textwrap.dedent("""\
    </body>
    </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)


def main(argv):
  global num_sites, action, allow_js

  for arg in argv[1:]:
    try:
      num_sites = int(arg)  # A bare integer argument sets the sample size.
    except ValueError:
      if arg == '--allow-js':
        allow_js = True
      elif arg in ['download', 'before', 'after', 'compare']:
        action = arg
      else:
        # Unrecognized argument. Show usage.
        action = None
        break

  # --allow-js only makes sense for actions that render pages.
  if not action or (action in ['download', 'compare'] and allow_js):
    PrintUsage(argv[0])
    return 2

  if not MakeOutDir() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))