#!/usr/bin/env python
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Tool for seeing the real-world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In future it might be possible to extend this to other kinds of differences,
# e.g. page load times.
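#
# Typical workflow (this mirrors the --help epilog printed by main() below):
#   ./real_world_impact.py before     # screenshot sites with the stock build
#   <apply patch, rebuild content_shell>
#   ./real_world_impact.py after      # screenshot sites with the patched build
#   ./real_world_impact.py compare    # rank and report the differences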

import argparse
from argparse import RawTextHelpFormatter
from contextlib import closing
import datetime
import errno
from distutils.spawn import find_executable
from operator import itemgetter
import multiprocessing
import os
import re
from cStringIO import StringIO
import subprocess
import sys
import textwrap
import time
from urllib2 import urlopen
from urlparse import urlparse
import webbrowser
from zipfile import ZipFile

from nsfw_urls import nsfw_urls

action = None
allow_js = False
additional_content_shell_flags = ""
chromium_src_root = ""
chromium_out_dir = ""
image_diff = ""
content_shell = ""
output_dir = ""
num_sites = 100
urls = []
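# Serializes console output from the parallel worker processes below.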
print_lock = multiprocessing.Lock()


def MakeDirsIfNotExist(dir):
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise


def SetupPathsAndOut():
  global chromium_src_root, chromium_out_dir, output_dir
  global image_diff, content_shell
  chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                   os.pardir,
                                                   os.pardir))
  # Find out directory (might be out_linux for users of cr).
  for out_suffix in ["_linux", ""]:
    out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
    if os.path.exists(out_dir):
      chromium_out_dir = out_dir
      break
  if not chromium_out_dir:
    return False

  this_script_name = "real_world_impact"
  output_dir = os.path.join(chromium_out_dir,
                            "Release",
                            this_script_name)
  MakeDirsIfNotExist(output_dir)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  if sys.platform == 'darwin':
    content_shell = os.path.join(chromium_out_dir, "Release",
        "Content Shell.app/Contents/MacOS/Content Shell")
  elif sys.platform.startswith('linux'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                                 "content_shell")
  elif sys.platform.startswith('win'):
    content_shell = os.path.join(chromium_out_dir, "Release",
                                 "content_shell.exe")
  return True


def CheckPrerequisites():
  if not find_executable("wget"):
    print "wget not found! Install wget and re-run this."
    return False
  if not os.path.exists(image_diff):
    print "image_diff not found (%s)!" % image_diff
    print "Build the image_diff target and re-run this."
    return False
  if not os.path.exists(content_shell):
    print "Content shell not found (%s)!" % content_shell
    print "Build Release/content_shell and re-run this."
    return False
  return True


def PickSampleUrls():
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download Alexa top 1,000,000 sites
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if '/' not in hostname: # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if url not in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if u not in bad_urls]
  return True


def SaveWorkingUrls():
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)


def PrintElapsedTime(elapsed, detail=""):
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print "Took %dm%.1fs" % (m, s), detail


def DownloadStaticCopyTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
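    # Key wget flags: --page-requisites fetches the page's subresources,
    # --span-hosts lets those come from other hosts (e.g. CDNs), and
    # --convert-links rewrites references so the local copy renders offline.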
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--no-check-certificate",
                           "--timeout=5",  # 5s timeout
                           "--tries=2",
                           "--quiet",
                           url])
  except KeyboardInterrupt:
    success = False
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html
    pass  # success = False
  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
    else:
      with print_lock:
        print "Downloaded:", url
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True


def DownloadStaticCopies():
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print "Downloading static copies of %d sites..." % len(new_urls)
    start_time = time.time()

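    # 20 parallel fetches; wget is network-bound rather than CPU-bound, so
    # this deliberately exceeds the usual one-worker-per-core rule of thumb.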
    results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]

    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls()


def RunDrtTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
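    # Strip scripts and other dynamic content from the page so that the
    # snapshot doesn't depend on JS execution (which could differ run to run).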
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          additional_content_shell_flags,
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                          ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)
  result = p.stdout.read()
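  # content_shell interleaves the text dump and the pixel dump on stdout, so
  # carve out the PNG between its 8-byte file signature and the IEND chunk
  # type + CRC that form the last 8 bytes of every well-formed PNG.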
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time


def RunDrt():
  print "Taking screenshots of %d pages..." % len(urls)
  start_time = time.time()

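  # chunksize=1 so that one slow page doesn't hold up a batch of other pages
  # queued behind it on the same worker.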
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)

  max_time, url = max(t for t in results if t)
  elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)


def CompareResultsTask(url):
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
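  # Tiny all-red data URI image, used as a placeholder when a before/after
  # screenshot is missing.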
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
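  # image_diff exits with status 0 only when the two images are identical.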
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
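  # Weight exact per-pixel differences at 1/8 of histogram differences, and
  # floor at 0.001 so that every differing page sorts above the identical
  # ones (which returned 0 above).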
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)


def CompareResults():
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  html_start = textwrap.dedent("""\
    <!DOCTYPE html>
    <html>
    <head>
    <title>Real World Impact report %s</title>
    <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
    </script>
    <style>
    h1 {
      font-family: sans-serif;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
    </style>
    </head>
    <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
    """ % (now, num_sites, now))

  html_same_row = """\
<h2>No difference on <a href="%s">%s</a>.</h2>
"""

  html_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<img src="%s" width="800" height="600"
    onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""

  html_nsfw_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<details>
  <summary>This site may be NSFW. Click to expand/collapse.</summary>
  <img src="%s" width="800" height="600"
      onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
</details>
"""

  html_end = textwrap.dedent("""\
    </body>
    </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)


def main(argv):
  global num_sites, action, allow_js, additional_content_shell_flags

  parser = argparse.ArgumentParser(
      formatter_class=RawTextHelpFormatter,
      description="Compare the real-world impact of a content shell change.",
      epilog=textwrap.dedent("""\
        Example usage:
          1. Build content_shell in out/Release without any changes.
          2. Run: %s before [num sites to test (default %d)].
          3. Either:
            a. Apply your controversial patch and rebuild content_shell.
            b. Pass --additional_flags="--enable_your_flag" in step 4.
          4. Run: %s after [num sites to test (default %d)].
          5. Run: %s compare [num sites to test (default %d)].
            This will open the results in your web browser.
        """ % (argv[0], num_sites, argv[0], num_sites, argv[0], num_sites)))
  parser.add_argument("--allow_js", help="Don't disable JavaScript",
                      action="store_true")
  parser.add_argument("--additional_flags",
                      help="Additional flags to pass to content shell")
  parser.add_argument("action",
                      help=textwrap.dedent("""\
                        Action to perform.
                        download - Just download the sites.
                        before - Run content shell and record 'before' result.
                        after - Run content shell and record 'after' result.
                        compare - Compare before and after results.
                        """),
                      choices=["download", "before", "after", "compare"])
  parser.add_argument("num_sites",
                      help="Number of sites (default %s)" % num_sites,
                      type=int, default=num_sites, nargs='?')
  args = parser.parse_args()

  action = args.action

  if args.num_sites:
    num_sites = args.num_sites

  if args.allow_js:
    allow_js = args.allow_js

  if args.additional_flags:
    additional_content_shell_flags = args.additional_flags

  if not SetupPathsAndOut() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))