Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 #!/usr/bin/env python | |
| 2 # Copyright (c) 2012 The Chromium Authors. All rights reserved. | |
| 3 # Use of this source code is governed by a BSD-style license that can be | |
| 4 # found in the LICENSE file. | |
| 5 | |
| 6 # Tool for seeing the real world impact of a patch. | |
| 7 # | |
| 8 # Layout Tests can tell you whether something has changed, but this can help | |
| 9 # you determine whether a subtle/controversial change is beneficial or not. | |
| 10 # | |
| 11 # It dumps the rendering of a large number of sites, both with and without a | |
| 12 # patch being evaluated, then sorts them by greatest difference in rendering, | |
| 13 # such that a human reviewer can quickly review the most impacted sites, | |
| 14 # rather than having to manually try sites to see if anything changes. | |
| 15 # | |
| 16 # In future it might be possible to extend this to other kinds of differences, | |
| 17 # e.g. page load times. | |
| 18 | |
| 19 from contextlib import closing | |
| 20 import datetime | |
| 21 import errno | |
| 22 from distutils.spawn import find_executable | |
| 23 from operator import itemgetter | |
| 24 import multiprocessing | |
| 25 import os | |
| 26 import re | |
| 27 from cStringIO import StringIO | |
| 28 import subprocess | |
| 29 import sys | |
| 30 import textwrap | |
| 31 import time | |
| 32 from urllib2 import urlopen | |
| 33 from urlparse import urlparse | |
| 34 import webbrowser | |
| 35 from zipfile import ZipFile | |
| 36 | |
| 37 from nsfw_urls import nsfw_urls | |
| 38 | |
# Mutable module state, filled in by main() / MakeOutDir() and read by the
# worker functions below.
action = None  # one of 'download', 'before', 'after', 'compare' (see main)
allow_js = False  # leave JavaScript enabled in saved pages (--allow-js)
chromium_src_root = ""  # absolute path to the Chromium src/ directory
chromium_out_dir = ""  # src/out, or src/out_linux for users of cr
output_dir = ""  # <out>/Release/real_world_impact; all results go here
num_sites = 1000  # how many of the Alexa top sites to sample
urls = []  # the sampled urls under test
print_lock = multiprocessing.Lock()  # serializes prints from pool workers
| 47 | |
| 48 | |
| 49 def PrintUsage(argv0): | |
| 50 this_script = os.path.basename(argv0) | |
| 51 print textwrap.dedent("""\ | |
| 52 USAGE | |
|
pdr.
2014/01/06 23:27:33
Please list the command line options here
| |
| 53 1. Build content_shell in out/Release, without the controversial patch. | |
| 54 2. Run: %s before [num sites to test (default %d)] | |
| 55 3. Apply the controversial patch, and rebuild content_shell in out/Release. | |
| 56 4. Run: %s after [num sites to test (default %d)] | |
| 57 5. Run: %s compare [num sites to test (default %d)] | |
| 58 Output is stored in: %s | |
| 59 The compare step will open results in your web browser.""" | |
| 60 % (this_script, num_sites, this_script, num_sites, this_script, num_sites, | |
| 61 output_dir)) | |
| 62 | |
| 63 | |
def MakeDirsIfNotExist(dir):
  """Creates directory |dir| (and any missing parents).

  Unlike os.makedirs, it is not an error for |dir| to already exist; any
  other OSError is re-raised.
  """
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno == errno.EEXIST:
      return  # Already there; nothing to do.
    raise
| 70 | |
def MakeOutDir():
  """Locates the Chromium out directory and creates the results directory.

  Sets the chromium_src_root, chromium_out_dir and output_dir globals.
  Returns True on success, False if no out directory was found.
  """
  global chromium_src_root, chromium_out_dir, output_dir
  script_dir = os.path.dirname(__file__)
  chromium_src_root = os.path.abspath(
      os.path.join(script_dir, os.pardir, os.pardir))
  # Find out directory (might be out_linux for users of cr).
  for suffix in ("_linux", ""):
    candidate = os.path.join(chromium_src_root, "out" + suffix)
    if os.path.exists(candidate):
      chromium_out_dir = candidate
      break
  if not chromium_out_dir:
    return False

  output_dir = os.path.join(chromium_out_dir, "Release", "real_world_impact")
  MakeDirsIfNotExist(output_dir)
  return True
| 91 | |
| 92 | |
| 93 def CheckPrerequisites(): | |
| 94 if not find_executable("wget"): | |
| 95 print "Please install wget and re-run this." | |
| 96 return False | |
| 97 image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") | |
| 98 if not os.path.exists(image_diff): | |
| 99 print "Please build the image_diff target and re-run this." | |
| 100 return False | |
| 101 return True | |
| 102 | |
| 103 | |
def PickSampleUrls():
  """Chooses the sample of sites to test and stores it in the |urls| global.

  Downloads the Alexa top-1,000,000 list on first use, filters out urls that
  previously failed to download (data/bad_urls.txt), and re-uses an already
  saved sample of the same size when one exists so that 'before N' and
  'after N' runs test identical sites.

  Returns True on success, False if 'compare' is run before 'before'/'after'.
  """
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download Alexa top 1,000,000 sites
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  # Urls recorded as failing in earlier runs are excluded from any sample.
  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set() #["jrj.com.cn"])

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:  # Each line of the csv is "rank,hostname".
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if not '/' in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if not url in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if not u in bad_urls]
  return True
| 158 | |
| 159 | |
def SaveWorkingUrls():
  """Persists the current |urls| sample so later runs re-use it.

  Does nothing if a sample of this size was already saved.
  """
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if os.path.exists(urls_path):
    return
  with open(urls_path, 'w') as f:
    for url in urls:
      f.write(url + '\n')
| 166 | |
| 167 | |
| 168 def PrintElapsedTime(elapsed, detail=""): | |
| 169 elapsed = round(elapsed * 10) / 10.0 | |
| 170 m = elapsed / 60 | |
| 171 s = elapsed % 60 | |
| 172 print "Took %dm%.1fs" % (m, s), detail | |
| 173 | |
| 174 | |
def DownloadStaticCopyTask(url):
  """Downloads a static copy of |url| and its subresources using wget.

  Pool-worker task. Returns True if an index.html was produced under
  <output_dir>/data/<hostname>/<hostname>/, False otherwise.
  """
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as does a reasonable job of spidering page dependencies
  # (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           # Spoof a desktop Chrome user agent, since some
                           # sites serve different content to unknown agents.
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--quiet",
                           url])
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their subresources
    # yet still produce a renderable index.html
    pass #success = False
  if success:
    # wget nests output under the hostname again; verify the page landed.
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
  if not success:
    with print_lock:  # Avoid interleaved output from concurrent workers.
      print "Failed to download:", url
    return False
  return True
| 209 | |
| 210 | |
| 211 def DownloadStaticCopies(): | |
| 212 global urls | |
| 213 new_urls = [] | |
| 214 for url in urls: | |
| 215 url_parts = urlparse(url) | |
| 216 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
| 217 download_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
| 218 if not os.path.exists(download_path): | |
| 219 new_urls.append(url) | |
| 220 | |
| 221 if new_urls: | |
| 222 print "Downloading static copies of %d sites..." % len(new_urls) | |
| 223 start_time = time.time() | |
| 224 | |
| 225 results = multiprocessing.Pool(8).map(DownloadStaticCopyTask, new_urls) | |
| 226 failed_urls = [new_urls[i] for i,ret in enumerate(results) if not ret] | |
| 227 if failed_urls: | |
| 228 bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt") | |
| 229 with open(bad_urls_path, 'a') as f: | |
| 230 f.writelines(u + '\n' for u in failed_urls) | |
| 231 failed_urls_set = set(failed_urls) | |
| 232 urls = [u for u in urls if u not in failed_urls_set] | |
| 233 | |
| 234 PrintElapsedTime(time.time() - start_time) | |
| 235 | |
| 236 SaveWorkingUrls() | |
| 237 | |
| 238 | |
def RunDrtTask(url):
  """Renders the saved local copy of |url| to a PNG using content_shell.

  Pool-worker task. Writes <output_dir>/<action>/<hostname>.png and returns
  a (render_seconds, url) tuple, or False if the page was empty or produced
  no complete PNG.
  """
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
    # Strip scripting and embedded media so renderings are deterministic and
    # differences reflect only the patch under test. Cached as index-nojs.html.
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  content_shell = os.path.join(chromium_out_dir, "Release", "content_shell")
  start_time = time.time()

  # Discard content_shell's stderr chatter; only stdout (the dump) matters.
  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                          ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)

  # The PNG is embedded in the text dump; locate it by its signature bytes.
  result = p.stdout.read()
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8  # +8 keeps the full IEND chunk.
  except ValueError:
    # No complete PNG in the output; the page failed to render.
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time
| 289 | |
| 290 | |
| 291 def RunDrt(): | |
| 292 print "Taking screenshots of %d pages..." % len(urls) | |
| 293 start_time = time.time() | |
| 294 | |
| 295 results = multiprocessing.Pool().map(RunDrtTask, urls, 1) | |
| 296 | |
| 297 max_time, url = max(t for t in results if t) | |
| 298 elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url) | |
| 299 PrintElapsedTime(time.time() - start_time, elapsed_detail) | |
| 300 | |
| 301 | |
def CompareResultsTask(url):
  """Diffs the before/after screenshots of |url| with image_diff.

  Pool-worker task. Returns a (diff_amount, url, image_path) tuple:
  0 with the before screenshot when the renderings are identical, a combined
  histogram/exact percentage with the diff image when they differ, or a
  sentinel (-100 / 200) with a 1x1 red data URI when screenshots are missing.
  """
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # Placeholder shown when a screenshot is missing: a 1x1 red GIF data URI.
  # TODO(johnme): Don't hardcode "real_world_impact".
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # Rendered in only one of the two runs; sort it to the very top.
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output,_ = p.communicate()
  if p.returncode == 0:
    # Identical renderings; link straight to the before screenshot.
    return (0, url, before_path)
  # Both halves of the pattern are raw strings now; the second previously
  # lacked the r prefix and only worked because '\d' is an unrecognized
  # escape that Python leaves intact (a warning in modern Pythons).
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  # Weight the histogram diff most heavily, but never report exactly 0 for a
  # page that did differ (so it sorts above the truly-identical pages).
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
| 343 | |
| 344 | |
def CompareResults():
  """Diffs all before/after screenshots and opens an HTML report.

  Pages are sorted by decreasing rendering difference so a reviewer sees the
  most-impacted sites first; the report is written to <output_dir>/diff.html
  and opened in the default web browser.
  """
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  # Largest difference first.
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  # Report header: JS to flip images between before/after/diff (hover to
  # alternate; keys 1/2/3 switch all images at once), CSS, and an NSFW
  # warning since the urls come straight from the Alexa ranking.
  html_start = textwrap.dedent("""\
      <!DOCTYPE html>
      <html>
      <head>
      <title>Real World Impact report %s</title>
      <script>
      var togglingImg = null;
      var toggleTimer = null;

      var before = true;
      function toggle() {
        var newFolder = before ? "before" : "after";
        togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
        before = !before;
        toggleTimer = setTimeout(toggle, 300);
      }

      function startToggle(img) {
        before = true;
        togglingImg = img;
        if (!img.origSrc)
          img.origSrc = img.src;
        toggle();
      }
      function stopToggle(img) {
        clearTimeout(toggleTimer);
        img.src = img.origSrc;
      }

      document.onkeydown = function(e) {
        e = e || window.event;
        var keyCode = e.keyCode || e.which;
        var newFolder;
        switch (keyCode) {
          case 49: //'1'
            newFolder = "before"; break;
          case 50: //'2'
            newFolder = "after"; break;
          case 51: //'3'
            newFolder = "diff"; break;
          default:
            return;
        }
        var imgs = document.getElementsByTagName("img");
        for (var i = 0; i < imgs.length; i++) {
          imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
        }
      };
      </script>
      <style>
      h1 {
        font-family: sans;
      }
      h2 {
        font-family: monospace;
        white-space: pre;
      }
      .nsfw-spacer {
        height: 50vh;
      }
      .nsfw-warning {
        background: yellow;
        border: 10px solid red;
      }
      .info {
        font-size: 1.2em;
        font-style: italic;
      }
      body:not(.details-supported) details {
        display: none;
      }
      </style>
      </head>
      <body>
      <script>
      if ('open' in document.createElement('details'))
        document.body.className = "details-supported";
      </script>
      <!--<div class="nsfw-spacer"></div>-->
      <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
      and may be NSFW.</p>
      <!--<div class="nsfw-spacer"></div>-->
      <h1>Real World Impact report %s</h1>
      <p class="info">Press 1, 2 and 3 to switch between before, after and diff
      screenshots respectively; or hover over the images to rapidly alternate
      between before and after.</p>
      """ % (now, num_sites, now))

  # One row per page: a plain note when identical, otherwise the diff image;
  # sites on the nsfw_urls list are collapsed behind a <details> element.
  html_same_row = """\
<h2>No difference on <a href="%s">%s</a>.</h2>
"""

  html_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<img src="%s" width="800" height="600"
onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""

  html_nsfw_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<details>
<summary>This site may be NSFW. Click to expand/collapse.</summary>
<img src="%s" width="800" height="600"
onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
</details>
"""

  html_end = textwrap.dedent("""\
      </body>
      </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      # Make image paths relative so the report directory is relocatable.
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)
| 480 | |
| 481 | |
def main(argv):
  """Entry point: parses arguments and dispatches the requested action.

  Returns a process exit code: 0 on success, 1 when the environment is not
  set up (no out dir, missing tools, no url sample), 2 on a usage error.
  """
  global num_sites, action, allow_js

  for arg in argv[1:]:
    try:
      num_sites = int(arg)
      continue
    except ValueError:
      pass
    if arg == '--allow-js':
      allow_js = True
    elif arg in ('download', 'before', 'after', 'compare'):
      action = arg
    else:
      # Unrecognized argument. Show usage.
      action = None
      break

  # --allow-js only makes sense when taking screenshots.
  if not action or (allow_js and action in ('download', 'compare')):
    PrintUsage(argv[0])
    return 2

  if not MakeOutDir() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0
| 512 | |
| 513 | |
# Script entry point; exit with main's code (0 ok, 1 setup error, 2 usage).
if __name__ == '__main__':
  sys.exit(main(sys.argv))
| OLD | NEW |