OLD | NEW |
---|---|
(Empty) | |
1 #!/usr/bin/env python | |
2 # Copyright (c) 2014 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 # Tool for seeing the real world impact of a patch. | |
7 # | |
8 # Layout Tests can tell you whether something has changed, but this can help | |
9 # you determine whether a subtle/controversial change is beneficial or not. | |
10 # | |
11 # It dumps the rendering of a large number of sites, both with and without a | |
12 # patch being evaluated, then sorts them by greatest difference in rendering, | |
13 # such that a human reviewer can quickly review the most impacted sites, | |
14 # rather than having to manually try sites to see if anything changes. | |
15 # | |
16 # In future it might be possible to extend this to other kinds of differences, | |
17 # e.g. page load times. | |
18 | |
19 from contextlib import closing | |
20 import datetime | |
21 import errno | |
22 from distutils.spawn import find_executable | |
23 from operator import itemgetter | |
24 import multiprocessing | |
25 import os | |
26 import re | |
27 from cStringIO import StringIO | |
28 import subprocess | |
29 import sys | |
30 import textwrap | |
31 import time | |
32 from urllib2 import urlopen | |
33 from urlparse import urlparse | |
34 import webbrowser | |
35 from zipfile import ZipFile | |
36 | |
37 from nsfw_urls import nsfw_urls | |
38 | |
39 action = None | |
40 allow_js = False | |
41 chromium_src_root = "" | |
42 chromium_out_dir = "" | |
43 output_dir = "" | |
44 num_sites = 1000 | |
45 urls = [] | |
46 print_lock = multiprocessing.Lock() | |
47 | |
48 | |
49 def PrintUsage(argv0): | |
50 this_script = os.path.basename(argv0) | |
51 print textwrap.dedent("""\ | |
52 Real World Impact usage: | |
53 1. Build content_shell in out/Release without controversial patch. | |
54 2. Run: %s before [num sites to test (default %d)] | |
55 3. Apply the controversial patch, and rebuild content_shell in out/Release. | |
56 4. Run: %s after [num sites to test (default %d)] | |
57 5. Run: %s compare [num sites to test (default %d)] | |
58 Output is stored in: %s | |
59 The compare step will open results in your web browser.""" | |
60 % (this_script, num_sites, this_script, num_sites, this_script, num_sites, | |
61 output_dir)) | |
62 | |
63 | |
64 def MakeDirsIfNotExist(dir): | |
65 try: | |
66 os.makedirs(dir) | |
67 except OSError as e: | |
68 if e.errno != errno.EEXIST: | |
69 raise | |
70 | |
71 def MakeOutDir(): | |
72 global chromium_src_root, chromium_out_dir, output_dir | |
73 chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__), | |
74 os.pardir, | |
75 os.pardir)) | |
76 # Find out directory (might be out_linux for users of cr). | |
77 for out_suffix in ["_linux", ""]: | |
78 out_dir = os.path.join(chromium_src_root, "out" + out_suffix) | |
79 if os.path.exists(out_dir): | |
80 chromium_out_dir = out_dir | |
81 break | |
82 if not chromium_out_dir: | |
83 return False | |
84 | |
85 this_script_name = "real_world_impact" | |
86 output_dir = os.path.join(chromium_out_dir, | |
87 "Release", | |
88 this_script_name) | |
89 MakeDirsIfNotExist(output_dir) | |
90 return True | |
91 | |
92 | |
93 def CheckPrerequisites(): | |
94 if not find_executable("wget"): | |
95 print "Please install wget and re-run this." | |
96 return False | |
97 image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") | |
98 if not os.path.exists(image_diff): | |
99 print "Please build the image_diff target and re-run this." | |
100 return False | |
101 return True | |
102 | |
103 | |
104 def PickSampleUrls(): | |
105 global urls | |
106 data_dir = os.path.join(output_dir, "data") | |
107 MakeDirsIfNotExist(data_dir) | |
108 | |
109 # Download Alexa top 1,000,000 sites | |
110 # TODO(johnme): Should probably update this when it gets too stale... | |
111 csv_path = os.path.join(data_dir, "top-1m.csv") | |
112 if not os.path.exists(csv_path): | |
113 print "Downloading list of top 1,000,000 sites from Alexa..." | |
114 csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip" | |
115 with closing(urlopen(csv_url)) as stream: | |
116 ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir) | |
117 | |
118 bad_urls_path = os.path.join(data_dir, "bad_urls.txt") | |
119 if os.path.exists(bad_urls_path): | |
120 with open(bad_urls_path) as f: | |
121 bad_urls = set(f.read().splitlines()) | |
122 else: | |
123 bad_urls = set() | |
124 | |
125 # See if we've already selected a sample of size num_sites (this way, if you | |
126 # call this script with arguments "before N" then "after N", where N is the | |
127 # same number, we'll use the same sample, as expected!). | |
128 urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites) | |
129 if not os.path.exists(urls_path): | |
130 if action == 'compare': | |
131 print ("Error: you must run 'before %d' and 'after %d' before " | |
132 "running 'compare %d'") % (num_sites, num_sites, num_sites) | |
133 return False | |
134 print "Picking %d sample urls..." % num_sites | |
135 | |
136 # TODO(johnme): For now this just gets the top num_sites entries. In future | |
137 # this should pick a weighted random sample (a sketch follows this | |
138 # function). For example, it could fit a power-law distribution, which is a | |
139 # good model of website popularity (http://www.useit.com/alertbox/9704b.html). | |
140 urls = [] | |
141 remaining_num_sites = num_sites | |
142 with open(csv_path) as f: | |
143 for entry in f: | |
144 if remaining_num_sites <= 0: | |
145 break | |
146 remaining_num_sites -= 1 | |
147 hostname = entry.strip().split(',')[1] | |
148 if '/' not in hostname: # Skip Alexa 1,000,000 entries that have paths. | |
149 url = "http://%s/" % hostname | |
150 if url not in bad_urls: | |
151 urls.append(url) | |
152 # Don't write these to disk yet; we'll do that in SaveWorkingUrls below | |
153 # once we have tried to download them and seen which ones fail. | |
154 else: | |
155 with open(urls_path) as f: | |
156 urls = [u for u in f.read().splitlines() if u not in bad_urls] | |
157 return True | |
158 | |
159 | |
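A minimal sketch of the weighted sampling the TODO in PickSampleUrls describes,
assuming the rank-ordered hostnames are already loaded. The helper name
PickWeightedSample, the 0.8 exponent, and the Efraimidis-Spirakis keying are
illustrative choices, not part of this patch:

    import random

    def PickWeightedSample(hostnames, sample_size, exponent=0.8):
      # Weight the entry at 1-based rank r by 1/r^exponent, a rough power-law
      # model of site popularity (see the useit.com link above), then sample
      # without replacement by keeping the sample_size largest keys
      # u ** (1/weight) for uniform u in [0, 1).
      keyed = []
      for rank, hostname in enumerate(hostnames, 1):
        weight = 1.0 / (rank ** exponent)
        keyed.append((random.random() ** (1.0 / weight), hostname))
      keyed.sort(reverse=True)
      return [hostname for _, hostname in keyed[:sample_size]]

    # E.g.: urls = ["http://%s/" % h
    #               for h in PickWeightedSample(hostnames, num_sites)]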
160 def SaveWorkingUrls(): | |
161 # TODO(johnme): Update the list if a url that used to work goes offline. | |
162 urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites) | |
163 if not os.path.exists(urls_path): | |
164 with open(urls_path, 'w') as f: | |
165 f.writelines(u + '\n' for u in urls) | |
166 | |
167 | |
168 def PrintElapsedTime(elapsed, detail=""): | |
169 elapsed = round(elapsed * 10) / 10.0 | |
170 m = elapsed / 60 | |
171 s = elapsed % 60 | |
172 print "Took %dm%.1fs" % (m, s), detail | |
173 | |
174 | |
175 def DownloadStaticCopyTask(url): | |
176 url_parts = urlparse(url) | |
177 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
178 # Use wget for now, as it does a reasonable job of spidering page | |
179 # dependencies (e.g. CSS, JS, images). | |
180 success = True | |
181 try: | |
182 subprocess.check_call(["wget", | |
183 "--execute", "robots=off", | |
184 ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS " | |
185 "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C" | |
186 "hrome/32.0.1700.14 Safari/537.36"), | |
187 "--page-requisites", | |
188 "--span-hosts", | |
189 "--adjust-extension", | |
190 "--convert-links", | |
191 "--directory-prefix=" + host_dir, | |
192 "--force-directories", | |
193 "--default-page=index.html", | |
194 "--no-check-certificate", | |
195 "--timeout 20", # 20s timeout | |
skobes
2014/03/22 06:32:54
I'd probably use a shorter timeout, like 5s.
| |
196 "--tries 2", # 20s timeout | |
197 "--quiet", | |
198 url]) | |
199 except KeyboardInterrupt: | |
200 success = False | |
201 except subprocess.CalledProcessError: | |
202 # Ignoring these for now, as some sites have issues with their subresources | |
203 # yet still produce a renderable index.html | |
204 pass #success = False | |
205 if success: | |
206 download_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
207 if not os.path.exists(download_path): | |
208 success = False | |
209 else: | |
210 with print_lock: | |
211 print "Downloaded:", url | |
212 if not success: | |
213 with print_lock: | |
214 print "Failed to download:", url | |
215 return False | |
216 return True | |
217 | |
218 | |
219 def DownloadStaticCopies(): | |
220 global urls | |
221 new_urls = [] | |
222 for url in urls: | |
223 url_parts = urlparse(url) | |
224 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
225 download_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
226 if not os.path.exists(download_path): | |
227 new_urls.append(url) | |
228 | |
229 if new_urls: | |
230 print "Downloading static copies of %d sites..." % len(new_urls) | |
231 start_time = time.time() | |
232 | |
233 results = multiprocessing.Pool(20).map(DownloadStaticCopyTask, new_urls) | |
234 failed_urls = [new_urls[i] for i,ret in enumerate(results) if not ret] | |
235 if failed_urls: | |
236 bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt") | |
237 with open(bad_urls_path, 'a') as f: | |
238 f.writelines(u + '\n' for u in failed_urls) | |
239 failed_urls_set = set(failed_urls) | |
240 urls = [u for u in urls if u not in failed_urls_set] | |
241 | |
242 PrintElapsedTime(time.time() - start_time) | |
243 | |
244 SaveWorkingUrls() | |
245 | |
246 | |
247 def RunDrtTask(url): | |
248 url_parts = urlparse(url) | |
249 host_dir = os.path.join(output_dir, "data", url_parts.hostname) | |
250 html_path = os.path.join(host_dir, url_parts.hostname, "index.html") | |
251 | |
252 if not allow_js: | |
253 nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html") | |
254 if not os.path.exists(nojs_path): | |
255 with open(html_path) as f: | |
256 html = f.read() | |
257 if not html: | |
258 return False | |
259 # These aren't intended to be XSS safe :) (a worked example follows below) | |
260 block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)' | |
261 r'\b.*?<\s*\/\s*\1\s*>') | |
262 block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)' | |
263 html = re.sub(block_tags, '', html, flags=re.I|re.S) | |
264 html = re.sub(block_attrs, '', html, flags=re.I) | |
265 with open(nojs_path, 'w') as f: | |
266 f.write(html) | |
267 html_path = nojs_path | |
268 | |
269 if sys.platform == 'darwin': | |
270 content_shell = os.path.join(chromium_out_dir, "Release", | |
271 "Content Shell.app/Contents/MacOS/Content Shell") | |
272 elif sys.platform.startswith('linux'): | |
273 content_shell = os.path.join(chromium_out_dir, "Release", | |
274 "content_shell") | |
275 elif sys.platform.startswith('win'): | |
276 content_shell = os.path.join(chromium_out_dir, "Release", | |
277 "content_shell.exe") | |
278 start_time = time.time() | |
279 | |
280 with open(os.devnull, "w") as fnull: | |
281 p = subprocess.Popen([content_shell, | |
282 "--dump-render-tree", | |
283 # The single quote is not a typo, it's a separator! | |
284 html_path + "'--pixel-test" | |
285 ], | |
286 shell=False, | |
287 stdout=subprocess.PIPE, | |
288 stderr=fnull) | |
289 result = p.stdout.read() | |
290 PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A" | |
291 PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82" | |
292 try: | |
293 start = result.index(PNG_START) | |
294 end = result.rindex(PNG_END) + 8 | |
295 except ValueError: | |
296 return False | |
297 | |
298 png_path = os.path.join(output_dir, action, url_parts.hostname + ".png") | |
299 MakeDirsIfNotExist(os.path.dirname(png_path)) | |
300 with open(png_path, 'wb') as f: | |
301 f.write(result[start:end]) | |
302 elapsed_time = (time.time() - start_time, url) | |
303 return elapsed_time | |
304 | |
305 | |
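A stand-alone check of the script- and handler-stripping regexes in RunDrtTask
above; the sample HTML is invented for illustration:

    import re

    block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                  r'\b.*?<\s*\/\s*\1\s*>')
    block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'

    sample = '<body onload="init()"><script>alert(1)</script><p>hi</p></body>'
    stripped = re.sub(block_tags, '', sample, flags=re.I | re.S)
    stripped = re.sub(block_attrs, '', stripped, flags=re.I)
    print stripped  # <body><p>hi</p></body>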
306 def RunDrt(): | |
307 print "Taking screenshots of %d pages..." % len(urls) | |
308 start_time = time.time() | |
309 | |
310 results = multiprocessing.Pool().map(RunDrtTask, urls, 1) | |
311 | |
312 max_time, url = max(t for t in results if t) | |
313 elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url) | |
314 PrintElapsedTime(time.time() - start_time, elapsed_detail) | |
315 | |
316 | |
317 def CompareResultsTask(url): | |
318 url_parts = urlparse(url) | |
319 before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png") | |
320 after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png") | |
321 diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png") | |
322 MakeDirsIfNotExist(os.path.join(output_dir, "diff")) | |
323 | |
324 # TODO(johnme): Don't hardcode "real_world_impact". | |
325 red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA" | |
326 "ABAAEAAAICRAEAOw==") | |
327 | |
328 before_exists = os.path.exists(before_path) | |
329 after_exists = os.path.exists(after_path) | |
330 if not before_exists and not after_exists: | |
331 # TODO(johnme): Make this more informative. | |
332 return (-100, url, red_path) | |
333 if before_exists != after_exists: | |
334 # TODO(johnme): Make this more informative. | |
335 return (200, url, red_path) | |
336 | |
337 image_diff = os.path.join(chromium_out_dir, "Release", "image_diff") | |
338 | |
339 # Get percentage difference (a worked example follows this function). | |
340 p = subprocess.Popen([image_diff, "--histogram", | |
341 before_path, after_path], | |
342 shell=False, | |
343 stdout=subprocess.PIPE) | |
344 output,_ = p.communicate() | |
345 if p.returncode == 0: | |
346 return (0, url, before_path) | |
347 diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n' | |
348 r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output) | |
349 if not diff_match: | |
350 raise Exception("image_diff output format changed") | |
351 histogram_diff = float(diff_match.group(1)) | |
352 exact_diff = float(diff_match.group(2)) | |
353 combined_diff = max(histogram_diff + exact_diff / 8, 0.001) | |
354 | |
355 # Produce diff PNG. | |
356 subprocess.call([image_diff, "--diff", before_path, after_path, diff_path]) | |
357 return (combined_diff, url, diff_path) | |
358 | |
359 | |
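A worked instance of the parsing and ranking in CompareResultsTask above; the
image_diff output here is invented to match the format the regex expects:

    import re

    sample_output = ("histogram diff: 3.20% failed\n"
                     "exact diff: 12.80% failed")
    m = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                 r'exact diff: (\d+\.\d{2})% (?:passed|failed)', sample_output)
    histogram_diff = float(m.group(1))  # 3.20
    exact_diff = float(m.group(2))      # 12.80
    # Histogram diff dominates the ranking; exact diff is down-weighted by 8,
    # and the 0.001 floor keeps any changed page above the zero-diff bucket.
    combined_diff = max(histogram_diff + exact_diff / 8, 0.001)  # 4.8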
360 def CompareResults(): | |
361 print "Running image_diff on %d pages..." % len(urls) | |
362 start_time = time.time() | |
363 | |
364 results = multiprocessing.Pool().map(CompareResultsTask, urls) | |
365 results.sort(key=itemgetter(0), reverse=True) | |
366 | |
367 PrintElapsedTime(time.time() - start_time) | |
368 | |
369 now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M") | |
370 html_start = textwrap.dedent("""\ | |
371 <!DOCTYPE html> | |
372 <html> | |
373 <head> | |
374 <title>Real World Impact report %s</title> | |
375 <script> | |
376 var togglingImg = null; | |
377 var toggleTimer = null; | |
378 | |
379 var before = true; | |
380 function toggle() { | |
381 var newFolder = before ? "before" : "after"; | |
382 togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder); | |
383 before = !before; | |
384 toggleTimer = setTimeout(toggle, 300); | |
385 } | |
386 | |
387 function startToggle(img) { | |
388 before = true; | |
389 togglingImg = img; | |
390 if (!img.origSrc) | |
391 img.origSrc = img.src; | |
392 toggle(); | |
393 } | |
394 function stopToggle(img) { | |
395 clearTimeout(toggleTimer); | |
396 img.src = img.origSrc; | |
397 } | |
398 | |
399 document.onkeydown = function(e) { | |
400 e = e || window.event; | |
401 var keyCode = e.keyCode || e.which; | |
402 var newFolder; | |
403 switch (keyCode) { | |
404 case 49: //'1' | |
405 newFolder = "before"; break; | |
406 case 50: //'2' | |
407 newFolder = "after"; break; | |
408 case 51: //'3' | |
409 newFolder = "diff"; break; | |
410 default: | |
411 return; | |
412 } | |
413 var imgs = document.getElementsByTagName("img"); | |
414 for (var i = 0; i < imgs.length; i++) { | |
415 imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder); | |
416 } | |
417 }; | |
418 </script> | |
419 <style> | |
420 h1 { | |
421 font-family: sans-serif; | |
422 } | |
423 h2 { | |
424 font-family: monospace; | |
425 white-space: pre; | |
426 } | |
427 .nsfw-spacer { | |
428 height: 50vh; | |
429 } | |
430 .nsfw-warning { | |
431 background: yellow; | |
432 border: 10px solid red; | |
433 } | |
434 .info { | |
435 font-size: 1.2em; | |
436 font-style: italic; | |
437 } | |
438 body:not(.details-supported) details { | |
439 display: none; | |
440 } | |
441 </style> | |
442 </head> | |
443 <body> | |
444 <script> | |
445 if ('open' in document.createElement('details')) | |
446 document.body.className = "details-supported"; | |
447 </script> | |
448 <!--<div class="nsfw-spacer"></div>--> | |
449 <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d | |
450 and may be NSFW.</p> | |
451 <!--<div class="nsfw-spacer"></div>--> | |
452 <h1>Real World Impact report %s</h1> | |
453 <p class="info">Press 1, 2 and 3 to switch between before, after and diff | |
454 screenshots respectively; or hover over the images to rapidly alternate | |
455 between before and after.</p> | |
456 """ % (now, num_sites, now)) | |
457 | |
458 html_same_row = """\ | |
459 <h2>No difference on <a href="%s">%s</a>.</h2> | |
460 """ | |
461 | |
462 html_diff_row = """\ | |
463 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> | |
464 <img src="%s" width="800" height="600" | |
465 onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> | |
466 """ | |
467 | |
468 html_nsfw_diff_row = """\ | |
469 <h2>%7.3f%% difference on <a href="%s">%s</a>:</h2> | |
470 <details> | |
471 <summary>This site may be NSFW. Click to expand/collapse.</summary> | |
472 <img src="%s" width="800" height="600" | |
473 onmouseover="startToggle(this)" onmouseout="stopToggle(this)"> | |
474 </details> | |
475 """ | |
476 | |
477 html_end = textwrap.dedent("""\ | |
478 </body> | |
479 </html>""") | |
480 | |
481 html_path = os.path.join(output_dir, "diff.html") | |
482 with open(html_path, 'w') as f: | |
483 f.write(html_start) | |
484 for (diff_float, url, diff_path) in results: | |
485 diff_path = os.path.relpath(diff_path, output_dir) | |
486 if diff_float == 0: | |
487 f.write(html_same_row % (url, url)) | |
488 elif url in nsfw_urls: | |
489 f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path)) | |
490 else: | |
491 f.write(html_diff_row % (diff_float, url, url, diff_path)) | |
492 f.write(html_end) | |
493 | |
494 webbrowser.open_new_tab("file://" + html_path) | |
495 | |
496 | |
497 def main(argv): | |
498 global num_sites, action, allow_js | |
499 | |
500 for arg in argv[1:]: | |
501 try: | |
502 num_sites = int(arg) | |
503 except ValueError: | |
504 if arg == '--allow-js': | |
505 allow_js = True | |
506 elif arg in ['download', 'before', 'after', 'compare']: | |
skobes
2014/03/22 06:32:54
Usage text doesn't mention 'download'...
| |
507 action = arg | |
508 else: | |
509 # Unrecognized argument. Show usage. | |
510 action = None | |
511 break | |
512 | |
513 if not action or (action in ['download', 'compare'] and allow_js): | |
514 PrintUsage(argv[0]) | |
515 return 2 | |
516 | |
517 if not MakeOutDir() or not CheckPrerequisites() or not PickSampleUrls(): | |
518 return 1 | |
519 | |
520 if action == 'compare': | |
521 CompareResults() | |
522 else: | |
523 DownloadStaticCopies() | |
524 if action != 'download': | |
525 RunDrt() | |
526 return 0 | |
527 | |
528 | |
529 if __name__ == '__main__': | |
530 sys.exit(main(sys.argv)) | |