#!/usr/bin/env python
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

# Tool for seeing the real world impact of a patch.
#
# Layout Tests can tell you whether something has changed, but this can help
# you determine whether a subtle/controversial change is beneficial or not.
#
# It dumps the rendering of a large number of sites, both with and without a
# patch being evaluated, then sorts them by greatest difference in rendering,
# such that a human reviewer can quickly review the most impacted sites,
# rather than having to manually try sites to see if anything changes.
#
# In future it might be possible to extend this to other kinds of differences,
# e.g. page load times.

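# Example workflow (assuming content_shell and image_diff have already been
# built in out/Release; see PrintUsage below for the full option list):
#   ./real_world_impact.py before 500
#   (apply the patch under test, rebuild content_shell)
#   ./real_world_impact.py after 500
#   ./real_world_impact.py compare 500
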
from contextlib import closing
from cStringIO import StringIO
import datetime
from distutils.spawn import find_executable
import errno
import multiprocessing
from operator import itemgetter
import os
import re
import subprocess
import sys
import textwrap
import time
from urllib2 import urlopen
from urlparse import urlparse
import webbrowser
from zipfile import ZipFile

from nsfw_urls import nsfw_urls

action = None
allow_js = False
chromium_src_root = ""
chromium_out_dir = ""
output_dir = ""
num_sites = 1000
urls = []
print_lock = multiprocessing.Lock()


def PrintUsage(argv0):
  this_script = os.path.basename(argv0)
  print textwrap.dedent("""\
    USAGE: %s <download|before|after|compare> [num sites (default %d)]
           [--allow-js]

    ACTIONS
      download  Only fetch static copies of the sample sites.
      before    Fetch (if needed) and screenshot the sites, pre-patch.
      after     Screenshot the sites again, post-patch.
      compare   Diff the before/after screenshots and open a report.

    OPTIONS
      --allow-js  Don't strip JavaScript from the downloaded pages (only
                  valid with the before and after actions).

    1. Build content_shell in out/Release, without the controversial patch.
    2. Run: %s before [num sites to test (default %d)]
    3. Apply the controversial patch, and rebuild content_shell in out/Release.
    4. Run: %s after [num sites to test (default %d)]
    5. Run: %s compare [num sites to test (default %d)]
    Output is stored in: %s
    The compare step will open results in your web browser."""
    % (this_script, num_sites, this_script, num_sites, this_script, num_sites,
       this_script, num_sites, output_dir))


def MakeDirsIfNotExist(dir):
  try:
    os.makedirs(dir)
  except OSError as e:
    if e.errno != errno.EEXIST:
      raise


def MakeOutDir():
  global chromium_src_root, chromium_out_dir, output_dir
  chromium_src_root = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                                   os.pardir,
                                                   os.pardir))
  # Find the out directory (it might be out_linux for users of cr).
  for out_suffix in ["_linux", ""]:
    out_dir = os.path.join(chromium_src_root, "out" + out_suffix)
    if os.path.exists(out_dir):
      chromium_out_dir = out_dir
      break
  if not chromium_out_dir:
    return False

  this_script_name = "real_world_impact"
  output_dir = os.path.join(chromium_out_dir,
                            "Release",
                            this_script_name)
  MakeDirsIfNotExist(output_dir)
  return True
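
# The output layout under <out dir>/Release/real_world_impact/ is roughly:
#   data/      downloaded site copies, top-1m.csv, url samples, bad_urls.txt
#   before/    screenshots taken without the patch
#   after/     screenshots taken with the patch
#   diff/      image_diff output images
#   diff.html  the report opened by the compare step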


def CheckPrerequisites():
  if not find_executable("wget"):
    print "Please install wget and re-run this."
    return False
  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")
  if not os.path.exists(image_diff):
    print "Please build the image_diff target and re-run this."
    return False
  return True
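
# Note: image_diff lives in the Chromium tree; with a ninja build, something
# like `ninja -C out/Release image_diff` should produce the binary used here.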


def PickSampleUrls():
  global urls
  data_dir = os.path.join(output_dir, "data")
  MakeDirsIfNotExist(data_dir)

  # Download the Alexa top 1,000,000 sites.
  # TODO(johnme): Should probably update this when it gets too stale...
  csv_path = os.path.join(data_dir, "top-1m.csv")
  if not os.path.exists(csv_path):
    print "Downloading list of top 1,000,000 sites from Alexa..."
    csv_url = "http://s3.amazonaws.com/alexa-static/top-1m.csv.zip"
    with closing(urlopen(csv_url)) as stream:
      ZipFile(StringIO(stream.read())).extract("top-1m.csv", data_dir)

  bad_urls_path = os.path.join(data_dir, "bad_urls.txt")
  if os.path.exists(bad_urls_path):
    with open(bad_urls_path) as f:
      bad_urls = set(f.read().splitlines())
  else:
    bad_urls = set()  # e.g. set(["jrj.com.cn"])

  # See if we've already selected a sample of size num_sites (this way, if you
  # call this script with arguments "before N" then "after N", where N is the
  # same number, we'll use the same sample, as expected!).
  urls_path = os.path.join(data_dir, "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    if action == 'compare':
      print ("Error: you must run 'before %d' and 'after %d' before "
             "running 'compare %d'") % (num_sites, num_sites, num_sites)
      return False
    print "Picking %d sample urls..." % num_sites

    # TODO(johnme): For now this just gets the top num_sites entries. In future
    # this should pick a weighted random sample. For example, it could fit a
    # power-law distribution, which is a good model of website popularity
    # (http://www.useit.com/alertbox/9704b.html).
    urls = []
    remaining_num_sites = num_sites
    with open(csv_path) as f:
      for entry in f:
        if remaining_num_sites <= 0:
          break
        remaining_num_sites -= 1
        hostname = entry.strip().split(',')[1]
        if not '/' in hostname:  # Skip Alexa 1,000,000 entries that have paths.
          url = "http://%s/" % hostname
          if not url in bad_urls:
            urls.append(url)
    # Don't write these to disk yet; we'll do that in SaveWorkingUrls below
    # once we have tried to download them and seen which ones fail.
  else:
    with open(urls_path) as f:
      urls = [u for u in f.read().splitlines() if not u in bad_urls]
  return True


def SaveWorkingUrls():
  # TODO(johnme): Update the list if a url that used to work goes offline.
  urls_path = os.path.join(output_dir, "data", "%06d_urls.txt" % num_sites)
  if not os.path.exists(urls_path):
    with open(urls_path, 'w') as f:
      f.writelines(u + '\n' for u in urls)


def PrintElapsedTime(elapsed, detail=""):
  elapsed = round(elapsed * 10) / 10.0
  m = elapsed / 60
  s = elapsed % 60
  print "Took %dm%.1fs" % (m, s), detail


def DownloadStaticCopyTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  # Use wget for now, as it does a reasonable job of spidering page
  # dependencies (e.g. CSS, JS, images).
  success = True
  try:
    subprocess.check_call(["wget",
                           "--execute", "robots=off",
                           ("--user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS "
                            "X 10_8_5) AppleWebKit/537.36 (KHTML, like Gecko) C"
                            "hrome/32.0.1700.14 Safari/537.36"),
                           "--page-requisites",
                           "--span-hosts",
                           "--adjust-extension",
                           "--convert-links",
                           "--directory-prefix=" + host_dir,
                           "--force-directories",
                           "--default-page=index.html",
                           "--quiet",
                           url])
  except subprocess.CalledProcessError:
    # Ignoring these for now, as some sites have issues with their
    # subresources yet still produce a renderable index.html.
    pass  # success = False
  if success:
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      success = False
  if not success:
    with print_lock:
      print "Failed to download:", url
    return False
  return True
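
# Note: with --force-directories, wget recreates each site's directory
# structure under --directory-prefix, so a page lands at
# <host_dir>/<hostname>/index.html; that is the path checked above and later
# rendered by RunDrtTask.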


def DownloadStaticCopies():
  global urls
  new_urls = []
  for url in urls:
    url_parts = urlparse(url)
    host_dir = os.path.join(output_dir, "data", url_parts.hostname)
    download_path = os.path.join(host_dir, url_parts.hostname, "index.html")
    if not os.path.exists(download_path):
      new_urls.append(url)

  if new_urls:
    print "Downloading static copies of %d sites..." % len(new_urls)
    start_time = time.time()

    results = multiprocessing.Pool(8).map(DownloadStaticCopyTask, new_urls)
    failed_urls = [new_urls[i] for i, ret in enumerate(results) if not ret]
    if failed_urls:
      bad_urls_path = os.path.join(output_dir, "data", "bad_urls.txt")
      with open(bad_urls_path, 'a') as f:
        f.writelines(u + '\n' for u in failed_urls)
      failed_urls_set = set(failed_urls)
      urls = [u for u in urls if u not in failed_urls_set]

    PrintElapsedTime(time.time() - start_time)

  SaveWorkingUrls()
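
# Urls that fail to download are appended to data/bad_urls.txt, so subsequent
# runs (via PickSampleUrls above) skip them automatically.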


def RunDrtTask(url):
  url_parts = urlparse(url)
  host_dir = os.path.join(output_dir, "data", url_parts.hostname)
  html_path = os.path.join(host_dir, url_parts.hostname, "index.html")

  if not allow_js:
    nojs_path = os.path.join(host_dir, url_parts.hostname, "index-nojs.html")
    if not os.path.exists(nojs_path):
      with open(html_path) as f:
        html = f.read()
      if not html:
        return False
      # These aren't intended to be XSS safe :)
      block_tags = (r'<\s*(script|object|video|audio|iframe|frameset|frame)'
                    r'\b.*?<\s*\/\s*\1\s*>')
      block_attrs = r'\s(onload|onerror)\s*=\s*(\'[^\']*\'|"[^"]*"|\S*)'
      html = re.sub(block_tags, '', html, flags=re.I|re.S)
      html = re.sub(block_attrs, '', html, flags=re.I)
      with open(nojs_path, 'w') as f:
        f.write(html)
    html_path = nojs_path

  content_shell = os.path.join(chromium_out_dir, "Release", "content_shell")
  start_time = time.time()

  with open(os.devnull, "w") as fnull:
    p = subprocess.Popen([content_shell,
                          "--dump-render-tree",
                          # The single quote is not a typo, it's a separator!
                          html_path + "'--pixel-test"
                          ],
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=fnull)

  result = p.stdout.read()
  # content_shell writes a text dump followed by the raw PNG to stdout; scan
  # for the 8-byte PNG signature and the IEND chunk trailer to extract it.
  PNG_START = b"\x89\x50\x4E\x47\x0D\x0A\x1A\x0A"
  PNG_END = b"\x49\x45\x4E\x44\xAE\x42\x60\x82"
  try:
    start = result.index(PNG_START)
    end = result.rindex(PNG_END) + 8
  except ValueError:
    return False

  png_path = os.path.join(output_dir, action, url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.dirname(png_path))
  with open(png_path, 'wb') as f:
    f.write(result[start:end])
  elapsed_time = (time.time() - start_time, url)
  return elapsed_time


def RunDrt():
  print "Taking screenshots of %d pages..." % len(urls)
  start_time = time.time()

  # chunksize=1, as individual sites can take very different times to render.
  results = multiprocessing.Pool().map(RunDrtTask, urls, 1)

  max_time, url = max(t for t in results if t)
  elapsed_detail = "(slowest: %.2fs on %s)" % (max_time, url)
  PrintElapsedTime(time.time() - start_time, elapsed_detail)


def CompareResultsTask(url):
  url_parts = urlparse(url)
  before_path = os.path.join(output_dir, "before", url_parts.hostname + ".png")
  after_path = os.path.join(output_dir, "after", url_parts.hostname + ".png")
  diff_path = os.path.join(output_dir, "diff", url_parts.hostname + ".png")
  MakeDirsIfNotExist(os.path.join(output_dir, "diff"))

  # TODO(johnme): Don't hardcode "real_world_impact".
  # A 1x1 red GIF, used as a placeholder when a screenshot is missing.
  red_path = ("data:image/gif;base64,R0lGODlhAQABAPAAAP8AAP///yH5BAAAAAAALAAAAA"
              "ABAAEAAAICRAEAOw==")

  before_exists = os.path.exists(before_path)
  after_exists = os.path.exists(after_path)
  if not before_exists and not after_exists:
    # TODO(johnme): Make this more informative.
    return (-100, url, red_path)
  if before_exists != after_exists:
    # TODO(johnme): Make this more informative.
    return (200, url, red_path)

  image_diff = os.path.join(chromium_out_dir, "Release", "image_diff")

  # Get percentage difference.
  p = subprocess.Popen([image_diff, "--histogram",
                        before_path, after_path],
                       shell=False,
                       stdout=subprocess.PIPE)
  output, _ = p.communicate()
  if p.returncode == 0:
    return (0, url, before_path)
  diff_match = re.match(r'histogram diff: (\d+\.\d{2})% (?:passed|failed)\n'
                        r'exact diff: (\d+\.\d{2})% (?:passed|failed)', output)
  if not diff_match:
    raise Exception("image_diff output format changed")
  histogram_diff = float(diff_match.group(1))
  exact_diff = float(diff_match.group(2))
  # Weight the position-insensitive histogram diff most heavily, and clamp to
  # a small positive value so any real difference sorts above "no difference".
  combined_diff = max(histogram_diff + exact_diff / 8, 0.001)

  # Produce diff PNG.
  subprocess.call([image_diff, "--diff", before_path, after_path, diff_path])
  return (combined_diff, url, diff_path)
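
# The tuples above are (score, url, image path). CompareResults sorts scores
# in descending order, so 200 (a screenshot missing on one side) floats to the
# top, genuine diffs follow, and -100 (missing on both sides) sinks to the
# bottom.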


def CompareResults():
  print "Running image_diff on %d pages..." % len(urls)
  start_time = time.time()

  results = multiprocessing.Pool().map(CompareResultsTask, urls)
  results.sort(key=itemgetter(0), reverse=True)

  PrintElapsedTime(time.time() - start_time)

  now = datetime.datetime.today().strftime("%a %Y-%m-%d %H:%M")
  html_start = textwrap.dedent("""\
    <!DOCTYPE html>
    <html>
    <head>
    <title>Real World Impact report %s</title>
    <script>
    var togglingImg = null;
    var toggleTimer = null;

    var before = true;
    function toggle() {
      var newFolder = before ? "before" : "after";
      togglingImg.src = togglingImg.src.replace(/before|after|diff/, newFolder);
      before = !before;
      toggleTimer = setTimeout(toggle, 300);
    }

    function startToggle(img) {
      before = true;
      togglingImg = img;
      if (!img.origSrc)
        img.origSrc = img.src;
      toggle();
    }
    function stopToggle(img) {
      clearTimeout(toggleTimer);
      img.src = img.origSrc;
    }

    document.onkeydown = function(e) {
      e = e || window.event;
      var keyCode = e.keyCode || e.which;
      var newFolder;
      switch (keyCode) {
        case 49: //'1'
          newFolder = "before"; break;
        case 50: //'2'
          newFolder = "after"; break;
        case 51: //'3'
          newFolder = "diff"; break;
        default:
          return;
      }
      var imgs = document.getElementsByTagName("img");
      for (var i = 0; i < imgs.length; i++) {
        imgs[i].src = imgs[i].src.replace(/before|after|diff/, newFolder);
      }
    };
    </script>
    <style>
    h1 {
      font-family: sans-serif;
    }
    h2 {
      font-family: monospace;
      white-space: pre;
    }
    .nsfw-spacer {
      height: 50vh;
    }
    .nsfw-warning {
      background: yellow;
      border: 10px solid red;
    }
    .info {
      font-size: 1.2em;
      font-style: italic;
    }
    body:not(.details-supported) details {
      display: none;
    }
    </style>
    </head>
    <body>
    <script>
    if ('open' in document.createElement('details'))
      document.body.className = "details-supported";
    </script>
    <!--<div class="nsfw-spacer"></div>-->
    <p class="nsfw-warning">Warning: sites below are taken from the Alexa top %d
    and may be NSFW.</p>
    <!--<div class="nsfw-spacer"></div>-->
    <h1>Real World Impact report %s</h1>
    <p class="info">Press 1, 2 and 3 to switch between before, after and diff
    screenshots respectively; or hover over the images to rapidly alternate
    between before and after.</p>
    """ % (now, num_sites, now))

  html_same_row = """\
<h2>No difference on <a href="%s">%s</a>.</h2>
"""

  html_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<img src="%s" width="800" height="600"
 onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
"""

  html_nsfw_diff_row = """\
<h2>%7.3f%% difference on <a href="%s">%s</a>:</h2>
<details>
  <summary>This site may be NSFW. Click to expand/collapse.</summary>
  <img src="%s" width="800" height="600"
   onmouseover="startToggle(this)" onmouseout="stopToggle(this)">
</details>
"""

  html_end = textwrap.dedent("""\
    </body>
    </html>""")

  html_path = os.path.join(output_dir, "diff.html")
  with open(html_path, 'w') as f:
    f.write(html_start)
    for (diff_float, url, diff_path) in results:
      diff_path = os.path.relpath(diff_path, output_dir)
      if diff_float == 0:
        f.write(html_same_row % (url, url))
      elif url in nsfw_urls:
        f.write(html_nsfw_diff_row % (diff_float, url, url, diff_path))
      else:
        f.write(html_diff_row % (diff_float, url, url, diff_path))
    f.write(html_end)

  webbrowser.open_new_tab("file://" + html_path)


def main(argv):
  global num_sites, action, allow_js

  for arg in argv[1:]:
    try:
      num_sites = int(arg)  # A bare integer argument sets the sample size.
    except ValueError:
      if arg == '--allow-js':
        allow_js = True
      elif arg in ['download', 'before', 'after', 'compare']:
        action = arg
      else:
        # Unrecognized argument. Show usage.
        action = None
        break

  # --allow-js only makes sense for actions that render pages.
  if not action or (action in ['download', 'compare'] and allow_js):
    PrintUsage(argv[0])
    return 2

  if not MakeOutDir() or not CheckPrerequisites() or not PickSampleUrls():
    return 1

  if action == 'compare':
    CompareResults()
  else:
    DownloadStaticCopies()
    if action != 'download':
      RunDrt()
  return 0


if __name__ == '__main__':
  sys.exit(main(sys.argv))