OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 import argparse | |
7 import json | |
8 import os | |
9 import shutil | |
10 import sys | |
11 import time | |
12 import urllib | |
13 | |
try:
  from selenium import webdriver
except ImportError:
  # Catch only a failed import: a bare `except:` would also intercept
  # KeyboardInterrupt/SystemExit and report them as a missing dependency.
  print('ERROR:')
  print('Couldn\'t import webdriver. '
        'Please run `sudo ./install-build-deps.sh`.')
  sys.exit(1)
20 | |
# Absolute path of the directory containing this script.
self_dir = os.path.abspath(os.path.dirname(__file__))

def addBuildtoolsToPath():
  """Prepend <self_dir>/buildtools to PATH unless PATH mentions buildtools.

  PATH is left untouched when the substring 'buildtools' already appears
  anywhere in it. A missing PATH variable is treated as empty instead of
  raising KeyError.
  """
  envPath = os.environ.get('PATH', '')
  if 'buildtools' in envPath:
    return
  buildtools = os.path.join(self_dir, 'buildtools')
  if envPath:
    os.environ['PATH'] = buildtools + os.pathsep + envPath
  else:
    # No separator when PATH is empty: a trailing separator would add an
    # empty PATH entry.
    os.environ['PATH'] = buildtools
27 | |
def newDriver():
  """Launch and return a Chrome WebDriver set up for feature extraction.

  Chrome is started with --enable-dom-distiller. The returned driver has a
  1600x5000 window, a page-load timeout of 20 and a script timeout of 30
  (seconds, per the Selenium API).
  """
  options = webdriver.ChromeOptions()
  options.add_argument('--enable-dom-distiller')

  browser = webdriver.Chrome(chrome_options=options)
  browser.set_window_size(1600, 5000)
  browser.set_page_load_timeout(20)
  browser.set_script_timeout(30)
  return browser
36 | |
def _loadPreviousOutput(outdir):
  """Load the per-URL results an earlier run left in outdir.

  Returns:
    A tuple (output, startIndex): the previously saved result dicts ordered
    by index, and the index to resume at (one past the highest saved index,
    or 0 when nothing was saved).
  """
  output = []
  for name in os.listdir(outdir):
    path = os.path.join(outdir, name)
    # Per-URL results are written as '<index>.features'. The aggregate
    # 'features' file has no extension, so this check skips it.
    if not os.path.isfile(path) or os.path.splitext(name)[1] != '.features':
      continue
    with open(path) as infofile:
      output.append(json.load(infofile))
  if not output:
    return output, 0
  output.sort(key=lambda entry: entry['index'])
  return output, output[-1]['index'] + 1

def main(argv):
  """Run the feature-extraction script against each URL and save the results.

  Each URL is loaded in Chrome, extract_features.js (read from the current
  working directory) is executed in the page, and the result is written to
  '<out>/<index>.features'. All results are also written to '<out>/features'.

  Args:
    argv: command-line arguments without the program name. Supports
      --out DIR (required), positional URLs or --urls-file FILE,
      --force (replace an existing DIR) and --restart (resume in DIR).

  Returns:
    0 on success; 1 if the output directory state does not match the flags
    or no URLs were supplied.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('urls', nargs='*')
  parser.add_argument('--force', action='store_true')
  parser.add_argument('--urls-file')
  parser.add_argument('--restart', action='store_true')
  options = parser.parse_args(argv)

  outdir = options.out
  if not options.restart:
    if os.path.exists(outdir):
      if not options.force:
        print(outdir + ' exists')
        return 1
      shutil.rmtree(outdir, ignore_errors=True)
    os.makedirs(outdir)
  else:
    if not os.path.exists(outdir):
      print(outdir + ' doesn\'t exist')
      return 1

  if options.urls:
    files = options.urls
  elif options.urls_file:
    with open(options.urls_file) as u:
      files = u.read().splitlines()
  else:
    print('oh no')
    return 1

  output = []
  startIndex = 0
  if options.restart:
    output, startIndex = _loadPreviousOutput(outdir)
    print('starting at  %s' % startIndex)

  with open('extract_features.js') as script:
    feature_extractor = script.read()

  # Everything that can fail without a browser has run by now, so an error
  # above cannot leave a Chrome process behind.
  addBuildtoolsToPath()
  driver = newDriver()
  try:
    for i, f in enumerate(files):
      if i < startIndex:
        continue
      prefix = '%s/%d' % (outdir, i)
      try:
        driver.get(f)
        # Brief pause before running the script (presumably to let the page
        # settle after load).
        time.sleep(0.5)
        features = driver.execute_script(feature_extractor)
        data = {
          'index': i,
          'url': f,
          'features': features
        }
        with open('%s.features' % prefix, 'w') as info:
          json.dump(data, info)
        # Record the result only once it is on disk, so the aggregate file
        # and the per-URL files agree.
        output.append(data)
      except Exception as e:
        # Best effort: report the failure, replace the browser, and carry on
        # with the next URL.
        print(e)
        driver.quit()
        driver = newDriver()
  finally:
    driver.quit()

  with open('%s/features' % outdir, 'w') as index:
    json.dump(output, index)
  return 0
117 | |
if __name__ == '__main__':
  # Run main() on the command-line arguments (program name stripped) and use
  # its return value as the process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
120 | |
OLD | NEW |