Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(489)

Side by Side Diff: get_features.py

Issue 1289123002: Merge branch 'master' into heuristics Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « foo/test.js ('k') | get_screenshots.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import argparse
7 import json
8 import os
9 import shutil
10 import sys
11 import time
12 import urllib
13
# selenium is a third-party dependency; when it is missing, print an
# actionable hint and exit instead of dumping a traceback.
try:
  from selenium import webdriver
except ImportError:
  # Only ImportError is handled here: a bare `except:` would also swallow
  # KeyboardInterrupt/SystemExit and mask unrelated failures.
  print('ERROR:')
  print('Couldn\'t import webdriver. Please run `sudo ./install-build-deps.sh`.')
  sys.exit(1)
20
# Absolute path of the directory containing this script.
self_dir = os.path.abspath(os.path.dirname(__file__))


def addBuildtoolsToPath():
  """Prepend `<self_dir>/buildtools` to the PATH environment variable.

  Nothing is changed when the current PATH already contains the substring
  'buildtools'. A missing PATH variable is treated as empty.
  """
  envPath = os.environ.get('PATH', '')
  if 'buildtools' in envPath:
    return
  buildtools = os.path.join(self_dir, 'buildtools')
  if envPath:
    os.environ['PATH'] = os.pathsep.join([buildtools, envPath])
  else:
    # Avoid a trailing separator: an empty PATH entry is commonly
    # interpreted as the current directory.
    os.environ['PATH'] = buildtools
27
def newDriver():
  """Create and configure a Chrome WebDriver instance.

  Chrome is launched with the `--enable-dom-distiller` flag. The window is
  resized to 1600x5000, the page-load timeout is set to 20 and the script
  timeout to 30.

  Returns:
    The configured `webdriver.Chrome` object.
  """
  options = webdriver.ChromeOptions()
  options.add_argument('--enable-dom-distiller')

  browser = webdriver.Chrome(chrome_options=options)
  browser.set_window_size(1600, 5000)
  browser.set_page_load_timeout(20)
  browser.set_script_timeout(30)
  return browser
36
def _prepareOutdir(outdir, restart, force):
  """Create or validate the output directory; return True if it is usable."""
  if restart:
    # A restart resumes into an existing directory and never creates one.
    if not os.path.exists(outdir):
      print(outdir + ' doesn\'t exist')
      return False
    return True
  if os.path.exists(outdir):
    if not force:
      print(outdir + ' exists')
      return False
    shutil.rmtree(outdir, ignore_errors=True)
  os.makedirs(outdir)
  return True


def _readUrls(options):
  """Return URLs from the command line or --urls-file; None if neither is set."""
  if options.urls:
    return options.urls
  if options.urls_file:
    with open(options.urls_file) as u:
      return u.read().splitlines()
  return None


def _loadPreviousResults(outdir):
  """Load the per-URL results found in outdir, sorted by their 'index'."""
  results = []
  for name in os.listdir(outdir):
    path = os.path.join(outdir, name)
    # Per-URL results are written as '<index>.features'. The aggregate
    # 'features' file has no extension, so it is not picked up here.
    if os.path.isfile(path) and os.path.splitext(path)[1] == '.features':
      with open(path) as infofile:
        results.append(json.load(infofile))
  results.sort(key=lambda entry: entry['index'])
  return results


def main(argv):
  """Extract features from each URL and write them as JSON into --out.

  For every URL the page is loaded in Chrome, `extract_features.js` (read
  from the current working directory) is executed in it, and the result is
  written to `<out>/<index>.features`. All results are finally written to
  `<out>/features`.

  Args:
    argv: Command-line arguments without the program name. Supported:
      --out DIR (required), --force (delete an existing DIR), --restart
      (resume in an existing DIR after the highest index already present),
      --urls-file FILE (one URL per line), and positional URLs.

  Returns:
    0 on completion; 1 if the output directory check fails or no URLs were
    supplied.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('urls', nargs='*')
  parser.add_argument('--force', action='store_true')
  parser.add_argument('--urls-file')
  parser.add_argument('--restart', action='store_true')
  options = parser.parse_args(argv)

  outdir = options.out
  if not _prepareOutdir(outdir, options.restart, options.force):
    return 1

  addBuildtoolsToPath()

  files = _readUrls(options)
  if files is None:
    print('oh no')
    return 1

  output = []
  startIndex = 0
  if options.restart:
    output = _loadPreviousResults(outdir)
    # Nothing saved yet means starting from the beginning; max() of an
    # empty sequence would raise ValueError.
    if output:
      startIndex = max(entry['index'] for entry in output) + 1
    print('starting at  %s' % startIndex)

  # Read the script before launching the browser so that a missing file
  # does not leave a Chrome instance running.
  with open('extract_features.js') as script:
    feature_extractor = script.read()

  driver = newDriver()
  try:
    for i, f in enumerate(files):
      if i < startIndex:
        continue
      prefix = '%s/%d' % (outdir, i)
      try:
        driver.get(f)
        # Presumably gives the page a moment to settle before extraction.
        time.sleep(0.5)
        features = driver.execute_script(feature_extractor)
        data = {
          'index': i,
          'url': f,
          'features': features
        }
        output.append(data)
        with open('%s.features' % prefix, 'w') as info:
          json.dump(data, info)
      except Exception as e:
        # Best effort: report the failure, replace the browser, and carry on
        # with the next URL.
        print(e)
        driver.quit()
        driver = newDriver()
  finally:
    driver.quit()

  with open('%s/features' % outdir, 'w') as index:
    json.dump(output, index)
  return 0
117
if __name__ == '__main__':
  # Pass everything after the script name to main() and use its return
  # value as the process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
120
OLD | NEW
« no previous file with comments | « foo/test.js ('k') | get_screenshots.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698