Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(150)

Side by Side Diff: heuristics/distillable/get_screenshots.py

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/index.html » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import argparse
7 import json
8 import os
9 import shutil
10 import sys
11 import time
12 import urllib
13 import random
14 from lockfile import FileLock
15
# Absolute path two directory levels above the directory holding this script.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))

# selenium is a third-party package. Exit with an actionable message when it
# is missing. Only ImportError is handled, so Ctrl-C and unrelated failures
# raised during the import still surface with their real traceback.
try:
    from selenium import webdriver
except ImportError:
    print('ERROR:')
    print('Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`. ' % repo_root)
    sys.exit(1)
24
def addBuildtoolsToPath():
    """Prepend ``<repo_root>/buildtools`` to PATH unless it is already listed.

    PATH is compared entry by entry, so a directory that merely contains the
    buildtools path as a substring (for example ``.../buildtools-old``) does
    not count as already present. A missing or empty PATH is treated as
    having no entries.
    """
    buildtoolsPath = repo_root + '/buildtools'
    envPath = os.environ.get('PATH', '')
    entries = envPath.split(os.pathsep) if envPath else []
    if buildtoolsPath not in entries:
        os.environ['PATH'] = os.pathsep.join([buildtoolsPath] + entries)
30
def getDistillerUrl(u):
    """Return the chrome-distiller:// viewer URL for the page at ``u``.

    Args:
      u: URL of the page to distill; it is percent-encoded into the ``url``
        query parameter.

    Returns:
      A string of the form ``chrome-distiller://blah/?url=<encoded u>``.
    """
    # urlencode lives in ``urllib`` on Python 2 and ``urllib.parse`` on
    # Python 3; importing locally keeps this helper usable on both.
    try:
        from urllib import urlencode
    except ImportError:
        from urllib.parse import urlencode
    return "chrome-distiller://blah/?" + urlencode({'url': u})
34
def newDriver():
    """Launch Chrome through selenium and return the configured driver.

    The browser binary is fixed to /usr/bin/google-chrome-unstable and is
    started with the --enable-dom-distiller and --save-page-as-mhtml
    switches. Page-load and script timeouts are both set to 60 (seconds, per
    the selenium API).

    Returns:
      The new ``webdriver.Chrome`` instance.
    """
    opts = webdriver.ChromeOptions()
    opts.binary_location = "/usr/bin/google-chrome-unstable"
    for switch in ('--enable-dom-distiller', '--save-page-as-mhtml'):
        opts.add_argument(switch)

    chrome = webdriver.Chrome(chrome_options=opts)
    chrome.set_page_load_timeout(60)
    chrome.set_script_timeout(60)
    print("created a new chrome driver")
    return chrome
45
def writeAggregated(outdir, ext, out, in_marshal=False):
    """Merge all ``*.<ext>`` JSON files of a directory into one sorted file.

    Every regular file directly inside ``outdir`` whose extension equals
    ``ext`` is parsed as JSON. The parsed records are ordered by their
    ``'index'`` key and written as a single list to ``<outdir>/<out>``.

    Args:
      outdir: directory that holds the per-URL files and receives the output.
      ext: file extension to collect, without the leading dot.
      out: name of the aggregated file created inside ``outdir``.
      in_marshal: when true, serialize with ``marshal`` instead of JSON
        indented by two spaces.

    Raises:
      KeyError: if a collected record has no ``'index'`` key.
      ValueError: if a collected file is not valid JSON.
    """
    suffix = '.' + ext
    candidates = [os.path.join(outdir, name) for name in os.listdir(outdir)]
    sources = [path for path in candidates
               if os.path.isfile(path) and os.path.splitext(path)[1] == suffix]

    records = []
    print('reading %s files' % ext)
    for path in sources:
        with open(path) as infofile:
            records.append(json.load(infofile))
    print('done reading %s files' % ext)

    records.sort(key=lambda record: record['index'])

    print('writing %s files' % ext)
    target = '%s/%s' % (outdir, out)
    if in_marshal:
        import marshal
        # marshal emits bytes, so the file must be opened in binary mode.
        with open(target, 'wb') as outf:
            marshal.dump(records, outf)
    else:
        with open(target, 'w') as outf:
            json.dump(records, outf, indent=2)
    print('done writing %s files' % ext)
66
def writeIndex(outdir):
    """Aggregate the ``*.info`` files of ``outdir`` into ``<outdir>/index``."""
    writeAggregated(outdir, ext="info", out="index")
69
def writeFeature(outdir):
    """Aggregate ``*.feature`` files into a marshal file ``<outdir>/feature``."""
    writeAggregated(outdir, ext="feature", out="feature", in_marshal=True)
72
def main(argv):
    """Capture screenshots and features of pages and their distilled views.

    For each input URL, with ``i`` being its position in the input list, the
    following files are written under ``--out``:
      * ``<i>.png``: screenshot of the original page in a 1280x5000 window.
      * ``<i>.feature``: JSON with the result of running extract_features.js
        in the page.
      * ``<i>.mhtml``: saved page, only with ``--save-mhtml`` (driven through
        xdotool key presses).
      * ``<i>-distilled.png``: screenshot of the chrome-distiller:// view in
        a 640x5000 window.
      * ``<i>.info``: JSON record written last; its presence marks the URL as
        done.

    URLs are processed in random order and each is guarded by ``<i>.lock``,
    presumably so that several concurrent runs can share one output
    directory. Confirm this against how the script is deployed.

    With ``--write-index`` no page is visited; the existing ``.info`` and
    ``.feature`` files are aggregated instead, and no URLs are required.

    Args:
      argv: command-line arguments without the program name.

    Returns:
      0 on completion. 1 when the output directory exists without
      ``--force``, when it is missing with ``--resume``, or when neither
      positional URLs nor ``--urls-file`` were given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True)
    parser.add_argument('urls', nargs='*')
    parser.add_argument('--force', action='store_true')
    parser.add_argument('--urls-file')
    parser.add_argument('--resume', action='store_true')
    parser.add_argument('--write-index', action='store_true')
    parser.add_argument('--save-mhtml', action='store_true')
    options = parser.parse_args(argv)

    outdir = options.out
    if not options.resume:
        if os.path.exists(outdir):
            if not options.force:
                print(outdir + ' exists')
                return 1
            shutil.rmtree(outdir, ignore_errors=True)
        os.makedirs(outdir)
    else:
        if not os.path.exists(outdir):
            print(outdir + ' doesn\'t exist')
            return 1

    addBuildtoolsToPath()

    # Aggregation only reads files already in outdir, so it is handled before
    # the URL list is required.
    if options.write_index:
        writeIndex(outdir)
        writeFeature(outdir)
        print('index is written')
        return 0

    if options.urls:
        files = options.urls
    elif options.urls_file:
        with open(options.urls_file) as u:
            files = u.read().splitlines()
    else:
        print('oh no')
        return 1

    # Read the script (path is relative to the current working directory)
    # before launching Chrome, so a missing file cannot leave a browser
    # running.
    with open('extract_features.js') as js:
        feature_extractor = js.read()

    driver = newDriver()

    try:
        jobs = list(enumerate(files))
        random.shuffle(jobs)
        for i, f in jobs:
            prefix = '%s/%d' % (outdir, i)
            info = '%s.info' % prefix

            if os.path.exists(info):
                print("skip %d" % (i))
                continue

            with FileLock('%s.lock' % (prefix)):
                # Re-check after taking the lock: the .info file may have
                # been written while this process was waiting for it.
                if os.path.exists(info):
                    print("SKIP %d" % (i))
                    continue
                try:
                    ss = '%s.png' % prefix
                    dss = '%s-distilled.png' % prefix
                    fea = '%s.feature' % prefix

                    driver.set_window_size(1280, 5000)
                    driver.get(f)
                    time.sleep(3)  # wait for some async scripts
                    driver.save_screenshot(ss)
                    print("saved %s" % ss)

                    features = driver.execute_script(feature_extractor)
                    data = {
                        'index': i,
                        'url': f,
                        'features': features
                    }
                    with open(fea, 'w') as outf:
                        json.dump(data, outf, indent=2)
                    print("saved %s" % fea)

                    if options.save_mhtml:
                        mhtml = '%s.mhtml' % prefix
                        # Drive the browser's save dialog with synthetic key
                        # presses and type the absolute target path into it.
                        cmd = (
                            'xdotool key --clearmodifiers "ctrl+s" && ' +
                            'sleep 1 && ' +
                            'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
                            'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
                            'xdotool type --delay 10 --clearmodifiers "%s" && ' +
                            'xdotool key --delay 20 --clearmodifiers Return'
                        ) % (os.getcwd() + '/' + mhtml)
                        os.system(cmd)
                        time.sleep(3)  # wait for file saving
                        if not os.path.exists(mhtml):
                            # If the file is not saved, the focus point might
                            # be lost. Restart the whole xvfb environment to
                            # be safe.
                            print("[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml))
                            break

                    driver.set_window_size(640, 5000)
                    driver.get(getDistillerUrl(f))
                    time.sleep(20)  # wait for multi-page, etc
                    driver.save_screenshot(dss)
                    print("saved %s" % dss)

                    data = {
                        'index': i,
                        'url': f,
                        'screenshot': ss,
                        'distilled': dss,
                    }
                    with open(info, 'w') as infofile:
                        json.dump(data, infofile)

                except Exception as e:
                    # Best effort: report the failing URL, replace the
                    # browser with a fresh one, and move on to the next job.
                    print(e)
                    print("Index=%d URL=%s" % (i, f))
                    driver.quit()
                    driver = newDriver()

    finally:
        driver.quit()

    return 0
198
# Script entry point: run main() on the arguments after the program name and
# use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)
201
OLDNEW
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/index.html » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698