Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(295)

Side by Side Diff: heuristics/distillable/get_screenshots.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/server.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved. 2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 import argparse 6 import argparse
7 import json 7 import json
8 import os 8 import os
9 import shutil 9 import shutil
10 import sys 10 import sys
11 import time 11 import time
12 import urllib 12 import urllib
13 import random 13 import random
14 from lockfile import FileLock 14 from lockfile import FileLock
15 15
16 from calculate_derived_features import CalcDerivedFeatures
17
16 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) 18 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
17 19
18 try: 20 try:
19 from selenium import webdriver 21 from selenium import webdriver
22 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
20 except: 23 except:
21 print 'ERROR:' 24 print 'ERROR:'
22 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`. ' % repo_root 25 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`. ' % repo_root
23 sys.exit(1) 26 sys.exit(1)
24 27
25 def addBuildtoolsToPath(): 28 def addBuildtoolsToPath():
26 envPath = os.environ['PATH'] 29 envPath = os.environ['PATH']
27 buildtoolsPath = repo_root + '/buildtools' 30 buildtoolsPath = repo_root + '/buildtools'
28 if not buildtoolsPath in envPath: 31 if not buildtoolsPath in envPath:
29 os.environ['PATH'] = buildtoolsPath + ':' + envPath 32 os.environ['PATH'] = buildtoolsPath + ':' + envPath
30 33
31 def getDistillerUrl(u): 34 def getDistillerUrl(u):
32 params = { 'url': u} 35 params = { 'url': u}
33 return "chrome-distiller://blah/?" + urllib.urlencode(params) 36 return "chrome-distiller://blah/?" + urllib.urlencode(params)
34 37
35 def newDriver(): 38 def newDriver(mobile=False):
36 chromeOptions = webdriver.ChromeOptions() 39 chromeOptions = webdriver.ChromeOptions()
37 chromeOptions.binary_location = "/usr/bin/google-chrome-unstable"; 40 # If you want to use a different version of chrome, specify the full path here.
41 #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
38 chromeOptions.add_argument('--enable-dom-distiller') 42 chromeOptions.add_argument('--enable-dom-distiller')
39 chromeOptions.add_argument('--save-page-as-mhtml') 43 chromeOptions.add_argument('--save-page-as-mhtml')
40 driver = webdriver.Chrome(chrome_options=chromeOptions) 44 chromeOptions.add_argument('--reader-mode-heuristics=adaboost')
45 chromeOptions.add_argument('--distillability-dev')
46 if mobile:
47 mobile_emulation = { "deviceName": "Google Nexus 5" }
48 chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation)
49
50 d = DesiredCapabilities.CHROME
51 # This is to enable accessing devtools console log from here, for nativeFeatures().
52 d['loggingPrefs'] = {'browser': 'ALL'}
53 driver = webdriver.Chrome(chrome_options=chromeOptions, desired_capabilities=d )
41 driver.set_page_load_timeout(60) 54 driver.set_page_load_timeout(60)
42 driver.set_script_timeout(60) 55 driver.set_script_timeout(60)
43 print "created a new chrome driver" 56 print "created a new chrome driver"
44 return driver 57 return driver
45 58
59 def nativeFeatures(logs):
60 return _parseNative(logs, 'distillability_features = ')
61
62 def nativeClassification(logs):
63 return _parseNative(logs, 'adaboost_classification = ')
64
65 def _parseNative(logs, needle):
66 """Parse console logs from Chrome and get decoded JSON.
67
68 Args:
69 logs: Chrome log object
70 needle (str): the string leading the actual JSON.
71
72 Example:
73 >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=')
74 {u'a': [1, 2]}
75 """
76 ret = None
77 for log in logs:
78 message = log['message']
79 loc = message.find(needle)
80 if loc >= 0:
81 ret = json.loads(message[loc+len(needle):])
82 return ret
83
84 def saveFeatures(driver, feature_extractor, data, url_override, filename):
85 data = dict.copy(data)
86 features = driver.execute_script(feature_extractor)
87 if url_override:
88 features['url'] = url_override
89 data['features'] = features
90
91 logs = driver.get_log('browser')
92 native = nativeClassification(logs)
93 if native:
94 native['features'] = nativeFeatures(logs)
95 data['native'] = native
96
97 with open(filename, 'w') as outf:
98 json.dump(data, outf, indent=2, sort_keys=True)
99 print "saved %s" % filename
100
101 derived = dict.copy(data)
102 derived['features'] = CalcDerivedFeatures(data['index'], features)
103
104 derived_name = filename + '-derived'
105 with open(derived_name, 'w') as outf:
106 json.dump(derived, outf, indent=2, sort_keys=True)
107 print "saved %s" % derived_name
108 return data, derived
109
110 def saveInfoFile(data, ss, dss, filename):
111 data = dict.copy(data)
112 data['screenshot'] = ss
113 data['distilled'] = dss
114 with open(filename, 'w') as info:
115 json.dump(data, info)
116
117 def saveMHTML(filename):
118 """Save current page as an MHTML file
119
120 This is done by issuing xdotool commands.
121 Dependencies:
122 - Command line argument "--save-page-as-mhtml" to Chrome.
123 - xdotool
124 """
125
126 cmd = (
127 'xdotool key --clearmodifiers "ctrl+s" && ' +
128 'sleep 1 && ' +
129 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
130 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
131 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
132 'xdotool key --delay 20 --clearmodifiers Return'
133 ) % (os.path.abspath(filename))
134 os.system(cmd)
135 time.sleep(3) # wait for file saving
136 if not os.path.exists(filename):
137 return False
138 print "saved %s" % filename
139 return True
140
46 def writeAggregated(outdir, ext, out, in_marshal=False): 141 def writeAggregated(outdir, ext, out, in_marshal=False):
47 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)] 142 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]
48 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f) [1] == '.' + ext] 143 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f) [1] == '.' + ext]
49 output = [] 144 output = []
50 print 'reading %s files' % (ext) 145 print 'reading %s files' % (ext)
51 for f in prevfiles: 146 for f in prevfiles:
52 with open(f) as infofile: 147 with open(f) as infofile:
53 info = json.load(infofile) 148 info = json.load(infofile)
54 output.append(info) 149 output.append(info)
55 print 'done reading %s files' % (ext) 150 print 'done reading %s files' % (ext)
56 151
57 output = sorted(output, key=lambda k: k['index']) 152 output = sorted(output, key=lambda k: k['index'])
58 print 'writing %s files' % (ext) 153 print 'writing %s files' % (ext)
59 with open('%s/%s' % (outdir, out), 'w') as outf: 154 with open('%s/%s' % (outdir, out), 'w') as outf:
60 if in_marshal: 155 if in_marshal:
61 import marshal 156 import marshal
62 marshal.dump(output, outf) 157 marshal.dump(output, outf)
63 else: 158 else:
64 json.dump(output, outf, indent=2) 159 json.dump(output, outf, indent=2)
65 print 'done writing %s files' % (ext) 160 print 'done writing %s files' % (ext)
66 161
67 def writeIndex(outdir): 162 def writeIndex(outdir):
68 writeAggregated(outdir, "info", "index") 163 writeAggregated(outdir, "info", "index")
69 164
70 def writeFeature(outdir): 165 def writeFeature(outdir):
71 writeAggregated(outdir, "feature", "feature", in_marshal=True) 166 for n in ["feature-derived", "dfeature-derived", "mfeature-derived", "mdfeatur e-derived"]:
167 writeAggregated(outdir, n, n)
168 # Use the following when needing aggregated raw features:
169 #writeAggregated(outdir, "feature", "feature", in_marshal=True)
170 #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True)
171 #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True)
172 #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True)
173
174 def shouldProcess(load_mhtml, no_distill, prefix):
175 info = prefix + '.info'
176 mhtml = prefix + '.mhtml'
177 mfeature = prefix + '.mfeature'
178 mdfeature = prefix + '.mdfeature'
179 if not load_mhtml:
180 return not os.path.exists(info)
181 else:
182 if no_distill:
183 return os.path.exists(mhtml) and not os.path.exists(mfeature)
184 else:
185 return os.path.exists(mhtml) and not os.path.exists(mdfeature)
72 186
73 def main(argv): 187 def main(argv):
74 parser = argparse.ArgumentParser() 188 parser = argparse.ArgumentParser()
75 parser.add_argument('--out', required=True) 189 parser.add_argument('--out', required=True)
76 parser.add_argument('urls', nargs='*') 190 parser.add_argument('urls', nargs='*')
77 parser.add_argument('--force', action='store_true') 191 parser.add_argument('--force', action='store_true')
78 parser.add_argument('--urls-file') 192 parser.add_argument('--urls-file')
193 parser.add_argument('--emulate-mobile', action='store_true')
79 parser.add_argument('--resume', action='store_true') 194 parser.add_argument('--resume', action='store_true')
80 parser.add_argument('--write-index', action='store_true') 195 parser.add_argument('--write-index', action='store_true')
81 parser.add_argument('--save-mhtml', action='store_true') 196 parser.add_argument('--save-mhtml', action='store_true')
197 parser.add_argument('--load-mhtml', action='store_true')
198 parser.add_argument('--skip-distillation', action='store_true')
199 parser.add_argument('--desktop-distillable-only', action='store_true')
82 options = parser.parse_args(argv) 200 options = parser.parse_args(argv)
83 201
202 if options.load_mhtml:
203 if options.save_mhtml:
204 print '--load-mhtml is not compatible with --save-mhtml'
205 return 1
206 if options.resume:
207 print '--load-mhtml is not compatible with --resume'
208 return 1
209
84 outdir = options.out 210 outdir = options.out
85 if not options.resume: 211 if not options.resume and not options.load_mhtml and not options.write_index:
86 if os.path.exists(outdir): 212 if os.path.exists(outdir):
87 if not options.force: 213 if not options.force:
88 print outdir + ' exists' 214 print outdir + ' exists'
89 return 1 215 return 1
90 shutil.rmtree(outdir, ignore_errors=True) 216 shutil.rmtree(outdir, ignore_errors=True)
91 os.makedirs(outdir) 217 os.makedirs(outdir)
92 else: 218 else:
93 if not os.path.exists(outdir): 219 if not os.path.exists(outdir):
94 print outdir + ' doesn\'t exist' 220 print outdir + ' doesn\'t exist'
95 return 1 221 return 1
96 222
97 addBuildtoolsToPath() 223 addBuildtoolsToPath()
98 224
99 if options.urls: 225 if options.urls:
100 files = options.urls 226 files = options.urls
101 elif options.urls_file: 227 elif options.urls_file:
102 with open(options.urls_file) as u: 228 with open(options.urls_file) as u:
103 files = u.read().splitlines() 229 files = u.read().splitlines()
104 else: 230 else:
105 print 'oh no' 231 print 'oh no'
106 return 1 232 return 1
107 233
108 if options.write_index: 234 if options.write_index:
109 writeIndex(outdir) 235 writeIndex(outdir)
110 writeFeature(outdir) 236 writeFeature(outdir)
111 print 'index is written' 237 print 'index is written'
112 return 0 238 return 0
113 239
114 driver = newDriver() 240 driver = newDriver(options.emulate_mobile)
115 241
116 feature_extractor = open('extract_features.js').read() 242 feature_extractor = open('extract_features.js').read()
117 243
118 try: 244 try:
119 jobs = list(enumerate(files)) 245 jobs = list(enumerate(files))
120 random.shuffle(jobs) 246 random.shuffle(jobs)
121 for i, f in jobs: 247 for i, f in jobs:
122 prefix = '%s/%d' % (outdir, i) 248 prefix = '%s/%d' % (outdir, i)
123 info = '%s.info' % prefix 249 info = '%s.info' % prefix
250 basedata = {'index': i, 'url': f}
124 251
125 if os.path.exists(info): 252 if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix ):
126 print "skip %d" % (i) 253 print "skip %d" % (i)
127 continue; 254 continue;
128 255
129 with FileLock('%s.lock' % (prefix)): 256 with FileLock('%s.lock' % (prefix)):
130 if os.path.exists(info): 257 if not shouldProcess(options.load_mhtml, options.skip_distillation, pref ix):
131 print "SKIP %d" % (i) 258 print "skip %d" % (i)
132 continue; 259 continue;
133 try: 260 try:
134 ss = '%s.png' % prefix 261 ss = '%s.png' % prefix
135 dss = '%s-distilled.png' % prefix 262 dss = '%s-distilled.png' % prefix
136 fea = '%s.feature' % prefix 263 fea = '%s.feature' % prefix
264 dfea = '%s.dfeature' % prefix
265 mhtml = '%s.mhtml' % prefix
266 mhtml_url = 'file://%s' % os.path.abspath(mhtml)
137 267
138 driver.set_window_size(1280, 5000) 268 if options.emulate_mobile:
139 driver.get(f) 269 driver.set_window_size(400, 800)
140 time.sleep(3) # wait for some async scripts 270 else:
141 driver.save_screenshot(ss) 271 driver.set_window_size(1280, 5000)
142 print "saved %s" % ss 272 if options.load_mhtml:
273 if not os.path.exists(mhtml):
274 print "SKIP %d, no mhtml" % (i)
275 continue
276 driver.get(mhtml_url)
277 time.sleep(1) # wait a bit for things to stabilize
278 else:
279 driver.get(f)
280 time.sleep(3) # wait for some async scripts
281 driver.save_screenshot(ss)
282 print "saved %s" % ss
143 283
144 features = driver.execute_script(feature_extractor) 284 url_override = None
145 data = { 285 if options.load_mhtml:
146 'index': i, 286 with open(fea) as infile:
147 'url': f, 287 # otherwise it would be file:// of mhtml
148 'features': features 288 url_override = json.load(infile)['features']['url']
149 } 289 fea = '%s.mfeature' % prefix
150 with open(fea, 'w') as outf: 290 _, derived = saveFeatures(driver, feature_extractor, basedata, url_ove rride, fea)
151 json.dump(data, outf, indent=2) 291
152 print "saved %s" % fea 292 if options.desktop_distillable_only:
293 if derived['native']['features']['isMobileFriendly'] or not derived[ 'native']['distillable']:
294 os.system('rm %s.feature %s.png' % (prefix, prefix))
295 saveInfoFile(basedata, ss, dss, info)
296 continue
153 297
154 if options.save_mhtml: 298 if options.save_mhtml:
155 mhtml = '%s.mhtml' % prefix 299 if not saveMHTML(mhtml):
156 cmd = (
157 'xdotool key --clearmodifiers "ctrl+s" && ' +
158 'sleep 1 && ' +
159 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
160 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
161 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
162 'xdotool key --delay 20 --clearmodifiers Return'
163 ) % (os.getcwd() + '/' + mhtml)
164 os.system(cmd)
165 time.sleep(3) # wait for file saving
166 if not os.path.exists(mhtml):
167 # If the file is not saved, the focus point might be lost. 300 # If the file is not saved, the focus point might be lost.
168 # Restart the whole xvfb environment to be safe. 301 # Restart the whole xvfb environment to be safe.
169 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtm l) 302 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtm l)
170 break 303 break
171 304
172 driver.set_window_size(640, 5000) 305 if options.skip_distillation:
306 continue
307
308 if options.emulate_mobile:
309 driver.set_window_size(400, 800)
310 else:
311 driver.set_window_size(640, 5000)
312
313 if options.load_mhtml:
314 driver.get(getDistillerUrl(mhtml_url))
315 time.sleep(10)
316 dss = '%s-mdistilled.png' % prefix
317 driver.save_screenshot(dss)
318 print "saved %s" % dss
319 dfea = '%s.mdfeature' % prefix
320 saveFeatures(driver, feature_extractor, basedata, None, dfea)
321 continue
322
173 driver.get(getDistillerUrl(f)) 323 driver.get(getDistillerUrl(f))
174 time.sleep(20) # wait for multi-page, etc 324 for i in range(3):
175 driver.save_screenshot(dss) 325 time.sleep(20) # wait for multi-page, etc
176 print "saved %s" % dss 326 driver.save_screenshot(dss)
327 print "saved %s" % dss
328 feature, _ = saveFeatures(driver, feature_extractor, basedata, None, dfea)
329 if feature['features']['innerText'] != "":
330 break
177 331
178 data = { 332 saveInfoFile(basedata, ss, dss, info)
179 'index': i,
180 'url': f,
181 'screenshot': ss,
182 'distilled': dss,
183 }
184 with open(info, 'w') as info:
185 json.dump(data, info)
186 333
187 except Exception as e: 334 except Exception as e:
188 print e 335 print e
189 print "Index=%d URL=%s" % (i, f) 336 print "Index=%d URL=%s" % (i, f)
190 driver.quit() 337 driver.quit()
191 driver = newDriver() 338 driver = newDriver()
192 pass 339 pass
193 340
194 finally: 341 finally:
195 driver.quit() 342 driver.quit()
196 343
197 return 0 344 return 0
198 345
199 if __name__ == '__main__': 346 if __name__ == '__main__':
200 sys.exit(main(sys.argv[1:])) 347 sys.exit(main(sys.argv[1:]))
201 348
OLDNEW
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/server.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698