Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(282)

Unified Diff: heuristics/distillable/get_screenshots.py

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/index.html » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/get_screenshots.py
diff --git a/heuristics/distillable/get_screenshots.py b/heuristics/distillable/get_screenshots.py
new file mode 100755
index 0000000000000000000000000000000000000000..8001add9f8e13f0958c30c5a28c1d75a8ddcd07a
--- /dev/null
+++ b/heuristics/distillable/get_screenshots.py
@@ -0,0 +1,201 @@
+#!/usr/bin/env python
+# Copyright 2016 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import argparse
+import json
+import os
+import shutil
+import sys
+import time
+import urllib
+import random
+from lockfile import FileLock
+
+# Absolute path of the repository root, two directory levels above this script.
+repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
+
+# selenium is a third-party dependency: when it is missing, exit with install
+# instructions instead of a traceback.
+try:
+  from selenium import webdriver
+except ImportError:
+  # Only a failed import is reported this way. Anything else (Ctrl-C, a
+  # SystemExit, an unrelated error raised while importing) propagates instead
+  # of being hidden behind the install hint.
+  print 'ERROR:'
+  print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.' % repo_root
+  sys.exit(1)
+
+def addBuildtoolsToPath():
+  """Put the repository's buildtools directory at the front of PATH.
+
+  The directory is repo_root + '/buildtools'. PATH is left untouched when that
+  directory is already one of its entries.
+  """
+  buildtoolsPath = repo_root + '/buildtools'
+  # An unset PATH is treated as empty instead of raising KeyError.
+  envPath = os.environ.get('PATH', '')
+  # Compare whole entries: a substring test on the raw string would also match
+  # unrelated directories such as '.../buildtools-old'.
+  entries = envPath.split(os.pathsep) if envPath else []
+  if buildtoolsPath not in entries:
+    os.environ['PATH'] = os.pathsep.join([buildtoolsPath] + entries)
+
+def getDistillerUrl(u):
+  """Return the chrome-distiller:// URL that carries |u| as its `url` parameter.
+
+  |u| is URL-encoded before being appended to the query string.
+  """
+  query = urllib.urlencode([('url', u)])
+  return "chrome-distiller://blah/?" + query
+
+def newDriver():
+  """Create and return a Chrome WebDriver configured for this script.
+
+  The browser binary is /usr/bin/google-chrome-unstable, started with the
+  --enable-dom-distiller and --save-page-as-mhtml switches. Page-load and
+  script timeouts are both set to 60.
+  """
+  opts = webdriver.ChromeOptions()
+  opts.binary_location = "/usr/bin/google-chrome-unstable"
+  for switch in ('--enable-dom-distiller', '--save-page-as-mhtml'):
+    opts.add_argument(switch)
+
+  browser = webdriver.Chrome(chrome_options=opts)
+  timeout = 60
+  browser.set_page_load_timeout(timeout)
+  browser.set_script_timeout(timeout)
+  print "created a new chrome driver"
+  return browser
+
+def writeAggregated(outdir, ext, out, in_marshal=False):
+  """Merge the JSON files with extension |ext| in |outdir| into one file.
+
+  Args:
+    outdir: Directory holding the per-page files; also receives the output.
+    ext: Extension, without the dot, of the JSON files to read.
+    out: Name of the aggregated file, created inside |outdir|.
+    in_marshal: Serialize the result with marshal when true; otherwise write
+        JSON indented by two spaces.
+
+  The loaded records are sorted by their 'index' value before being written.
+  """
+  suffix = '.' + ext
+  paths = [os.path.join(outdir, name) for name in os.listdir(outdir)]
+  paths = [p for p in paths
+           if os.path.isfile(p) and os.path.splitext(p)[1] == suffix]
+
+  print 'reading %s files' % (ext)
+  output = []
+  for path in paths:
+    with open(path) as infofile:
+      output.append(json.load(infofile))
+  print 'done reading %s files' % (ext)
+
+  output.sort(key=lambda k: k['index'])
+  print 'writing %s files' % (ext)
+  # marshal produces binary data, so that output needs a binary-mode handle;
+  # a text-mode handle may translate newline bytes and corrupt it.
+  with open(os.path.join(outdir, out), 'wb' if in_marshal else 'w') as outf:
+    if in_marshal:
+      import marshal
+      marshal.dump(output, outf)
+    else:
+      json.dump(output, outf, indent=2)
+  print 'done writing %s files' % (ext)
+
+def writeIndex(outdir):
+  """Aggregate the *.info files in |outdir| into the JSON file 'index' there."""
+  writeAggregated(outdir, ext="info", out="index", in_marshal=False)
+
+def writeFeature(outdir):
+  """Aggregate the *.feature files in |outdir| into the marshal file 'feature'."""
+  writeAggregated(outdir, ext="feature", out="feature", in_marshal=True)
+
+def main(argv):
+  """Capture screenshots and extracted features for a list of URLs.
+
+  For each URL this saves a screenshot of the page, the result of running
+  extract_features.js in it, optionally an MHTML snapshot (--save-mhtml), and
+  a screenshot of the distilled page. Output files are named after the URL's
+  position in the input list. An '.info' file is written last and marks the
+  URL as done; work on each URL is guarded by a '.lock' file.
+
+  With --write-index, no page is visited: the existing per-URL files in the
+  output directory are aggregated and the function returns.
+
+  Args:
+    argv: Command-line arguments, without the program name.
+
+  Returns:
+    1 when the output directory state conflicts with the flags or no URLs
+    were supplied, otherwise 0.
+  """
+  parser = argparse.ArgumentParser()
+  parser.add_argument('--out', required=True)
+  parser.add_argument('urls', nargs='*')
+  parser.add_argument('--force', action='store_true')
+  parser.add_argument('--urls-file')
+  parser.add_argument('--resume', action='store_true')
+  parser.add_argument('--write-index', action='store_true')
+  parser.add_argument('--save-mhtml', action='store_true')
+  options = parser.parse_args(argv)
+
+  outdir = options.out
+  if not options.resume:
+    # A fresh run needs an empty directory: refuse to reuse an existing one
+    # unless --force allows wiping it.
+    if os.path.exists(outdir):
+      if not options.force:
+        print outdir + ' exists'
+        return 1
+      shutil.rmtree(outdir, ignore_errors=True)
+    os.makedirs(outdir)
+  else:
+    if not os.path.exists(outdir):
+      print outdir + ' doesn\'t exist'
+      return 1
+
+  addBuildtoolsToPath()
+
+  # Aggregation only reads files already in |outdir|, so it is handled before
+  # the URL list is required.
+  if options.write_index:
+    writeIndex(outdir)
+    writeFeature(outdir)
+    print 'index is written'
+    return 0
+
+  if options.urls:
+    files = options.urls
+  elif options.urls_file:
+    with open(options.urls_file) as u:
+      files = u.read().splitlines()
+  else:
+    print 'No URLs given: pass them as arguments or use --urls-file'
+    return 1
+
+  # Load the script before launching Chrome so that a read failure cannot
+  # leave a browser process behind. The file lives next to this script, so it
+  # is located relative to it rather than to the current directory.
+  script_dir = os.path.dirname(os.path.abspath(__file__))
+  with open(os.path.join(script_dir, 'extract_features.js')) as js:
+    feature_extractor = js.read()
+
+  driver = newDriver()
+
+  try:
+    # Visit the URLs in random order; each keeps its original list position
+    # as its index.
+    jobs = list(enumerate(files))
+    random.shuffle(jobs)
+    for i, f in jobs:
+      prefix = '%s/%d' % (outdir, i)
+      info = '%s.info' % prefix
+
+      if os.path.exists(info):
+        print "skip %d" % (i)
+        continue
+
+      with FileLock('%s.lock' % (prefix)):
+        # Check again under the lock: the .info file may have been written
+        # while this process was waiting for it.
+        if os.path.exists(info):
+          print "SKIP %d" % (i)
+          continue
+        try:
+          ss = '%s.png' % prefix
+          dss = '%s-distilled.png' % prefix
+          fea = '%s.feature' % prefix
+
+          driver.set_window_size(1280, 5000)
+          driver.get(f)
+          time.sleep(3) # wait for some async scripts
+          driver.save_screenshot(ss)
+          print "saved %s" % ss
+
+          features = driver.execute_script(feature_extractor)
+          data = {
+            'index': i,
+            'url': f,
+            'features': features
+          }
+          with open(fea, 'w') as outf:
+            json.dump(data, outf, indent=2)
+          print "saved %s" % fea
+
+          if options.save_mhtml:
+            mhtml = '%s.mhtml' % prefix
+            # Drive the browser's save dialog through simulated key presses.
+            # abspath() keeps the typed file name correct whether --out is
+            # relative or absolute.
+            cmd = (
+              'xdotool key --clearmodifiers "ctrl+s" && ' +
+              'sleep 1 && ' +
+              'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
+              'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
+              'xdotool type --delay 10 --clearmodifiers "%s" && ' +
+              'xdotool key --delay 20 --clearmodifiers Return'
+            ) % os.path.abspath(mhtml)
+            os.system(cmd)
+            time.sleep(3) # wait for file saving
+            if not os.path.exists(mhtml):
+              # If the file is not saved, the focus point might be lost.
+              # Restart the whole xvfb environment to be safe.
+              print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)
+              # Give up on the remaining URLs; no .info file has been
+              # written for this one, so it is not marked as done.
+              break
+
+          driver.set_window_size(640, 5000)
+          driver.get(getDistillerUrl(f))
+          time.sleep(20) # wait for multi-page, etc
+          driver.save_screenshot(dss)
+          print "saved %s" % dss
+
+          data = {
+            'index': i,
+            'url': f,
+            'screenshot': ss,
+            'distilled': dss,
+          }
+          # Written last: the existence of this file is what marks the URL
+          # as finished.
+          with open(info, 'w') as infofile:
+            json.dump(data, infofile)
+
+        except Exception as e:
+          # Report the failure, then continue with the next URL in a fresh
+          # browser.
+          print e
+          print "Index=%d URL=%s" % (i, f)
+          driver.quit()
+          driver = newDriver()
+
+  finally:
+    driver.quit()
+
+  return 0
+
+# When executed as a script, run main() on the command-line arguments (program
+# name dropped) and use its return value as the process exit status.
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
+
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/index.html » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698