| Index: heuristics/distillable/get_screenshots.py
|
| diff --git a/heuristics/distillable/get_screenshots.py b/heuristics/distillable/get_screenshots.py
|
| new file mode 100755
|
| index 0000000000000000000000000000000000000000..8001add9f8e13f0958c30c5a28c1d75a8ddcd07a
|
| --- /dev/null
|
| +++ b/heuristics/distillable/get_screenshots.py
|
| @@ -0,0 +1,201 @@
|
| +#!/usr/bin/env python
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
| +import argparse
|
| +import json
|
| +import os
|
| +import shutil
|
| +import sys
|
| +import time
|
| +import urllib
|
| +import random
|
| +from lockfile import FileLock
|
| +
|
| +repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
|
| +
|
| +try:
|
| + from selenium import webdriver
|
| +except:
|
| + print 'ERROR:'
|
| + print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.' % repo_root
|
| + sys.exit(1)
|
| +
|
| +def addBuildtoolsToPath():
|
| + envPath = os.environ['PATH']
|
| + buildtoolsPath = repo_root + '/buildtools'
|
| + if not buildtoolsPath in envPath:
|
| + os.environ['PATH'] = buildtoolsPath + ':' + envPath
|
| +
|
| +def getDistillerUrl(u):
|
| + params = { 'url': u}
|
| + return "chrome-distiller://blah/?" + urllib.urlencode(params)
|
| +
|
| +def newDriver():
|
| + chromeOptions = webdriver.ChromeOptions()
|
| + chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
|
| + chromeOptions.add_argument('--enable-dom-distiller')
|
| + chromeOptions.add_argument('--save-page-as-mhtml')
|
| + driver = webdriver.Chrome(chrome_options=chromeOptions)
|
| + driver.set_page_load_timeout(60)
|
| + driver.set_script_timeout(60)
|
| + print "created a new chrome driver"
|
| + return driver
|
| +
|
| +def writeAggregated(outdir, ext, out, in_marshal=False):
|
| + prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]
|
| + prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)[1] == '.' + ext]
|
| + output = []
|
| + print 'reading %s files' % (ext)
|
| + for f in prevfiles:
|
| + with open(f) as infofile:
|
| + info = json.load(infofile)
|
| + output.append(info)
|
| + print 'done reading %s files' % (ext)
|
| +
|
| + output = sorted(output, key=lambda k: k['index'])
|
| + print 'writing %s files' % (ext)
|
| + with open('%s/%s' % (outdir, out), 'w') as outf:
|
| + if in_marshal:
|
| + import marshal
|
| + marshal.dump(output, outf)
|
| + else:
|
| + json.dump(output, outf, indent=2)
|
| + print 'done writing %s files' % (ext)
|
| +
|
| +def writeIndex(outdir):
|
| + writeAggregated(outdir, "info", "index")
|
| +
|
| +def writeFeature(outdir):
|
| + writeAggregated(outdir, "feature", "feature", in_marshal=True)
|
| +
|
| +def main(argv):
|
| + parser = argparse.ArgumentParser()
|
| + parser.add_argument('--out', required=True)
|
| + parser.add_argument('urls', nargs='*')
|
| + parser.add_argument('--force', action='store_true')
|
| + parser.add_argument('--urls-file')
|
| + parser.add_argument('--resume', action='store_true')
|
| + parser.add_argument('--write-index', action='store_true')
|
| + parser.add_argument('--save-mhtml', action='store_true')
|
| + options = parser.parse_args(argv)
|
| +
|
| + outdir = options.out
|
| + if not options.resume:
|
| + if os.path.exists(outdir):
|
| + if not options.force:
|
| + print outdir + ' exists'
|
| + return 1
|
| + shutil.rmtree(outdir, ignore_errors=True)
|
| + os.makedirs(outdir)
|
| + else:
|
| + if not os.path.exists(outdir):
|
| + print outdir + ' doesn\'t exist'
|
| + return 1
|
| +
|
| + addBuildtoolsToPath()
|
| +
|
| + if options.urls:
|
| + files = options.urls
|
| + elif options.urls_file:
|
| + with open(options.urls_file) as u:
|
| + files = u.read().splitlines()
|
| + else:
|
| + print 'oh no'
|
| + return 1
|
| +
|
| + if options.write_index:
|
| + writeIndex(outdir)
|
| + writeFeature(outdir)
|
| + print 'index is written'
|
| + return 0
|
| +
|
| + driver = newDriver()
|
| +
|
| + feature_extractor = open('extract_features.js').read()
|
| +
|
| + try:
|
| + jobs = list(enumerate(files))
|
| + random.shuffle(jobs)
|
| + for i, f in jobs:
|
| + prefix = '%s/%d' % (outdir, i)
|
| + info = '%s.info' % prefix
|
| +
|
| + if os.path.exists(info):
|
| + print "skip %d" % (i)
|
| + continue;
|
| +
|
| + with FileLock('%s.lock' % (prefix)):
|
| + if os.path.exists(info):
|
| + print "SKIP %d" % (i)
|
| + continue;
|
| + try:
|
| + ss = '%s.png' % prefix
|
| + dss = '%s-distilled.png' % prefix
|
| + fea = '%s.feature' % prefix
|
| +
|
| + driver.set_window_size(1280, 5000)
|
| + driver.get(f)
|
| + time.sleep(3) # wait for some async scripts
|
| + driver.save_screenshot(ss)
|
| + print "saved %s" % ss
|
| +
|
| + features = driver.execute_script(feature_extractor)
|
| + data = {
|
| + 'index': i,
|
| + 'url': f,
|
| + 'features': features
|
| + }
|
| + with open(fea, 'w') as outf:
|
| + json.dump(data, outf, indent=2)
|
| + print "saved %s" % fea
|
| +
|
| + if options.save_mhtml:
|
| + mhtml = '%s.mhtml' % prefix
|
| + cmd = (
|
| + 'xdotool key --clearmodifiers "ctrl+s" && ' +
|
| + 'sleep 1 && ' +
|
| + 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
|
| + 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
|
| + 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
|
| + 'xdotool key --delay 20 --clearmodifiers Return'
|
| + ) % (os.getcwd() + '/' + mhtml)
|
| + os.system(cmd)
|
| + time.sleep(3) # wait for file saving
|
| + if not os.path.exists(mhtml):
|
| + # If the file is not saved, the focus point might be lost.
|
| + # Restart the whole xvfb environment to be safe.
|
| + print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)
|
| + break
|
| +
|
| + driver.set_window_size(640, 5000)
|
| + driver.get(getDistillerUrl(f))
|
| + time.sleep(20) # wait for multi-page, etc
|
| + driver.save_screenshot(dss)
|
| + print "saved %s" % dss
|
| +
|
| + data = {
|
| + 'index': i,
|
| + 'url': f,
|
| + 'screenshot': ss,
|
| + 'distilled': dss,
|
| + }
|
| + with open(info, 'w') as info:
|
| + json.dump(data, info)
|
| +
|
| + except Exception as e:
|
| + print e
|
| + print "Index=%d URL=%s" % (i, f)
|
| + driver.quit()
|
| + driver = newDriver()
|
| + pass
|
| +
|
| + finally:
|
| + driver.quit()
|
| +
|
| + return 0
|
| +
|
| +if __name__ == '__main__':
|
| + sys.exit(main(sys.argv[1:]))
|
| +
|
|
|