OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 import argparse |
| 7 import json |
| 8 import os |
| 9 import shutil |
| 10 import sys |
| 11 import time |
| 12 import urllib |
| 13 import random |
| 14 from lockfile import FileLock |
| 15 |
# Absolute path of the directory two levels above the one containing this
# script; used to locate buildtools and install-build-deps.sh.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))

try:
  from selenium import webdriver
except ImportError:
  # Catch only a failed import: a bare `except:` would also trap
  # KeyboardInterrupt/SystemExit and report unrelated failures as a missing
  # dependency.
  print('ERROR:')
  print('Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.'
        % repo_root)
  sys.exit(1)
| 24 |
def addBuildtoolsToPath():
  """Prepend `<repo_root>/buildtools` to the PATH environment variable.

  The directory is added only when it is not already one of the PATH entries.
  Entries are compared exactly rather than by substring, so a different
  directory whose name merely contains the buildtools path does not suppress
  the addition. An unset or empty PATH is handled without raising.
  """
  buildtoolsPath = repo_root + '/buildtools'
  envPath = os.environ.get('PATH', '')
  entries = envPath.split(os.pathsep) if envPath else []
  if buildtoolsPath not in entries:
    os.environ['PATH'] = os.pathsep.join([buildtoolsPath] + entries)
| 30 |
def getDistillerUrl(u):
  """Return the chrome-distiller:// viewer URL for the page at `u`.

  Args:
    u: URL of the original page; it is passed percent-encoded as the `url`
      query parameter.

  Returns:
    A string of the form "chrome-distiller://blah/?url=<encoded u>".
  """
  # urlencode lives in `urllib` on Python 2 and in `urllib.parse` on Python 3.
  try:
    from urllib import urlencode
  except ImportError:
    from urllib.parse import urlencode
  params = {'url': u}
  return "chrome-distiller://blah/?" + urlencode(params)
| 34 |
def newDriver():
  """Start a Chrome WebDriver session configured for DOM distiller capture.

  Chrome is launched from /usr/bin/google-chrome-unstable with the
  --enable-dom-distiller and --save-page-as-mhtml switches, and both the page
  load and script timeouts are set to 60 seconds.

  Returns:
    The newly created selenium Chrome driver.
  """
  opts = webdriver.ChromeOptions()
  opts.binary_location = "/usr/bin/google-chrome-unstable"
  for switch in ('--enable-dom-distiller', '--save-page-as-mhtml'):
    opts.add_argument(switch)

  browser = webdriver.Chrome(chrome_options=opts)
  timeout_secs = 60
  browser.set_page_load_timeout(timeout_secs)
  browser.set_script_timeout(timeout_secs)
  print("created a new chrome driver")
  return browser
| 45 |
def writeAggregated(outdir, ext, out, in_marshal=False):
  """Merge every `<outdir>/*.<ext>` JSON file into one file sorted by index.

  Args:
    outdir: Directory holding the per-page JSON files; the aggregated file is
      written into the same directory.
    ext: Extension, without the leading dot, of the files to aggregate.
    out: File name of the aggregated output inside `outdir`.
    in_marshal: When true the list is serialized with `marshal`; otherwise it
      is written as JSON indented by two spaces.

  Raises:
    KeyError: if one of the loaded entries has no 'index' key.
  """
  suffix = '.' + ext
  candidates = [os.path.join(outdir, name) for name in os.listdir(outdir)]
  prevfiles = [path for path in candidates
               if os.path.isfile(path) and os.path.splitext(path)[1] == suffix]

  print('reading %s files' % ext)
  output = []
  for path in prevfiles:
    with open(path) as infofile:
      output.append(json.load(infofile))
  print('done reading %s files' % ext)

  output.sort(key=lambda entry: entry['index'])

  print('writing %s files' % ext)
  outpath = os.path.join(outdir, out)
  if in_marshal:
    import marshal
    # marshal emits bytes, so the file has to be opened in binary mode.
    with open(outpath, 'wb') as outf:
      marshal.dump(output, outf)
  else:
    with open(outpath, 'w') as outf:
      json.dump(output, outf, indent=2)
  print('done writing %s files' % ext)
| 66 |
def writeIndex(outdir):
  """Aggregate all `*.info` files in `outdir` into the JSON file `index`."""
  writeAggregated(outdir, ext="info", out="index")
| 69 |
def writeFeature(outdir):
  """Aggregate all `*.feature` files in `outdir` into the marshal file `feature`."""
  writeAggregated(outdir, ext="feature", out="feature", in_marshal=True)
| 72 |
def main(argv):
  """Capture screenshots and features of pages and of their distilled form.

  For every URL the original page is loaded in Chrome, a screenshot and the
  result of running extract_features.js are saved, optionally the page is
  saved as MHTML through xdotool key presses, and finally the distilled
  version of the page is loaded and captured as well. Output files are named
  `<out>/<index>.<suffix>`, where index is the position of the URL in the
  input list.

  Command-line arguments:
    --out: Output directory (required).
    urls: URLs to process, given positionally.
    --urls-file: File with one URL per line; used when no positional URLs
      are given.
    --force: Delete an already existing output directory first.
    --resume: Reuse an existing output directory; entries whose `.info` file
      exists are skipped.
    --write-index: Only aggregate the per-page files into `index` and
      `feature`, then exit.
    --save-mhtml: Also save each page as `<index>.mhtml`.

  Args:
    argv: Command-line arguments without the program name.

  Returns:
    1 if the output directory exists without --force or --resume, if it is
    missing with --resume, or if no URLs were supplied; 0 otherwise,
    including when processing stops early because an MHTML snapshot is
    missing.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('urls', nargs='*')
  parser.add_argument('--force', action='store_true')
  parser.add_argument('--urls-file')
  parser.add_argument('--resume', action='store_true')
  parser.add_argument('--write-index', action='store_true')
  parser.add_argument('--save-mhtml', action='store_true')
  options = parser.parse_args(argv)

  outdir = options.out
  if not options.resume:
    if os.path.exists(outdir):
      if not options.force:
        print(outdir + ' exists')
        return 1
      shutil.rmtree(outdir, ignore_errors=True)
    os.makedirs(outdir)
  else:
    if not os.path.exists(outdir):
      print(outdir + ' doesn\'t exist')
      return 1

  addBuildtoolsToPath()

  # Aggregation only reads the output directory, so it must not depend on
  # URLs having been supplied.
  if options.write_index:
    writeIndex(outdir)
    writeFeature(outdir)
    print('index is written')
    return 0

  if options.urls:
    files = options.urls
  elif options.urls_file:
    with open(options.urls_file) as u:
      files = u.read().splitlines()
  else:
    print('oh no')
    return 1

  # Read the script before launching Chrome so that a missing file cannot
  # leave a browser process behind, and close the file handle right away.
  with open('extract_features.js') as script:
    feature_extractor = script.read()

  driver = newDriver()

  try:
    jobs = list(enumerate(files))
    # NOTE(review): presumably shuffled so that several instances sharing the
    # same output directory rarely contend for the same entry -- confirm.
    random.shuffle(jobs)
    for i, f in jobs:
      prefix = '%s/%d' % (outdir, i)
      info = '%s.info' % prefix

      if os.path.exists(info):
        print("skip %d" % i)
        continue

      with FileLock('%s.lock' % prefix):
        # Check again now that the lock is held: the entry may have been
        # completed while this process was waiting for it.
        if os.path.exists(info):
          print("SKIP %d" % i)
          continue
        try:
          ss = '%s.png' % prefix
          dss = '%s-distilled.png' % prefix
          fea = '%s.feature' % prefix

          driver.set_window_size(1280, 5000)
          driver.get(f)
          time.sleep(3)  # wait for some async scripts
          driver.save_screenshot(ss)
          print("saved %s" % ss)

          features = driver.execute_script(feature_extractor)
          data = {
            'index': i,
            'url': f,
            'features': features
          }
          with open(fea, 'w') as outf:
            json.dump(data, outf, indent=2)
          print("saved %s" % fea)

          if options.save_mhtml:
            mhtml = '%s.mhtml' % prefix
            # os.path.join keeps an absolute --out intact instead of gluing
            # the working directory in front of it.
            # NOTE(review): the path is interpolated into a shell command
            # inside double quotes; an --out value containing quotes or `$`
            # would break the command.
            cmd = (
                'xdotool key --clearmodifiers "ctrl+s" && ' +
                'sleep 1 && ' +
                'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
                'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
                'xdotool type --delay 10 --clearmodifiers "%s" && ' +
                'xdotool key --delay 20 --clearmodifiers Return'
            ) % os.path.join(os.getcwd(), mhtml)
            os.system(cmd)
            time.sleep(3)  # wait for file saving
            if not os.path.exists(mhtml):
              # If the file is not saved, the focus point might be lost.
              # Restart the whole xvfb environment to be safe.
              print("[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml))
              # Leaves the loop; the driver is closed by the finally clause.
              break

          driver.set_window_size(640, 5000)
          driver.get(getDistillerUrl(f))
          time.sleep(20)  # wait for multi-page, etc
          driver.save_screenshot(dss)
          print("saved %s" % dss)

          data = {
            'index': i,
            'url': f,
            'screenshot': ss,
            'distilled': dss,
          }
          # The .info file is written last; its existence is what marks the
          # entry as done for the skip checks above.
          with open(info, 'w') as info_file:
            json.dump(data, info_file)

        except Exception as e:
          # Report the failure and continue with a fresh browser; the entry
          # has no .info file and is therefore retried by a later --resume run.
          print(e)
          print("Index=%d URL=%s" % (i, f))
          driver.quit()
          driver = newDriver()

  finally:
    driver.quit()

  return 0
| 198 |
if __name__ == '__main__':
  # Executed as a script: main()'s return value becomes the exit status.
  exit_code = main(sys.argv[1:])
  sys.exit(exit_code)
| 201 |
OLD | NEW |