OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
| 6 import argparse |
| 7 import json |
| 8 import os |
| 9 import shutil |
| 10 import sys |
| 11 import time |
| 12 import urllib |
| 13 |
# Selenium is an optional third-party dependency, so a missing install is
# reported with an actionable hint instead of a raw traceback.
try:
  from selenium import webdriver
except ImportError:
  # Only a failed import is handled here. A bare `except:` would also swallow
  # KeyboardInterrupt/SystemExit and disguise unrelated errors as a missing
  # dependency.
  print('ERROR:')
  print('Couldn\'t import webdriver. Please run `sudo ./install-build-deps.sh`.')
  sys.exit(1)
| 20 |
# Absolute path of the directory that contains this script.
self_dir = os.path.dirname(os.path.abspath(__file__))
| 22 |
def addBuildtoolsToPath(base_dir=None):
  """Prepend the `buildtools` directory to the PATH environment variable.

  The directory is only added when it is not already one of the PATH entries,
  so calling this function repeatedly does not grow PATH.

  Args:
    base_dir: Directory that contains `buildtools`. Defaults to the directory
      of this script (`self_dir`).
  """
  if base_dir is None:
    base_dir = self_dir
  buildtools = os.path.join(base_dir, 'buildtools')

  # PATH may be unset in a stripped-down environment; treat that as empty
  # rather than raising KeyError.
  current = os.environ.get('PATH', '')

  # Compare whole PATH entries. A plain substring test would be satisfied by
  # any unrelated entry that merely contains the word "buildtools".
  if buildtools in current.split(os.pathsep):
    return

  # Avoid a trailing separator when PATH is empty: on POSIX an empty entry
  # means "current directory".
  if current:
    os.environ['PATH'] = buildtools + os.pathsep + current
  else:
    os.environ['PATH'] = buildtools
| 27 |
def newDriver():
  """Start a fresh Chrome WebDriver session for feature extraction.

  Chrome is launched with the `--enable-dom-distiller` switch, a 1600x5000
  window, a page-load timeout of 20 and a script timeout of 30 (units as
  defined by the Selenium API).

  Returns:
    The configured `webdriver.Chrome` instance. The caller is responsible for
    calling `quit()` on it.
  """
  opts = webdriver.ChromeOptions()
  opts.add_argument('--enable-dom-distiller')

  browser = webdriver.Chrome(chrome_options=opts)
  browser.set_window_size(1600, 5000)
  browser.set_page_load_timeout(20)
  browser.set_script_timeout(30)
  return browser
| 36 |
def _loadUrls(options):
  """Return the URLs to process, or None when no URL source was given."""
  if options.urls:
    return options.urls
  if options.urls_file:
    with open(options.urls_file) as u:
      return u.read().splitlines()
  return None


def _prepareOutdir(outdir, force, restart):
  """Create (or, for a restart, validate) the output directory.

  Returns True when the directory is ready for use, False after printing the
  reason it is not.
  """
  if restart:
    if not os.path.exists(outdir):
      print(outdir + ' doesn\'t exist')
      return False
    return True

  if os.path.exists(outdir):
    if not force:
      print(outdir + ' exists')
      return False
    shutil.rmtree(outdir, ignore_errors=True)
  os.makedirs(outdir)
  return True


def _loadPreviousResults(outdir):
  """Load the per-URL `.features` files of an earlier run, sorted by index."""
  results = []
  for name in os.listdir(outdir):
    path = os.path.join(outdir, name)
    # The combined index file is named `features` (no extension), so it is
    # not matched here.
    if os.path.isfile(path) and os.path.splitext(name)[1] == '.features':
      with open(path) as infofile:
        results.append(json.load(infofile))
  results.sort(key=lambda entry: entry['index'])
  return results


def main(argv):
  """Extract features from a list of URLs with Chrome and store them as JSON.

  Every URL is loaded in a WebDriver-controlled Chrome, the JavaScript in
  `extract_features.js` is executed in the page, and the result is written to
  `<out>/<index>.features`. After all URLs are processed, the collected
  entries are written to `<out>/features`.

  Command-line arguments:
    --out DIR        Output directory (required).
    urls             URLs to process; takes precedence over --urls-file.
    --urls-file FILE File with one URL per line.
    --force          Delete an existing output directory first.
    --restart        Continue a previous run in an existing output directory,
                     starting after the highest index already stored there.

  Args:
    argv: Command-line arguments without the program name.

  Returns:
    0 on success; 1 when no URLs were supplied or the output directory is in
    the wrong state for the requested mode.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('urls', nargs='*')
  parser.add_argument('--force', action='store_true')
  parser.add_argument('--urls-file')
  parser.add_argument('--restart', action='store_true')
  options = parser.parse_args(argv)

  # Resolve the inputs before touching the output directory, so that
  # `--force` cannot delete existing results for a run that has no URLs.
  files = _loadUrls(options)
  if files is None:
    print('ERROR: no URLs given; pass URLs as arguments or use --urls-file')
    return 1

  outdir = options.out
  if not _prepareOutdir(outdir, options.force, options.restart):
    return 1

  # NOTE: the script path is relative, i.e. resolved against the current
  # working directory. It is read before the browser is started so that a
  # missing file cannot leave a Chrome process behind.
  with open('extract_features.js') as script:
    feature_extractor = script.read()

  output = []
  startIndex = 0
  if options.restart:
    output = _loadPreviousResults(outdir)
    if output:
      startIndex = max(entry['index'] for entry in output) + 1
    print('starting at  %s' % startIndex)

  addBuildtoolsToPath()

  driver = newDriver()
  try:
    for i, f in enumerate(files):
      if i < startIndex:
        continue
      try:
        driver.get(f)
        # Brief pause after load before running the extractor (presumably to
        # let the page settle).
        time.sleep(0.5)
        features = driver.execute_script(feature_extractor)
        data = {
          'index': i,
          'url': f,
          'features': features
        }
        output.append(data)
        with open(os.path.join(outdir, '%d.features' % i), 'w') as info:
          json.dump(data, info)
      except Exception as e:
        # Deliberate best effort: a failing URL is reported and skipped, and
        # the browser is replaced before continuing with the next URL.
        print(e)
        driver.quit()
        driver = newDriver()
  finally:
    driver.quit()

  with open(os.path.join(outdir, 'features'), 'w') as index:
    json.dump(output, index)
  return 0
| 117 |
# Script entry point: run main() on the command-line arguments (program name
# stripped) and use its return value as the process exit status.
if __name__ == '__main__':
  exit_code = main(sys.argv[1:])
  sys.exit(exit_code)
| 120 |
OLD | NEW |