get_features.py - Issue 1289123002: Merge branch 'master' into heuristics

Side by Side Diff

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Keyboard Shortcuts

	File
u :	up to issue
j / k :	jump to file after / before current file
J / K :	jump to next file with a comment after / before current file
	Side-by-side diff
i :	toggle intra-line diffs
e :	expand all comments
c :	collapse all comments
s :	toggle showing all comments
n / p :	next / previous diff chunk or comment
N / P :	next / previous comment
<Up> / <Down> :	next / previous line

	Issue
u :	up to list of issues
j / k :	jump to patch after / before current patch
o / <Enter> :	open current patch in side-by-side view
i :	open current patch in unified diff view

	Issue List
j / k :	jump to issue after / before current issue
o / <Enter> :	open current issue

Side by Side Diff: get_features.py

Issue 1289123002: Merge branch 'master' into heuristics Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 #!/usr/bin/env python

	2 # Copyright 2014 The Chromium Authors. All rights reserved.

	3 # Use of this source code is governed by a BSD-style license that can be

	4 # found in the LICENSE file.

	5

	6 import argparse

	7 import json

	8 import os

	9 import shutil

	10 import sys

	11 import time

	12 import urllib

	13

	14 try:

	15 from selenium import webdriver

	16 except:

	17 print 'ERROR:'

	18 print 'Couldn\'t import webdriver. Please run `sudo ./install-build-deps.sh`.'

	19 sys.exit(1)

	20

	21 self_dir = os.path.abspath(os.path.dirname(__file__))

	22

	23 def addBuildtoolsToPath():

	24 envPath = os.environ['PATH']

	25 if not 'buildtools' in envPath:

	26 os.environ['PATH'] = '%s/buildtools:%s' % (self_dir, envPath)

	27

	28 def newDriver():

	29 chromeOptions = webdriver.ChromeOptions()

	30 chromeOptions.add_argument('--enable-dom-distiller')

	31 driver = webdriver.Chrome(chrome_options=chromeOptions)

	32 driver.set_window_size(1600, 5000)

	33 driver.set_page_load_timeout(20)

	34 driver.set_script_timeout(30)

	35 return driver

	36

	37 def main(argv):

	38 parser = argparse.ArgumentParser()

	39 parser.add_argument('--out', required=True)

	40 parser.add_argument('urls', nargs='*')

	41 parser.add_argument('--force', action='store_true')

	42 parser.add_argument('--urls-file')

	43 parser.add_argument('--restart', action='store_true')

	44 options = parser.parse_args(argv)

	45

	46 outdir = options.out

	47 if not options.restart:

	48 if os.path.exists(outdir):

	49 if not options.force:

	50 print outdir + ' exists'

	51 return 1

	52 shutil.rmtree(outdir, ignore_errors=True)

	53 os.makedirs(outdir)

	54 else:

	55 if not os.path.exists(outdir):

	56 print outdir + ' doesn\'t exist'

	57 return 1

	58

	59 addBuildtoolsToPath()

	60

	61 if options.urls:

	62 files = options.urls

	63 elif options.urls_file:

	64 with open(options.urls_file) as u:

	65 files = u.read().splitlines()

	66 else:

	67 print 'oh no'

	68 return 1

	69

	70 driver = newDriver()

	71 output = []

	72 startIndex = 0

	73 if options.restart:

	74 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]

	75 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext( f)[1] == '.info']

	76 for f in prevfiles:

	77 with open(f) as infofile:

	78 info = json.load(infofile)

	79 output.append(info)

	80 startIndex = max([i['index'] for i in output]) + 1

	81 print 'starting at ', startIndex

	82

	83 feature_extractor = open('extract_features.js').read()

	84

	85 try:

	86 for i, f in enumerate(files):

	87 prefix = '%s/%d' % (outdir, i)

	88 if i < startIndex:

	89 continue

	90 try:

	91 ss = '%s.png' % prefix

	92 dss = '%s-distilled.png' % prefix

	93 driver.get(f)

	94 time.sleep(0.5)

	95 features = driver.execute_script(feature_extractor)

	96 data = {

	97 'index': i,

	98 'url': f,

	99 'features': features

	100 }

	101 output.append(data)

	102 with open('%s.features' % prefix, 'w') as info:

	103 json.dump(data, info)

	104

	105 except Exception as e:

	106 print e

	107 driver.quit()

	108 driver = newDriver()

	109 pass

	110

	111 finally:

	112 driver.quit()

	113

	114 with open('%s/features' % outdir, 'w') as index:

	115 json.dump(output, index)

	116 return 0

	117

	118 if __name__ == '__main__':

	119 sys.exit(main(sys.argv[1:]))

	120

OLD	NEW

« no previous file with comments | « foo/test.js ('k') | get_screenshots.py » ('j') | no next file with comments »