Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(434)

Unified Diff: heuristics/distillable/get_screenshots.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/server.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/get_screenshots.py
diff --git a/heuristics/distillable/get_screenshots.py b/heuristics/distillable/get_screenshots.py
index 8001add9f8e13f0958c30c5a28c1d75a8ddcd07a..3482498302831e1cfd9d20373bfa15bf2a29a0dc 100755
--- a/heuristics/distillable/get_screenshots.py
+++ b/heuristics/distillable/get_screenshots.py
@@ -13,10 +13,13 @@ import urllib
import random
from lockfile import FileLock
+from calculate_derived_features import CalcDerivedFeatures
+
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
try:
from selenium import webdriver
+ from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
except:
print 'ERROR:'
print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.' % repo_root
@@ -32,17 +35,109 @@ def getDistillerUrl(u):
params = { 'url': u}
return "chrome-distiller://blah/?" + urllib.urlencode(params)
-def newDriver():
+def newDriver(mobile=False):
chromeOptions = webdriver.ChromeOptions()
- chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
+ # If you want to use a different version of chrome, specify the full path here.
+ #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
chromeOptions.add_argument('--enable-dom-distiller')
chromeOptions.add_argument('--save-page-as-mhtml')
- driver = webdriver.Chrome(chrome_options=chromeOptions)
+ chromeOptions.add_argument('--reader-mode-heuristics=adaboost')
+ chromeOptions.add_argument('--distillability-dev')
+ if mobile:
+ mobile_emulation = { "deviceName": "Google Nexus 5" }
+ chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation)
+
+ d = DesiredCapabilities.CHROME
+ # Enable browser console logging so nativeFeatures() and nativeClassification() can read it via get_log('browser').
+ d['loggingPrefs'] = {'browser': 'ALL'}
+ driver = webdriver.Chrome(chrome_options=chromeOptions, desired_capabilities=d)
driver.set_page_load_timeout(60)
driver.set_script_timeout(60)
print "created a new chrome driver"
return driver
+def nativeFeatures(logs):
+ return _parseNative(logs, 'distillability_features = ')
+
+def nativeClassification(logs):
+ return _parseNative(logs, 'adaboost_classification = ')
+
+def _parseNative(logs, needle):
+ """Parse console logs from Chrome and get decoded JSON.
+
+ Args:
+ logs: Chrome log object
+ needle (str): the string leading the actual JSON.
+
+ Example:
+ >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=')
+ {u'a': [1, 2]}
+ """
+ ret = None
+ for log in logs:
+ message = log['message']
+ loc = message.find(needle)
+ if loc >= 0:
+ ret = json.loads(message[loc+len(needle):])
+ return ret
+
+def saveFeatures(driver, feature_extractor, data, url_override, filename):
+ data = dict.copy(data)
+ features = driver.execute_script(feature_extractor)
+ if url_override:
+ features['url'] = url_override
+ data['features'] = features
+
+ logs = driver.get_log('browser')
+ native = nativeClassification(logs)
+ if native:
+ native['features'] = nativeFeatures(logs)
+ data['native'] = native
+
+ with open(filename, 'w') as outf:
+ json.dump(data, outf, indent=2, sort_keys=True)
+ print "saved %s" % filename
+
+ derived = dict.copy(data)
+ derived['features'] = CalcDerivedFeatures(data['index'], features)
+
+ derived_name = filename + '-derived'
+ with open(derived_name, 'w') as outf:
+ json.dump(derived, outf, indent=2, sort_keys=True)
+ print "saved %s" % derived_name
+ return data, derived
+
+def saveInfoFile(data, ss, dss, filename):
+ data = dict.copy(data)
+ data['screenshot'] = ss
+ data['distilled'] = dss
+ with open(filename, 'w') as info:
+ json.dump(data, info)
+
+def saveMHTML(filename):
+ """Save current page as an MHTML file
+
+ This is done by issuing xdotool commands.
+ Dependencies:
+ - Command line argument "--save-page-as-mhtml" to Chrome.
+ - xdotool
+ """
+
+ cmd = (
+ 'xdotool key --clearmodifiers "ctrl+s" && ' +
+ 'sleep 1 && ' +
+ 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
+ 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
+ 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
+ 'xdotool key --delay 20 --clearmodifiers Return'
+ ) % (os.path.abspath(filename))
+ os.system(cmd)
+ time.sleep(3) # wait for file saving
+ if not os.path.exists(filename):
+ return False
+ print "saved %s" % filename
+ return True
+
def writeAggregated(outdir, ext, out, in_marshal=False):
prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]
prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)[1] == '.' + ext]
@@ -68,7 +163,26 @@ def writeIndex(outdir):
writeAggregated(outdir, "info", "index")
def writeFeature(outdir):
- writeAggregated(outdir, "feature", "feature", in_marshal=True)
+ for n in ["feature-derived", "dfeature-derived", "mfeature-derived", "mdfeature-derived"]:
+ writeAggregated(outdir, n, n)
+ # Use the following when needing aggregated raw features:
+ #writeAggregated(outdir, "feature", "feature", in_marshal=True)
+ #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True)
+ #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True)
+ #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True)
+
+def shouldProcess(load_mhtml, no_distill, prefix):
+ info = prefix + '.info'
+ mhtml = prefix + '.mhtml'
+ mfeature = prefix + '.mfeature'
+ mdfeature = prefix + '.mdfeature'
+ if not load_mhtml:
+ return not os.path.exists(info)
+ else:
+ if no_distill:
+ return os.path.exists(mhtml) and not os.path.exists(mfeature)
+ else:
+ return os.path.exists(mhtml) and not os.path.exists(mdfeature)
def main(argv):
parser = argparse.ArgumentParser()
@@ -76,13 +190,25 @@ def main(argv):
parser.add_argument('urls', nargs='*')
parser.add_argument('--force', action='store_true')
parser.add_argument('--urls-file')
+ parser.add_argument('--emulate-mobile', action='store_true')
parser.add_argument('--resume', action='store_true')
parser.add_argument('--write-index', action='store_true')
parser.add_argument('--save-mhtml', action='store_true')
+ parser.add_argument('--load-mhtml', action='store_true')
+ parser.add_argument('--skip-distillation', action='store_true')
+ parser.add_argument('--desktop-distillable-only', action='store_true')
options = parser.parse_args(argv)
+ if options.load_mhtml:
+ if options.save_mhtml:
+ print '--load-mhtml is not compatible with --save-mhtml'
+ return 1
+ if options.resume:
+ print '--load-mhtml is not compatible with --resume'
+ return 1
+
outdir = options.out
- if not options.resume:
+ if not options.resume and not options.load_mhtml and not options.write_index:
if os.path.exists(outdir):
if not options.force:
print outdir + ' exists'
@@ -111,7 +237,7 @@ def main(argv):
print 'index is written'
return 0
- driver = newDriver()
+ driver = newDriver(options.emulate_mobile)
feature_extractor = open('extract_features.js').read()
@@ -121,68 +247,89 @@ def main(argv):
for i, f in jobs:
prefix = '%s/%d' % (outdir, i)
info = '%s.info' % prefix
+ basedata = {'index': i, 'url': f}
- if os.path.exists(info):
+ if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):
print "skip %d" % (i)
continue;
with FileLock('%s.lock' % (prefix)):
- if os.path.exists(info):
- print "SKIP %d" % (i)
+ if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):
+ print "skip %d" % (i)
continue;
try:
ss = '%s.png' % prefix
dss = '%s-distilled.png' % prefix
fea = '%s.feature' % prefix
+ dfea = '%s.dfeature' % prefix
+ mhtml = '%s.mhtml' % prefix
+ mhtml_url = 'file://%s' % os.path.abspath(mhtml)
+
+ if options.emulate_mobile:
+ driver.set_window_size(400, 800)
+ else:
+ driver.set_window_size(1280, 5000)
+ if options.load_mhtml:
+ if not os.path.exists(mhtml):
+ print "SKIP %d, no mhtml" % (i)
+ continue
+ driver.get(mhtml_url)
+ time.sleep(1) # wait a bit for things to stabilize
+ else:
+ driver.get(f)
+ time.sleep(3) # wait for some async scripts
+ driver.save_screenshot(ss)
+ print "saved %s" % ss
- driver.set_window_size(1280, 5000)
- driver.get(f)
- time.sleep(3) # wait for some async scripts
- driver.save_screenshot(ss)
- print "saved %s" % ss
-
- features = driver.execute_script(feature_extractor)
- data = {
- 'index': i,
- 'url': f,
- 'features': features
- }
- with open(fea, 'w') as outf:
- json.dump(data, outf, indent=2)
- print "saved %s" % fea
+ url_override = None
+ if options.load_mhtml:
+ with open(fea) as infile:
+ # Reuse the URL recorded in the original .feature file; otherwise it would be the file:// URL of the MHTML snapshot.
+ url_override = json.load(infile)['features']['url']
+ fea = '%s.mfeature' % prefix
+ _, derived = saveFeatures(driver, feature_extractor, basedata, url_override, fea)
+
+ if options.desktop_distillable_only:
+ if derived['native']['features']['isMobileFriendly'] or not derived['native']['distillable']:
+ os.system('rm %s.feature %s.png' % (prefix, prefix))
+ saveInfoFile(basedata, ss, dss, info)
+ continue
if options.save_mhtml:
- mhtml = '%s.mhtml' % prefix
- cmd = (
- 'xdotool key --clearmodifiers "ctrl+s" && ' +
- 'sleep 1 && ' +
- 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
- 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
- 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
- 'xdotool key --delay 20 --clearmodifiers Return'
- ) % (os.getcwd() + '/' + mhtml)
- os.system(cmd)
- time.sleep(3) # wait for file saving
- if not os.path.exists(mhtml):
+ if not saveMHTML(mhtml):
# If the file is not saved, the focus point might be lost.
# Restart the whole xvfb environment to be safe.
print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)
break
- driver.set_window_size(640, 5000)
+ if options.skip_distillation:
+ continue
+
+ if options.emulate_mobile:
+ driver.set_window_size(400, 800)
+ else:
+ driver.set_window_size(640, 5000)
+
+ if options.load_mhtml:
+ driver.get(getDistillerUrl(mhtml_url))
+ time.sleep(10)
+ dss = '%s-mdistilled.png' % prefix
+ driver.save_screenshot(dss)
+ print "saved %s" % dss
+ dfea = '%s.mdfeature' % prefix
+ saveFeatures(driver, feature_extractor, basedata, None, dfea)
+ continue
+
driver.get(getDistillerUrl(f))
- time.sleep(20) # wait for multi-page, etc
- driver.save_screenshot(dss)
- print "saved %s" % dss
-
- data = {
- 'index': i,
- 'url': f,
- 'screenshot': ss,
- 'distilled': dss,
- }
- with open(info, 'w') as info:
- json.dump(data, info)
+ for i in range(3):
+ time.sleep(20) # wait for multi-page, etc
+ driver.save_screenshot(dss)
+ print "saved %s" % dss
+ feature, _ = saveFeatures(driver, feature_extractor, basedata, None, dfea)
+ if feature['features']['innerText'] != "":
+ break
+
+ saveInfoFile(basedata, ss, dss, info)
except Exception as e:
print e
« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/server.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698