| Index: heuristics/distillable/get_screenshots.py
|
| diff --git a/heuristics/distillable/get_screenshots.py b/heuristics/distillable/get_screenshots.py
|
| index 8001add9f8e13f0958c30c5a28c1d75a8ddcd07a..3482498302831e1cfd9d20373bfa15bf2a29a0dc 100755
|
| --- a/heuristics/distillable/get_screenshots.py
|
| +++ b/heuristics/distillable/get_screenshots.py
|
| @@ -13,10 +13,13 @@ import urllib
|
| import random
|
| from lockfile import FileLock
|
|
|
| +from calculate_derived_features import CalcDerivedFeatures
|
| +
|
| repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
|
|
|
| try:
|
| from selenium import webdriver
|
| + from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
|
| except:
|
| print 'ERROR:'
|
| print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.' % repo_root
|
| @@ -32,17 +35,109 @@ def getDistillerUrl(u):
|
| params = { 'url': u}
|
| return "chrome-distiller://blah/?" + urllib.urlencode(params)
|
|
|
| -def newDriver():
|
| +def newDriver(mobile=False):
|
| chromeOptions = webdriver.ChromeOptions()
|
| - chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
|
| +  # To use a different version of Chrome, specify the full path to its binary here.
|
| + #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";
|
| chromeOptions.add_argument('--enable-dom-distiller')
|
| chromeOptions.add_argument('--save-page-as-mhtml')
|
| - driver = webdriver.Chrome(chrome_options=chromeOptions)
|
| + chromeOptions.add_argument('--reader-mode-heuristics=adaboost')
|
| + chromeOptions.add_argument('--distillability-dev')
|
| + if mobile:
|
| + mobile_emulation = { "deviceName": "Google Nexus 5" }
|
| + chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation)
|
| +
|
| + d = DesiredCapabilities.CHROME
|
| +  # Capture the browser console log so nativeFeatures()/nativeClassification() can parse it.
|
| + d['loggingPrefs'] = {'browser': 'ALL'}
|
| + driver = webdriver.Chrome(chrome_options=chromeOptions, desired_capabilities=d)
|
| driver.set_page_load_timeout(60)
|
| driver.set_script_timeout(60)
|
| print "created a new chrome driver"
|
| return driver
|
|
|
| +def nativeFeatures(logs):
|
| + return _parseNative(logs, 'distillability_features = ')
|
| +
|
| +def nativeClassification(logs):
|
| + return _parseNative(logs, 'adaboost_classification = ')
|
| +
|
| +def _parseNative(logs, needle):
|
| + """Parse console logs from Chrome and get decoded JSON.
|
| +
|
| + Args:
|
| +    logs: list of Chrome log entries, each a dict with a 'message' key.
|
| + needle (str): the string leading the actual JSON.
|
| +
|
| + Example:
|
| + >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=')
|
| + {u'a': [1, 2]}
|
| + """
|
| + ret = None
|
| + for log in logs:
|
| + message = log['message']
|
| + loc = message.find(needle)
|
| + if loc >= 0:
|
| + ret = json.loads(message[loc+len(needle):])
|
| + return ret
|
| +
|
| +def saveFeatures(driver, feature_extractor, data, url_override, filename):
|
| + data = dict.copy(data)
|
| + features = driver.execute_script(feature_extractor)
|
| + if url_override:
|
| + features['url'] = url_override
|
| + data['features'] = features
|
| +
|
| + logs = driver.get_log('browser')
|
| + native = nativeClassification(logs)
|
| + if native:
|
| + native['features'] = nativeFeatures(logs)
|
| + data['native'] = native
|
| +
|
| + with open(filename, 'w') as outf:
|
| + json.dump(data, outf, indent=2, sort_keys=True)
|
| + print "saved %s" % filename
|
| +
|
| + derived = dict.copy(data)
|
| + derived['features'] = CalcDerivedFeatures(data['index'], features)
|
| +
|
| + derived_name = filename + '-derived'
|
| + with open(derived_name, 'w') as outf:
|
| + json.dump(derived, outf, indent=2, sort_keys=True)
|
| + print "saved %s" % derived_name
|
| + return data, derived
|
| +
|
| +def saveInfoFile(data, ss, dss, filename):
|
| + data = dict.copy(data)
|
| + data['screenshot'] = ss
|
| + data['distilled'] = dss
|
| + with open(filename, 'w') as info:
|
| + json.dump(data, info)
|
| +
|
| +def saveMHTML(filename):
|
| + """Save current page as an MHTML file
|
| +
|
| + This is done by issuing xdotool commands.
|
| + Dependencies:
|
| + - Command line argument "--save-page-as-mhtml" to Chrome.
|
| + - xdotool
|
| + """
|
| +
|
| + cmd = (
|
| + 'xdotool key --clearmodifiers "ctrl+s" && ' +
|
| + 'sleep 1 && ' +
|
| + 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
|
| + 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
|
| + 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
|
| + 'xdotool key --delay 20 --clearmodifiers Return'
|
| + ) % (os.path.abspath(filename))
|
| + os.system(cmd)
|
| + time.sleep(3) # wait for file saving
|
| + if not os.path.exists(filename):
|
| + return False
|
| + print "saved %s" % filename
|
| + return True
|
| +
|
| def writeAggregated(outdir, ext, out, in_marshal=False):
|
| prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]
|
| prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)[1] == '.' + ext]
|
| @@ -68,7 +163,26 @@ def writeIndex(outdir):
|
| writeAggregated(outdir, "info", "index")
|
|
|
| def writeFeature(outdir):
|
| - writeAggregated(outdir, "feature", "feature", in_marshal=True)
|
| + for n in ["feature-derived", "dfeature-derived", "mfeature-derived", "mdfeature-derived"]:
|
| + writeAggregated(outdir, n, n)
|
| +  # Uncomment the following when aggregated raw features are needed:
|
| + #writeAggregated(outdir, "feature", "feature", in_marshal=True)
|
| + #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True)
|
| + #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True)
|
| + #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True)
|
| +
|
| +def shouldProcess(load_mhtml, no_distill, prefix):
|
| + info = prefix + '.info'
|
| + mhtml = prefix + '.mhtml'
|
| + mfeature = prefix + '.mfeature'
|
| + mdfeature = prefix + '.mdfeature'
|
| + if not load_mhtml:
|
| + return not os.path.exists(info)
|
| + else:
|
| + if no_distill:
|
| + return os.path.exists(mhtml) and not os.path.exists(mfeature)
|
| + else:
|
| + return os.path.exists(mhtml) and not os.path.exists(mdfeature)
|
|
|
| def main(argv):
|
| parser = argparse.ArgumentParser()
|
| @@ -76,13 +190,25 @@ def main(argv):
|
| parser.add_argument('urls', nargs='*')
|
| parser.add_argument('--force', action='store_true')
|
| parser.add_argument('--urls-file')
|
| + parser.add_argument('--emulate-mobile', action='store_true')
|
| parser.add_argument('--resume', action='store_true')
|
| parser.add_argument('--write-index', action='store_true')
|
| parser.add_argument('--save-mhtml', action='store_true')
|
| + parser.add_argument('--load-mhtml', action='store_true')
|
| + parser.add_argument('--skip-distillation', action='store_true')
|
| + parser.add_argument('--desktop-distillable-only', action='store_true')
|
| options = parser.parse_args(argv)
|
|
|
| + if options.load_mhtml:
|
| + if options.save_mhtml:
|
| + print '--load-mhtml is not compatible with --save-mhtml'
|
| + return 1
|
| + if options.resume:
|
| + print '--load-mhtml is not compatible with --resume'
|
| + return 1
|
| +
|
| outdir = options.out
|
| - if not options.resume:
|
| + if not options.resume and not options.load_mhtml and not options.write_index:
|
| if os.path.exists(outdir):
|
| if not options.force:
|
| print outdir + ' exists'
|
| @@ -111,7 +237,7 @@ def main(argv):
|
| print 'index is written'
|
| return 0
|
|
|
| - driver = newDriver()
|
| + driver = newDriver(options.emulate_mobile)
|
|
|
| feature_extractor = open('extract_features.js').read()
|
|
|
| @@ -121,68 +247,89 @@ def main(argv):
|
| for i, f in jobs:
|
| prefix = '%s/%d' % (outdir, i)
|
| info = '%s.info' % prefix
|
| + basedata = {'index': i, 'url': f}
|
|
|
| - if os.path.exists(info):
|
| + if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):
|
| print "skip %d" % (i)
|
| continue;
|
|
|
| with FileLock('%s.lock' % (prefix)):
|
| - if os.path.exists(info):
|
| - print "SKIP %d" % (i)
|
| + if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):
|
| + print "skip %d" % (i)
|
| continue;
|
| try:
|
| ss = '%s.png' % prefix
|
| dss = '%s-distilled.png' % prefix
|
| fea = '%s.feature' % prefix
|
| + dfea = '%s.dfeature' % prefix
|
| + mhtml = '%s.mhtml' % prefix
|
| + mhtml_url = 'file://%s' % os.path.abspath(mhtml)
|
| +
|
| + if options.emulate_mobile:
|
| + driver.set_window_size(400, 800)
|
| + else:
|
| + driver.set_window_size(1280, 5000)
|
| + if options.load_mhtml:
|
| + if not os.path.exists(mhtml):
|
| + print "SKIP %d, no mhtml" % (i)
|
| + continue
|
| + driver.get(mhtml_url)
|
| +          time.sleep(1) # wait a bit for things to stabilize
|
| + else:
|
| + driver.get(f)
|
| + time.sleep(3) # wait for some async scripts
|
| + driver.save_screenshot(ss)
|
| + print "saved %s" % ss
|
|
|
| - driver.set_window_size(1280, 5000)
|
| - driver.get(f)
|
| - time.sleep(3) # wait for some async scripts
|
| - driver.save_screenshot(ss)
|
| - print "saved %s" % ss
|
| -
|
| - features = driver.execute_script(feature_extractor)
|
| - data = {
|
| - 'index': i,
|
| - 'url': f,
|
| - 'features': features
|
| - }
|
| - with open(fea, 'w') as outf:
|
| - json.dump(data, outf, indent=2)
|
| - print "saved %s" % fea
|
| + url_override = None
|
| + if options.load_mhtml:
|
| + with open(fea) as infile:
|
| +            # Reuse the original URL; otherwise it would be the file:// URL of the mhtml.
|
| + url_override = json.load(infile)['features']['url']
|
| + fea = '%s.mfeature' % prefix
|
| + _, derived = saveFeatures(driver, feature_extractor, basedata, url_override, fea)
|
| +
|
| + if options.desktop_distillable_only:
|
| + if derived['native']['features']['isMobileFriendly'] or not derived['native']['distillable']:
|
| + os.system('rm %s.feature %s.png' % (prefix, prefix))
|
| + saveInfoFile(basedata, ss, dss, info)
|
| + continue
|
|
|
| if options.save_mhtml:
|
| - mhtml = '%s.mhtml' % prefix
|
| - cmd = (
|
| - 'xdotool key --clearmodifiers "ctrl+s" && ' +
|
| - 'sleep 1 && ' +
|
| - 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +
|
| - 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +
|
| - 'xdotool type --delay 10 --clearmodifiers "%s" && ' +
|
| - 'xdotool key --delay 20 --clearmodifiers Return'
|
| - ) % (os.getcwd() + '/' + mhtml)
|
| - os.system(cmd)
|
| - time.sleep(3) # wait for file saving
|
| - if not os.path.exists(mhtml):
|
| + if not saveMHTML(mhtml):
|
| # If the file is not saved, the focus point might be lost.
|
| # Restart the whole xvfb environment to be safe.
|
| print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)
|
| break
|
|
|
| - driver.set_window_size(640, 5000)
|
| + if options.skip_distillation:
|
| + continue
|
| +
|
| + if options.emulate_mobile:
|
| + driver.set_window_size(400, 800)
|
| + else:
|
| + driver.set_window_size(640, 5000)
|
| +
|
| + if options.load_mhtml:
|
| + driver.get(getDistillerUrl(mhtml_url))
|
| + time.sleep(10)
|
| + dss = '%s-mdistilled.png' % prefix
|
| + driver.save_screenshot(dss)
|
| + print "saved %s" % dss
|
| + dfea = '%s.mdfeature' % prefix
|
| + saveFeatures(driver, feature_extractor, basedata, None, dfea)
|
| + continue
|
| +
|
| driver.get(getDistillerUrl(f))
|
| - time.sleep(20) # wait for multi-page, etc
|
| - driver.save_screenshot(dss)
|
| - print "saved %s" % dss
|
| -
|
| - data = {
|
| - 'index': i,
|
| - 'url': f,
|
| - 'screenshot': ss,
|
| - 'distilled': dss,
|
| - }
|
| - with open(info, 'w') as info:
|
| - json.dump(data, info)
|
| + for i in range(3):
|
| + time.sleep(20) # wait for multi-page, etc
|
| + driver.save_screenshot(dss)
|
| + print "saved %s" % dss
|
| + feature, _ = saveFeatures(driver, feature_extractor, basedata, None, dfea)
|
| + if feature['features']['innerText'] != "":
|
| + break
|
| +
|
| + saveInfoFile(basedata, ss, dss, info)
|
|
|
| except Exception as e:
|
| print e
|
|
|