| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright 2016 The Chromium Authors. All rights reserved. | 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 import argparse | 6 import argparse |
| 7 import json | 7 import json |
| 8 import os | 8 import os |
| 9 import shutil | 9 import shutil |
| 10 import sys | 10 import sys |
| 11 import time | 11 import time |
| 12 import urllib | 12 import urllib |
| 13 import random | 13 import random |
| 14 from lockfile import FileLock | 14 from lockfile import FileLock |
| 15 | 15 |
| 16 from calculate_derived_features import CalcDerivedFeatures |
| 17 |
# Root of the repository checkout: two directory levels above this script.
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))

try:
  from selenium import webdriver
  from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
except ImportError:
  # Only a failed import is reported as a missing dependency. A bare `except:`
  # would also intercept KeyboardInterrupt/SystemExit and would mislabel any
  # unrelated error raised while importing.
  print('ERROR:')
  print('Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.'
        % repo_root)
  sys.exit(1)
| 24 | 27 |
def addBuildtoolsToPath():
  """Prepend the repository's buildtools directory to PATH if it is missing.

  The directory is `repo_root + '/buildtools'`. PATH is compared entry by
  entry, so another entry that merely contains the buildtools path as a
  substring does not suppress the addition. An unset or empty PATH is treated
  as having no entries. os.environ is modified in place.
  """
  buildtoolsPath = repo_root + '/buildtools'
  envPath = os.environ.get('PATH', '')
  entries = envPath.split(os.pathsep) if envPath else []
  if buildtoolsPath not in entries:
    os.environ['PATH'] = os.pathsep.join([buildtoolsPath] + entries)
| 30 | 33 |
def getDistillerUrl(u):
  """Return the chrome-distiller:// viewer URL for the page at `u`.

  Args:
    u (str): URL of the page to distill. It is form-encoded into the `url`
      query parameter.

  Returns:
    str: "chrome-distiller://blah/?url=<encoded u>".
  """
  # urlencode lives in different modules on Python 2 and Python 3.
  try:
    from urllib import urlencode
  except ImportError:
    from urllib.parse import urlencode
  return "chrome-distiller://blah/?" + urlencode({'url': u})
| 34 | 37 |
def newDriver(mobile=False):
  """Start a Chrome WebDriver session configured for DOM Distiller.

  Chrome is launched with the flags --enable-dom-distiller,
  --save-page-as-mhtml, --reader-mode-heuristics=adaboost and
  --distillability-dev, and with browser console logging set to 'ALL' so the
  console log can be read back through the driver.

  Args:
    mobile (bool): when true, enable Chrome mobile emulation using the
      "Google Nexus 5" device profile.

  Returns:
    The new webdriver.Chrome instance, with page-load and script timeouts
    both set to 60.
  """
  chromeOptions = webdriver.ChromeOptions()
  # If you want to use a different version of chrome, specify the full path
  # here.
  #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable"
  for flag in ('--enable-dom-distiller',
               '--save-page-as-mhtml',
               '--reader-mode-heuristics=adaboost',
               '--distillability-dev'):
    chromeOptions.add_argument(flag)
  if mobile:
    mobile_emulation = { "deviceName": "Google Nexus 5" }
    chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation)

  # Work on a copy: DesiredCapabilities.CHROME is a dict shared by every user
  # of the class, so editing it in place would leak this setting into any
  # other driver created in the same process.
  capabilities = DesiredCapabilities.CHROME.copy()
  # This is to enable accessing devtools console log from here, for
  # nativeFeatures().
  capabilities['loggingPrefs'] = {'browser': 'ALL'}
  driver = webdriver.Chrome(chrome_options=chromeOptions,
                            desired_capabilities=capabilities)
  driver.set_page_load_timeout(60)
  driver.set_script_timeout(60)
  print("created a new chrome driver")
  return driver
| 45 | 58 |
def nativeFeatures(logs):
  """Return the JSON value logged after 'distillability_features = '.

  Args:
    logs: iterable of Chrome log entries, each with a 'message' string.

  Returns:
    Whatever _parseNative() yields for that marker: the decoded JSON from the
    last matching message, or None when no message contains the marker.
  """
  marker = 'distillability_features = '
  return _parseNative(logs, marker)
| 61 |
def nativeClassification(logs):
  """Return the JSON value logged after 'adaboost_classification = '.

  Args:
    logs: iterable of Chrome log entries, each with a 'message' string.

  Returns:
    Whatever _parseNative() yields for that marker: the decoded JSON from the
    last matching message, or None when no message contains the marker.
  """
  marker = 'adaboost_classification = '
  return _parseNative(logs, marker)
| 64 |
| 65 def _parseNative(logs, needle): |
| 66 """Parse console logs from Chrome and get decoded JSON. |
| 67 |
| 68 Args: |
| 69 logs: Chrome log object |
| 70 needle (str): the string leading the actual JSON. |
| 71 |
| 72 Example: |
| 73 >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=') |
| 74 {u'a': [1, 2]} |
| 75 """ |
| 76 ret = None |
| 77 for log in logs: |
| 78 message = log['message'] |
| 79 loc = message.find(needle) |
| 80 if loc >= 0: |
| 81 ret = json.loads(message[loc+len(needle):]) |
| 82 return ret |
| 83 |
def saveFeatures(driver, feature_extractor, data, url_override, filename):
  """Extract page features through the driver and write them as JSON.

  Two files are written: `filename` with the raw record, and
  `filename + '-derived'` with the same record whose 'features' entry is
  replaced by the output of CalcDerivedFeatures().

  Args:
    driver: WebDriver used to run the extractor and read the browser log.
    feature_extractor (str): JavaScript source passed to execute_script().
    data (dict): base record; it is copied, not modified. Must contain
      'index'.
    url_override: if truthy, replaces the 'url' entry of the extracted
      features.
    filename (str): path of the raw feature file.

  Returns:
    tuple: (raw record, derived record).
  """
  record = dict(data)
  extracted = driver.execute_script(feature_extractor)
  if url_override:
    extracted['url'] = url_override
  record['features'] = extracted

  # Attach Chrome's own classification (and its features) when the console
  # log contains one.
  browser_logs = driver.get_log('browser')
  classification = nativeClassification(browser_logs)
  if classification:
    classification['features'] = nativeFeatures(browser_logs)
    record['native'] = classification

  with open(filename, 'w') as raw_file:
    json.dump(record, raw_file, indent=2, sort_keys=True)
  print("saved %s" % filename)

  derived_record = dict(record)
  derived_record['features'] = CalcDerivedFeatures(record['index'], extracted)

  derived_path = filename + '-derived'
  with open(derived_path, 'w') as derived_file:
    json.dump(derived_record, derived_file, indent=2, sort_keys=True)
  print("saved %s" % derived_path)
  return record, derived_record
| 109 |
def saveInfoFile(data, ss, dss, filename):
  """Write the info record for one page as JSON.

  Args:
    data (dict): base record; it is copied, not modified.
    ss: value stored under 'screenshot'.
    dss: value stored under 'distilled'.
    filename (str): path of the info file to write.
  """
  record = dict(data, screenshot=ss, distilled=dss)
  with open(filename, 'w') as info_file:
    json.dump(record, info_file)
| 116 |
def saveMHTML(filename):
  """Save current page as an MHTML file.

  This is done by issuing xdotool commands: send ctrl+s, wait one second,
  send Alt+n, clear the focused field with ctrl+a and BackSpace, type the
  absolute path of `filename`, then press Return.

  Dependencies:
    - Command line argument "--save-page-as-mhtml" to Chrome.
    - xdotool

  Args:
    filename (str): destination path of the snapshot.

  Returns:
    bool: True if `filename` exists after waiting three seconds for the
    save to finish, False otherwise.
  """
  try:
    from shlex import quote  # Python 3
  except ImportError:
    from pipes import quote  # Python 2

  # Quote the path for the shell so that spaces, quotes or other shell
  # metacharacters in it cannot break or alter the command line.
  target = quote(os.path.abspath(filename))
  cmd = ' && '.join([
    'xdotool key --clearmodifiers "ctrl+s"',
    'sleep 1',
    'xdotool key --delay 20 --clearmodifiers "Alt+n"',
    'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace"',
    'xdotool type --delay 10 --clearmodifiers %s' % target,
    'xdotool key --delay 20 --clearmodifiers Return',
  ])
  os.system(cmd)
  time.sleep(3) # wait for file saving
  if not os.path.exists(filename):
    return False
  print("saved %s" % filename)
  return True
| 140 |
def writeAggregated(outdir, ext, out, in_marshal=False):
  """Merge all per-page JSON files with one extension into a single file.

  Every regular file directly inside `outdir` whose extension is '.' + ext
  is parsed as JSON. The parsed records are sorted by their 'index' entry and
  written to `outdir/out`.

  Args:
    outdir (str): directory holding the per-page files and the output.
    ext (str): extension to collect, without the leading dot.
    out (str): name of the aggregated file, relative to `outdir`.
    in_marshal (bool): write with marshal instead of indented JSON.
  """
  suffix = '.' + ext
  candidates = [os.path.join(outdir, name) for name in os.listdir(outdir)]
  prevfiles = [path for path in candidates
               if os.path.isfile(path) and os.path.splitext(path)[1] == suffix]
  output = []
  print('reading %s files' % (ext))
  for path in prevfiles:
    with open(path) as infofile:
      output.append(json.load(infofile))
  print('done reading %s files' % (ext))

  output.sort(key=lambda k: k['index'])
  print('writing %s files' % (ext))
  # marshal produces bytes, so its target must be opened in binary mode;
  # json.dump produces text.
  mode = 'wb' if in_marshal else 'w'
  with open(os.path.join(outdir, out), mode) as outf:
    if in_marshal:
      import marshal
      marshal.dump(output, outf)
    else:
      json.dump(output, outf, indent=2)
  print('done writing %s files' % (ext))
| 66 | 161 |
def writeIndex(outdir):
  """Aggregate every *.info record in `outdir` into the file `outdir/index`."""
  writeAggregated(outdir, ext="info", out="index")
| 69 | 164 |
def writeFeature(outdir):
  """Aggregate the derived feature files in `outdir`, one file per kind.

  For each kind, all files with that extension are merged into a file of the
  same name inside `outdir`.
  """
  derived_kinds = (
    "feature-derived",
    "dfeature-derived",
    "mfeature-derived",
    "mdfeature-derived",
  )
  for kind in derived_kinds:
    writeAggregated(outdir, kind, kind)
  # Use the following when needing aggregated raw features:
  #writeAggregated(outdir, "feature", "feature", in_marshal=True)
  #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True)
  #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True)
  #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True)
| 173 |
def shouldProcess(load_mhtml, no_distill, prefix):
  """Tell whether the job with output path prefix `prefix` still needs work.

  Args:
    load_mhtml (bool): whether pages are replayed from saved MHTML files.
    no_distill (bool): whether distillation is skipped.
    prefix (str): per-job output path without extension.

  Returns:
    bool: without load_mhtml, True when `prefix`.info does not exist. With
    load_mhtml, True when `prefix`.mhtml exists and the result file is still
    missing: `prefix`.mfeature if no_distill, otherwise `prefix`.mdfeature.
  """
  if not load_mhtml:
    return not os.path.exists(prefix + '.info')
  if not os.path.exists(prefix + '.mhtml'):
    return False
  result_suffix = '.mfeature' if no_distill else '.mdfeature'
  return not os.path.exists(prefix + result_suffix)
| 72 | 186 |
def main(argv):
  """Capture screenshots, features and optional MHTML snapshots for URLs.

  For every URL (taken from the positional arguments or from --urls-file) the
  page is loaded in Chrome, a screenshot and feature files are written under
  the --out directory, and, unless --skip-distillation is given, the
  distilled view is captured as well. Jobs are visited in random order and
  each is guarded by a lock file, with the skip check repeated under the
  lock.

  Args:
    argv (list of str): command-line arguments, without the program name.

  Returns:
    int: 1 when the options or paths are invalid; otherwise 0, including when
    the job loop stops early because an MHTML snapshot could not be saved.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('urls', nargs='*')
  parser.add_argument('--force', action='store_true')
  parser.add_argument('--urls-file')
  parser.add_argument('--emulate-mobile', action='store_true')
  parser.add_argument('--resume', action='store_true')
  parser.add_argument('--write-index', action='store_true')
  parser.add_argument('--save-mhtml', action='store_true')
  parser.add_argument('--load-mhtml', action='store_true')
  parser.add_argument('--skip-distillation', action='store_true')
  parser.add_argument('--desktop-distillable-only', action='store_true')
  options = parser.parse_args(argv)

  if options.load_mhtml:
    if options.save_mhtml:
      print('--load-mhtml is not compatible with --save-mhtml')
      return 1
    if options.resume:
      print('--load-mhtml is not compatible with --resume')
      return 1

  outdir = options.out
  # A fresh run creates the output directory; --resume, --load-mhtml and
  # --write-index all operate on an existing one.
  if not options.resume and not options.load_mhtml and not options.write_index:
    if os.path.exists(outdir):
      if not options.force:
        print(outdir + ' exists')
        return 1
      shutil.rmtree(outdir, ignore_errors=True)
    os.makedirs(outdir)
  else:
    if not os.path.exists(outdir):
      print(outdir + ' doesn\'t exist')
      return 1

  addBuildtoolsToPath()

  if options.urls:
    files = options.urls
  elif options.urls_file:
    with open(options.urls_file) as u:
      files = u.read().splitlines()
  else:
    print('oh no')
    return 1

  if options.write_index:
    writeIndex(outdir)
    writeFeature(outdir)
    print('index is written')
    return 0

  driver = newDriver(options.emulate_mobile)

  # Read the extractor script once and close the file handle right away.
  with open('extract_features.js') as extractor_file:
    feature_extractor = extractor_file.read()

  try:
    jobs = list(enumerate(files))
    random.shuffle(jobs)
    for i, f in jobs:
      prefix = '%s/%d' % (outdir, i)
      info = '%s.info' % prefix
      basedata = {'index': i, 'url': f}

      if not shouldProcess(options.load_mhtml, options.skip_distillation,
                           prefix):
        print("skip %d" % (i))
        continue

      with FileLock('%s.lock' % (prefix)):
        # Check again now that the lock is held; the job may have been
        # completed between the first check and acquiring the lock.
        if not shouldProcess(options.load_mhtml, options.skip_distillation,
                             prefix):
          print("skip %d" % (i))
          continue
        try:
          ss = '%s.png' % prefix
          dss = '%s-distilled.png' % prefix
          fea = '%s.feature' % prefix
          dfea = '%s.dfeature' % prefix
          mhtml = '%s.mhtml' % prefix
          mhtml_url = 'file://%s' % os.path.abspath(mhtml)

          if options.emulate_mobile:
            driver.set_window_size(400, 800)
          else:
            driver.set_window_size(1280, 5000)
          if options.load_mhtml:
            if not os.path.exists(mhtml):
              print("SKIP %d, no mhtml" % (i))
              continue
            driver.get(mhtml_url)
            time.sleep(1) # wait a bit for things to stabilize
          else:
            driver.get(f)
            time.sleep(3) # wait for some async scripts
            driver.save_screenshot(ss)
            print("saved %s" % ss)

          url_override = None
          if options.load_mhtml:
            with open(fea) as infile:
              # otherwise it would be file:// of mhtml
              url_override = json.load(infile)['features']['url']
            fea = '%s.mfeature' % prefix
          _, derived = saveFeatures(driver, feature_extractor, basedata,
                                    url_override, fea)

          if options.desktop_distillable_only:
            native = derived['native']
            if native['features']['isMobileFriendly'] or not native['distillable']:
              # Drop the artifacts of a page that is not wanted, but still
              # write the info file so the job is not picked up again.
              for stale in ('%s.feature' % prefix, '%s.png' % prefix):
                try:
                  os.remove(stale)
                except OSError as err:
                  print(err)
              saveInfoFile(basedata, ss, dss, info)
              continue

          if options.save_mhtml:
            if not saveMHTML(mhtml):
              # If the file is not saved, the focus point might be lost.
              # Restart the whole xvfb environment to be safe.
              print("[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml))
              break

          if options.skip_distillation:
            continue

          if options.emulate_mobile:
            driver.set_window_size(400, 800)
          else:
            driver.set_window_size(640, 5000)

          if options.load_mhtml:
            driver.get(getDistillerUrl(mhtml_url))
            time.sleep(10)
            dss = '%s-mdistilled.png' % prefix
            driver.save_screenshot(dss)
            print("saved %s" % dss)
            dfea = '%s.mdfeature' % prefix
            saveFeatures(driver, feature_extractor, basedata, None, dfea)
            continue

          driver.get(getDistillerUrl(f))
          # Up to three attempts, stopping once the distilled page has text.
          # The loop variable must not reuse `i`: the job index is still
          # needed by the error report below.
          for _attempt in range(3):
            time.sleep(20) # wait for multi-page, etc
            driver.save_screenshot(dss)
            print("saved %s" % dss)
            feature, _ = saveFeatures(driver, feature_extractor, basedata,
                                      None, dfea)
            if feature['features']['innerText'] != "":
              break

          saveInfoFile(basedata, ss, dss, info)

        except Exception as e:
          print(e)
          print("Index=%d URL=%s" % (i, f))
          # Replace the browser after any failure, keeping the same
          # emulation mode as the driver it replaces.
          driver.quit()
          driver = newDriver(options.emulate_mobile)

  finally:
    driver.quit()

  return 0
| 198 | 345 |
if __name__ == '__main__':
  # Run with the arguments after the program name and use main()'s return
  # value as the process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
| 201 | 348 |
| OLD | NEW |