OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright 2016 The Chromium Authors. All rights reserved. | 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 import argparse | 6 import argparse |
7 import json | 7 import json |
8 import os | 8 import os |
9 import shutil | 9 import shutil |
10 import sys | 10 import sys |
11 import time | 11 import time |
12 import urllib | 12 import urllib |
13 import random | 13 import random |
14 from lockfile import FileLock | 14 from lockfile import FileLock |
15 | 15 |
| 16 from calculate_derived_features import CalcDerivedFeatures |
| 17 |
16 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) | 18 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')) |
17 | 19 |
18 try: | 20 try: |
19 from selenium import webdriver | 21 from selenium import webdriver |
| 22 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities |
20 except: | 23 except: |
21 print 'ERROR:' | 24 print 'ERROR:' |
22 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.
' % repo_root | 25 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`.
' % repo_root |
23 sys.exit(1) | 26 sys.exit(1) |
24 | 27 |
25 def addBuildtoolsToPath(): | 28 def addBuildtoolsToPath(): |
26 envPath = os.environ['PATH'] | 29 envPath = os.environ['PATH'] |
27 buildtoolsPath = repo_root + '/buildtools' | 30 buildtoolsPath = repo_root + '/buildtools' |
28 if not buildtoolsPath in envPath: | 31 if not buildtoolsPath in envPath: |
29 os.environ['PATH'] = buildtoolsPath + ':' + envPath | 32 os.environ['PATH'] = buildtoolsPath + ':' + envPath |
30 | 33 |
31 def getDistillerUrl(u): | 34 def getDistillerUrl(u): |
32 params = { 'url': u} | 35 params = { 'url': u} |
33 return "chrome-distiller://blah/?" + urllib.urlencode(params) | 36 return "chrome-distiller://blah/?" + urllib.urlencode(params) |
34 | 37 |
35 def newDriver(): | 38 def newDriver(mobile=False): |
36 chromeOptions = webdriver.ChromeOptions() | 39 chromeOptions = webdriver.ChromeOptions() |
37 chromeOptions.binary_location = "/usr/bin/google-chrome-unstable"; | 40 # If you want to use a different version of chrome, specify the full path here
. |
| 41 #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable"; |
38 chromeOptions.add_argument('--enable-dom-distiller') | 42 chromeOptions.add_argument('--enable-dom-distiller') |
39 chromeOptions.add_argument('--save-page-as-mhtml') | 43 chromeOptions.add_argument('--save-page-as-mhtml') |
40 driver = webdriver.Chrome(chrome_options=chromeOptions) | 44 chromeOptions.add_argument('--reader-mode-heuristics=adaboost') |
| 45 chromeOptions.add_argument('--distillability-dev') |
| 46 if mobile: |
| 47 mobile_emulation = { "deviceName": "Google Nexus 5" } |
| 48 chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation) |
| 49 |
| 50 d = DesiredCapabilities.CHROME |
| 51 # This is to enable accessing devtools console log from here, for nativeFeatur
es(). |
| 52 d['loggingPrefs'] = {'browser': 'ALL'} |
| 53 driver = webdriver.Chrome(chrome_options=chromeOptions, desired_capabilities=d
) |
41 driver.set_page_load_timeout(60) | 54 driver.set_page_load_timeout(60) |
42 driver.set_script_timeout(60) | 55 driver.set_script_timeout(60) |
43 print "created a new chrome driver" | 56 print "created a new chrome driver" |
44 return driver | 57 return driver |
45 | 58 |
| 59 def nativeFeatures(logs): |
| 60 return _parseNative(logs, 'distillability_features = ') |
| 61 |
| 62 def nativeClassification(logs): |
| 63 return _parseNative(logs, 'adaboost_classification = ') |
| 64 |
| 65 def _parseNative(logs, needle): |
| 66 """Parse console logs from Chrome and get decoded JSON. |
| 67 |
| 68 Args: |
| 69 logs: Chrome log object |
| 70 needle (str): the string leading the actual JSON. |
| 71 |
| 72 Example: |
| 73 >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=') |
| 74 {u'a': [1, 2]} |
| 75 """ |
| 76 ret = None |
| 77 for log in logs: |
| 78 message = log['message'] |
| 79 loc = message.find(needle) |
| 80 if loc >= 0: |
| 81 ret = json.loads(message[loc+len(needle):]) |
| 82 return ret |
| 83 |
| 84 def saveFeatures(driver, feature_extractor, data, url_override, filename): |
| 85 data = dict.copy(data) |
| 86 features = driver.execute_script(feature_extractor) |
| 87 if url_override: |
| 88 features['url'] = url_override |
| 89 data['features'] = features |
| 90 |
| 91 logs = driver.get_log('browser') |
| 92 native = nativeClassification(logs) |
| 93 if native: |
| 94 native['features'] = nativeFeatures(logs) |
| 95 data['native'] = native |
| 96 |
| 97 with open(filename, 'w') as outf: |
| 98 json.dump(data, outf, indent=2, sort_keys=True) |
| 99 print "saved %s" % filename |
| 100 |
| 101 derived = dict.copy(data) |
| 102 derived['features'] = CalcDerivedFeatures(data['index'], features) |
| 103 |
| 104 derived_name = filename + '-derived' |
| 105 with open(derived_name, 'w') as outf: |
| 106 json.dump(derived, outf, indent=2, sort_keys=True) |
| 107 print "saved %s" % derived_name |
| 108 return data, derived |
| 109 |
| 110 def saveInfoFile(data, ss, dss, filename): |
| 111 data = dict.copy(data) |
| 112 data['screenshot'] = ss |
| 113 data['distilled'] = dss |
| 114 with open(filename, 'w') as info: |
| 115 json.dump(data, info) |
| 116 |
| 117 def saveMHTML(filename): |
| 118 """Save current page as an MHTML file |
| 119 |
| 120 This is done by issuing xdotool commands. |
| 121 Dependencies: |
| 122 - Command line argument "--save-page-as-mhtml" to Chrome. |
| 123 - xdotool |
| 124 """ |
| 125 |
| 126 cmd = ( |
| 127 'xdotool key --clearmodifiers "ctrl+s" && ' + |
| 128 'sleep 1 && ' + |
| 129 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' + |
| 130 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' + |
| 131 'xdotool type --delay 10 --clearmodifiers "%s" && ' + |
| 132 'xdotool key --delay 20 --clearmodifiers Return' |
| 133 ) % (os.path.abspath(filename)) |
| 134 os.system(cmd) |
| 135 time.sleep(3) # wait for file saving |
| 136 if not os.path.exists(filename): |
| 137 return False |
| 138 print "saved %s" % filename |
| 139 return True |
| 140 |
46 def writeAggregated(outdir, ext, out, in_marshal=False): | 141 def writeAggregated(outdir, ext, out, in_marshal=False): |
47 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)] | 142 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)] |
48 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)
[1] == '.' + ext] | 143 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)
[1] == '.' + ext] |
49 output = [] | 144 output = [] |
50 print 'reading %s files' % (ext) | 145 print 'reading %s files' % (ext) |
51 for f in prevfiles: | 146 for f in prevfiles: |
52 with open(f) as infofile: | 147 with open(f) as infofile: |
53 info = json.load(infofile) | 148 info = json.load(infofile) |
54 output.append(info) | 149 output.append(info) |
55 print 'done reading %s files' % (ext) | 150 print 'done reading %s files' % (ext) |
56 | 151 |
57 output = sorted(output, key=lambda k: k['index']) | 152 output = sorted(output, key=lambda k: k['index']) |
58 print 'writing %s files' % (ext) | 153 print 'writing %s files' % (ext) |
59 with open('%s/%s' % (outdir, out), 'w') as outf: | 154 with open('%s/%s' % (outdir, out), 'w') as outf: |
60 if in_marshal: | 155 if in_marshal: |
61 import marshal | 156 import marshal |
62 marshal.dump(output, outf) | 157 marshal.dump(output, outf) |
63 else: | 158 else: |
64 json.dump(output, outf, indent=2) | 159 json.dump(output, outf, indent=2) |
65 print 'done writing %s files' % (ext) | 160 print 'done writing %s files' % (ext) |
66 | 161 |
67 def writeIndex(outdir): | 162 def writeIndex(outdir): |
68 writeAggregated(outdir, "info", "index") | 163 writeAggregated(outdir, "info", "index") |
69 | 164 |
70 def writeFeature(outdir): | 165 def writeFeature(outdir): |
71 writeAggregated(outdir, "feature", "feature", in_marshal=True) | 166 for n in ["feature-derived", "dfeature-derived", "mfeature-derived", "mdfeatur
e-derived"]: |
| 167 writeAggregated(outdir, n, n) |
| 168 # Use the following when needing aggregated raw features: |
| 169 #writeAggregated(outdir, "feature", "feature", in_marshal=True) |
| 170 #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True) |
| 171 #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True) |
| 172 #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True) |
| 173 |
| 174 def shouldProcess(load_mhtml, no_distill, prefix): |
| 175 info = prefix + '.info' |
| 176 mhtml = prefix + '.mhtml' |
| 177 mfeature = prefix + '.mfeature' |
| 178 mdfeature = prefix + '.mdfeature' |
| 179 if not load_mhtml: |
| 180 return not os.path.exists(info) |
| 181 else: |
| 182 if no_distill: |
| 183 return os.path.exists(mhtml) and not os.path.exists(mfeature) |
| 184 else: |
| 185 return os.path.exists(mhtml) and not os.path.exists(mdfeature) |
72 | 186 |
73 def main(argv): | 187 def main(argv): |
74 parser = argparse.ArgumentParser() | 188 parser = argparse.ArgumentParser() |
75 parser.add_argument('--out', required=True) | 189 parser.add_argument('--out', required=True) |
76 parser.add_argument('urls', nargs='*') | 190 parser.add_argument('urls', nargs='*') |
77 parser.add_argument('--force', action='store_true') | 191 parser.add_argument('--force', action='store_true') |
78 parser.add_argument('--urls-file') | 192 parser.add_argument('--urls-file') |
| 193 parser.add_argument('--emulate-mobile', action='store_true') |
79 parser.add_argument('--resume', action='store_true') | 194 parser.add_argument('--resume', action='store_true') |
80 parser.add_argument('--write-index', action='store_true') | 195 parser.add_argument('--write-index', action='store_true') |
81 parser.add_argument('--save-mhtml', action='store_true') | 196 parser.add_argument('--save-mhtml', action='store_true') |
| 197 parser.add_argument('--load-mhtml', action='store_true') |
| 198 parser.add_argument('--skip-distillation', action='store_true') |
| 199 parser.add_argument('--desktop-distillable-only', action='store_true') |
82 options = parser.parse_args(argv) | 200 options = parser.parse_args(argv) |
83 | 201 |
| 202 if options.load_mhtml: |
| 203 if options.save_mhtml: |
| 204 print '--load-mhtml is not compatible with --save-mhtml' |
| 205 return 1 |
| 206 if options.resume: |
| 207 print '--load-mhtml is not compatible with --resume' |
| 208 return 1 |
| 209 |
84 outdir = options.out | 210 outdir = options.out |
85 if not options.resume: | 211 if not options.resume and not options.load_mhtml and not options.write_index: |
86 if os.path.exists(outdir): | 212 if os.path.exists(outdir): |
87 if not options.force: | 213 if not options.force: |
88 print outdir + ' exists' | 214 print outdir + ' exists' |
89 return 1 | 215 return 1 |
90 shutil.rmtree(outdir, ignore_errors=True) | 216 shutil.rmtree(outdir, ignore_errors=True) |
91 os.makedirs(outdir) | 217 os.makedirs(outdir) |
92 else: | 218 else: |
93 if not os.path.exists(outdir): | 219 if not os.path.exists(outdir): |
94 print outdir + ' doesn\'t exist' | 220 print outdir + ' doesn\'t exist' |
95 return 1 | 221 return 1 |
96 | 222 |
97 addBuildtoolsToPath() | 223 addBuildtoolsToPath() |
98 | 224 |
99 if options.urls: | 225 if options.urls: |
100 files = options.urls | 226 files = options.urls |
101 elif options.urls_file: | 227 elif options.urls_file: |
102 with open(options.urls_file) as u: | 228 with open(options.urls_file) as u: |
103 files = u.read().splitlines() | 229 files = u.read().splitlines() |
104 else: | 230 else: |
105 print 'oh no' | 231 print 'oh no' |
106 return 1 | 232 return 1 |
107 | 233 |
108 if options.write_index: | 234 if options.write_index: |
109 writeIndex(outdir) | 235 writeIndex(outdir) |
110 writeFeature(outdir) | 236 writeFeature(outdir) |
111 print 'index is written' | 237 print 'index is written' |
112 return 0 | 238 return 0 |
113 | 239 |
114 driver = newDriver() | 240 driver = newDriver(options.emulate_mobile) |
115 | 241 |
116 feature_extractor = open('extract_features.js').read() | 242 feature_extractor = open('extract_features.js').read() |
117 | 243 |
118 try: | 244 try: |
119 jobs = list(enumerate(files)) | 245 jobs = list(enumerate(files)) |
120 random.shuffle(jobs) | 246 random.shuffle(jobs) |
121 for i, f in jobs: | 247 for i, f in jobs: |
122 prefix = '%s/%d' % (outdir, i) | 248 prefix = '%s/%d' % (outdir, i) |
123 info = '%s.info' % prefix | 249 info = '%s.info' % prefix |
| 250 basedata = {'index': i, 'url': f} |
124 | 251 |
125 if os.path.exists(info): | 252 if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix
): |
126 print "skip %d" % (i) | 253 print "skip %d" % (i) |
127 continue; | 254 continue; |
128 | 255 |
129 with FileLock('%s.lock' % (prefix)): | 256 with FileLock('%s.lock' % (prefix)): |
130 if os.path.exists(info): | 257 if not shouldProcess(options.load_mhtml, options.skip_distillation, pref
ix): |
131 print "SKIP %d" % (i) | 258 print "skip %d" % (i) |
132 continue; | 259 continue; |
133 try: | 260 try: |
134 ss = '%s.png' % prefix | 261 ss = '%s.png' % prefix |
135 dss = '%s-distilled.png' % prefix | 262 dss = '%s-distilled.png' % prefix |
136 fea = '%s.feature' % prefix | 263 fea = '%s.feature' % prefix |
| 264 dfea = '%s.dfeature' % prefix |
| 265 mhtml = '%s.mhtml' % prefix |
| 266 mhtml_url = 'file://%s' % os.path.abspath(mhtml) |
137 | 267 |
138 driver.set_window_size(1280, 5000) | 268 if options.emulate_mobile: |
139 driver.get(f) | 269 driver.set_window_size(400, 800) |
140 time.sleep(3) # wait for some async scripts | 270 else: |
141 driver.save_screenshot(ss) | 271 driver.set_window_size(1280, 5000) |
142 print "saved %s" % ss | 272 if options.load_mhtml: |
| 273 if not os.path.exists(mhtml): |
| 274 print "SKIP %d, no mhtml" % (i) |
| 275 continue |
| 276 driver.get(mhtml_url) |
| 277 time.sleep(1) # wait a bit for things to stablize |
| 278 else: |
| 279 driver.get(f) |
| 280 time.sleep(3) # wait for some async scripts |
| 281 driver.save_screenshot(ss) |
| 282 print "saved %s" % ss |
143 | 283 |
144 features = driver.execute_script(feature_extractor) | 284 url_override = None |
145 data = { | 285 if options.load_mhtml: |
146 'index': i, | 286 with open(fea) as infile: |
147 'url': f, | 287 # otherwise it would be file:// of mhtml |
148 'features': features | 288 url_override = json.load(infile)['features']['url'] |
149 } | 289 fea = '%s.mfeature' % prefix |
150 with open(fea, 'w') as outf: | 290 _, derived = saveFeatures(driver, feature_extractor, basedata, url_ove
rride, fea) |
151 json.dump(data, outf, indent=2) | 291 |
152 print "saved %s" % fea | 292 if options.desktop_distillable_only: |
| 293 if derived['native']['features']['isMobileFriendly'] or not derived[
'native']['distillable']: |
| 294 os.system('rm %s.feature %s.png' % (prefix, prefix)) |
| 295 saveInfoFile(basedata, ss, dss, info) |
| 296 continue |
153 | 297 |
154 if options.save_mhtml: | 298 if options.save_mhtml: |
155 mhtml = '%s.mhtml' % prefix | 299 if not saveMHTML(mhtml): |
156 cmd = ( | |
157 'xdotool key --clearmodifiers "ctrl+s" && ' + | |
158 'sleep 1 && ' + | |
159 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' + | |
160 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && '
+ | |
161 'xdotool type --delay 10 --clearmodifiers "%s" && ' + | |
162 'xdotool key --delay 20 --clearmodifiers Return' | |
163 ) % (os.getcwd() + '/' + mhtml) | |
164 os.system(cmd) | |
165 time.sleep(3) # wait for file saving | |
166 if not os.path.exists(mhtml): | |
167 # If the file is not saved, the focus point might be lost. | 300 # If the file is not saved, the focus point might be lost. |
168 # Restart the whole xvfb environment to be safe. | 301 # Restart the whole xvfb environment to be safe. |
169 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtm
l) | 302 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtm
l) |
170 break | 303 break |
171 | 304 |
172 driver.set_window_size(640, 5000) | 305 if options.skip_distillation: |
| 306 continue |
| 307 |
| 308 if options.emulate_mobile: |
| 309 driver.set_window_size(400, 800) |
| 310 else: |
| 311 driver.set_window_size(640, 5000) |
| 312 |
| 313 if options.load_mhtml: |
| 314 driver.get(getDistillerUrl(mhtml_url)) |
| 315 time.sleep(10) |
| 316 dss = '%s-mdistilled.png' % prefix |
| 317 driver.save_screenshot(dss) |
| 318 print "saved %s" % dss |
| 319 dfea = '%s.mdfeature' % prefix |
| 320 saveFeatures(driver, feature_extractor, basedata, None, dfea) |
| 321 continue |
| 322 |
173 driver.get(getDistillerUrl(f)) | 323 driver.get(getDistillerUrl(f)) |
174 time.sleep(20) # wait for multi-page, etc | 324 for i in range(3): |
175 driver.save_screenshot(dss) | 325 time.sleep(20) # wait for multi-page, etc |
176 print "saved %s" % dss | 326 driver.save_screenshot(dss) |
| 327 print "saved %s" % dss |
| 328 feature, _ = saveFeatures(driver, feature_extractor, basedata, None,
dfea) |
| 329 if feature['features']['innerText'] != "": |
| 330 break |
177 | 331 |
178 data = { | 332 saveInfoFile(basedata, ss, dss, info) |
179 'index': i, | |
180 'url': f, | |
181 'screenshot': ss, | |
182 'distilled': dss, | |
183 } | |
184 with open(info, 'w') as info: | |
185 json.dump(data, info) | |
186 | 333 |
187 except Exception as e: | 334 except Exception as e: |
188 print e | 335 print e |
189 print "Index=%d URL=%s" % (i, f) | 336 print "Index=%d URL=%s" % (i, f) |
190 driver.quit() | 337 driver.quit() |
191 driver = newDriver() | 338 driver = newDriver() |
192 pass | 339 pass |
193 | 340 |
194 finally: | 341 finally: |
195 driver.quit() | 342 driver.quit() |
196 | 343 |
197 return 0 | 344 return 0 |
198 | 345 |
199 if __name__ == '__main__': | 346 if __name__ == '__main__': |
200 sys.exit(main(sys.argv[1:])) | 347 sys.exit(main(sys.argv[1:])) |
201 | 348 |
OLD | NEW |