heuristics/distillable/get_screenshots.py - Issue 1808503002: Update distillability modeling scripts to predict long articles

Side by Side Diff: heuristics/distillable/get_screenshots.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible

Patch Set: update docs Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # Copyright 2016 The Chromium Authors. All rights reserved.	2 # Copyright 2016 The Chromium Authors. All rights reserved.

3 # Use of this source code is governed by a BSD-style license that can be	3 # Use of this source code is governed by a BSD-style license that can be

4 # found in the LICENSE file.	4 # found in the LICENSE file.

5	5

6 import argparse	6 import argparse

7 import json	7 import json

8 import os	8 import os

9 import shutil	9 import shutil

10 import sys	10 import sys

11 import time	11 import time

12 import urllib	12 import urllib

13 import random	13 import random

14 from lockfile import FileLock	14 from lockfile import FileLock

15	15

	16 from calculate_derived_features import CalcDerivedFeatures

	17

16 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))	18 repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))

17	19

18 try:	20 try:

19 from selenium import webdriver	21 from selenium import webdriver

	22 from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

20 except:	23 except:

21 print 'ERROR:'	24 print 'ERROR:'

22 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`. ' % repo_root	25 print 'Couldn\'t import webdriver. Please run `sudo %s/install-build-deps.sh`. ' % repo_root

23 sys.exit(1)	26 sys.exit(1)

24	27

25 def addBuildtoolsToPath():	28 def addBuildtoolsToPath():

26 envPath = os.environ['PATH']	29 envPath = os.environ['PATH']

27 buildtoolsPath = repo_root + '/buildtools'	30 buildtoolsPath = repo_root + '/buildtools'

28 if not buildtoolsPath in envPath:	31 if not buildtoolsPath in envPath:

29 os.environ['PATH'] = buildtoolsPath + ':' + envPath	32 os.environ['PATH'] = buildtoolsPath + ':' + envPath

30	33

31 def getDistillerUrl(u):	34 def getDistillerUrl(u):

32 params = { 'url': u}	35 params = { 'url': u}

33 return "chrome-distiller://blah/?" + urllib.urlencode(params)	36 return "chrome-distiller://blah/?" + urllib.urlencode(params)

34	37

35 def newDriver():	38 def newDriver(mobile=False):

36 chromeOptions = webdriver.ChromeOptions()	39 chromeOptions = webdriver.ChromeOptions()

37 chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";	40 # If you want to use a different version of chrome, specify the full path here.

	41 #chromeOptions.binary_location = "/usr/bin/google-chrome-unstable";

38 chromeOptions.add_argument('--enable-dom-distiller')	42 chromeOptions.add_argument('--enable-dom-distiller')

39 chromeOptions.add_argument('--save-page-as-mhtml')	43 chromeOptions.add_argument('--save-page-as-mhtml')

40 driver = webdriver.Chrome(chrome_options=chromeOptions)	44 chromeOptions.add_argument('--reader-mode-heuristics=adaboost')

	45 chromeOptions.add_argument('--distillability-dev')

	46 if mobile:

	47 mobile_emulation = { "deviceName": "Google Nexus 5" }

	48 chromeOptions.add_experimental_option("mobileEmulation", mobile_emulation)

	49

	50 d = DesiredCapabilities.CHROME

	51 # This is to enable accessing devtools console log from here, for nativeFeatures().

	52 d['loggingPrefs'] = {'browser': 'ALL'}

	53 driver = webdriver.Chrome(chrome_options=chromeOptions, desired_capabilities=d)

41 driver.set_page_load_timeout(60)	54 driver.set_page_load_timeout(60)

42 driver.set_script_timeout(60)	55 driver.set_script_timeout(60)

43 print "created a new chrome driver"	56 print "created a new chrome driver"

44 return driver	57 return driver

45	58

	59 def nativeFeatures(logs):

	60 return _parseNative(logs, 'distillability_features = ')

	61

	62 def nativeClassification(logs):

	63 return _parseNative(logs, 'adaboost_classification = ')

	64

	65 def _parseNative(logs, needle):

	66 """Parse console logs from Chrome and get decoded JSON.

	67

	68 Args:

	69 logs: Chrome log object

	70 needle (str): the string leading the actual JSON.

	71

	72 Example:

	73 >>> _parseNative([{'message':'a=b'},{'message':'ac={"a":[1,2]}'}],'c=')

	74 {u'a': [1, 2]}

	75 """

	76 ret = None

	77 for log in logs:

	78 message = log['message']

	79 loc = message.find(needle)

	80 if loc >= 0:

	81 ret = json.loads(message[loc+len(needle):])

	82 return ret

	83

	84 def saveFeatures(driver, feature_extractor, data, url_override, filename):

	85 data = dict.copy(data)

	86 features = driver.execute_script(feature_extractor)

	87 if url_override:

	88 features['url'] = url_override

	89 data['features'] = features

	90

	91 logs = driver.get_log('browser')

	92 native = nativeClassification(logs)

	93 if native:

	94 native['features'] = nativeFeatures(logs)

	95 data['native'] = native

	96

	97 with open(filename, 'w') as outf:

	98 json.dump(data, outf, indent=2, sort_keys=True)

	99 print "saved %s" % filename

	100

	101 derived = dict.copy(data)

	102 derived['features'] = CalcDerivedFeatures(data['index'], features)

	103

	104 derived_name = filename + '-derived'

	105 with open(derived_name, 'w') as outf:

	106 json.dump(derived, outf, indent=2, sort_keys=True)

	107 print "saved %s" % derived_name

	108 return data, derived

	109

	110 def saveInfoFile(data, ss, dss, filename):

	111 data = dict.copy(data)

	112 data['screenshot'] = ss

	113 data['distilled'] = dss

	114 with open(filename, 'w') as info:

	115 json.dump(data, info)

	116

	117 def saveMHTML(filename):

	118 """Save current page as an MHTML file

	119

	120 This is done by issuing xdotool commands.

	121 Dependencies:

	122 - Command line argument "--save-page-as-mhtml" to Chrome.

	123 - xdotool

	124 """

	125

	126 cmd = (

	127 'xdotool key --clearmodifiers "ctrl+s" && ' +

	128 'sleep 1 && ' +

	129 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +

	130 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +

	131 'xdotool type --delay 10 --clearmodifiers "%s" && ' +

	132 'xdotool key --delay 20 --clearmodifiers Return'

	133 ) % (os.path.abspath(filename))

	134 os.system(cmd)

	135 time.sleep(3) # wait for file saving

	136 if not os.path.exists(filename):

	137 return False

	138 print "saved %s" % filename

	139 return True

	140

46 def writeAggregated(outdir, ext, out, in_marshal=False):	141 def writeAggregated(outdir, ext, out, in_marshal=False):

47 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]	142 prevfiles = [os.path.join(outdir, f) for f in os.listdir(outdir)]

48 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)[1] == '.' + ext]	143 prevfiles = [f for f in prevfiles if os.path.isfile(f) and os.path.splitext(f)[1] == '.' + ext]

49 output = []	144 output = []

50 print 'reading %s files' % (ext)	145 print 'reading %s files' % (ext)

51 for f in prevfiles:	146 for f in prevfiles:

52 with open(f) as infofile:	147 with open(f) as infofile:

53 info = json.load(infofile)	148 info = json.load(infofile)

54 output.append(info)	149 output.append(info)

55 print 'done reading %s files' % (ext)	150 print 'done reading %s files' % (ext)

56	151

57 output = sorted(output, key=lambda k: k['index'])	152 output = sorted(output, key=lambda k: k['index'])

58 print 'writing %s files' % (ext)	153 print 'writing %s files' % (ext)

59 with open('%s/%s' % (outdir, out), 'w') as outf:	154 with open('%s/%s' % (outdir, out), 'w') as outf:

60 if in_marshal:	155 if in_marshal:

61 import marshal	156 import marshal

62 marshal.dump(output, outf)	157 marshal.dump(output, outf)

63 else:	158 else:

64 json.dump(output, outf, indent=2)	159 json.dump(output, outf, indent=2)

65 print 'done writing %s files' % (ext)	160 print 'done writing %s files' % (ext)

66	161

67 def writeIndex(outdir):	162 def writeIndex(outdir):

68 writeAggregated(outdir, "info", "index")	163 writeAggregated(outdir, "info", "index")

69	164

70 def writeFeature(outdir):	165 def writeFeature(outdir):

71 writeAggregated(outdir, "feature", "feature", in_marshal=True)	166 for n in ["feature-derived", "dfeature-derived", "mfeature-derived", "mdfeature-derived"]:

	167 writeAggregated(outdir, n, n)

	168 # Use the following when needing aggregated raw features:

	169 #writeAggregated(outdir, "feature", "feature", in_marshal=True)

	170 #writeAggregated(outdir, "dfeature", "dfeature", in_marshal=True)

	171 #writeAggregated(outdir, "mfeature", "mfeature", in_marshal=True)

	172 #writeAggregated(outdir, "mdfeature", "mdfeature", in_marshal=True)

	173

	174 def shouldProcess(load_mhtml, no_distill, prefix):

	175 info = prefix + '.info'

	176 mhtml = prefix + '.mhtml'

	177 mfeature = prefix + '.mfeature'

	178 mdfeature = prefix + '.mdfeature'

	179 if not load_mhtml:

	180 return not os.path.exists(info)

	181 else:

	182 if no_distill:

	183 return os.path.exists(mhtml) and not os.path.exists(mfeature)

	184 else:

	185 return os.path.exists(mhtml) and not os.path.exists(mdfeature)

72	186

73 def main(argv):	187 def main(argv):

74 parser = argparse.ArgumentParser()	188 parser = argparse.ArgumentParser()

75 parser.add_argument('--out', required=True)	189 parser.add_argument('--out', required=True)

76 parser.add_argument('urls', nargs='*')	190 parser.add_argument('urls', nargs='*')

77 parser.add_argument('--force', action='store_true')	191 parser.add_argument('--force', action='store_true')

78 parser.add_argument('--urls-file')	192 parser.add_argument('--urls-file')

	193 parser.add_argument('--emulate-mobile', action='store_true')

79 parser.add_argument('--resume', action='store_true')	194 parser.add_argument('--resume', action='store_true')

80 parser.add_argument('--write-index', action='store_true')	195 parser.add_argument('--write-index', action='store_true')

81 parser.add_argument('--save-mhtml', action='store_true')	196 parser.add_argument('--save-mhtml', action='store_true')

	197 parser.add_argument('--load-mhtml', action='store_true')

	198 parser.add_argument('--skip-distillation', action='store_true')

	199 parser.add_argument('--desktop-distillable-only', action='store_true')

82 options = parser.parse_args(argv)	200 options = parser.parse_args(argv)

83	201

	202 if options.load_mhtml:

	203 if options.save_mhtml:

	204 print '--load-mhtml is not compatible with --save-mhtml'

	205 return 1

	206 if options.resume:

	207 print '--load-mhtml is not compatible with --resume'

	208 return 1

	209

84 outdir = options.out	210 outdir = options.out

85 if not options.resume:	211 if not options.resume and not options.load_mhtml and not options.write_index:

86 if os.path.exists(outdir):	212 if os.path.exists(outdir):

87 if not options.force:	213 if not options.force:

88 print outdir + ' exists'	214 print outdir + ' exists'

89 return 1	215 return 1

90 shutil.rmtree(outdir, ignore_errors=True)	216 shutil.rmtree(outdir, ignore_errors=True)

91 os.makedirs(outdir)	217 os.makedirs(outdir)

92 else:	218 else:

93 if not os.path.exists(outdir):	219 if not os.path.exists(outdir):

94 print outdir + ' doesn\'t exist'	220 print outdir + ' doesn\'t exist'

95 return 1	221 return 1

96	222

97 addBuildtoolsToPath()	223 addBuildtoolsToPath()

98	224

99 if options.urls:	225 if options.urls:

100 files = options.urls	226 files = options.urls

101 elif options.urls_file:	227 elif options.urls_file:

102 with open(options.urls_file) as u:	228 with open(options.urls_file) as u:

103 files = u.read().splitlines()	229 files = u.read().splitlines()

104 else:	230 else:

105 print 'oh no'	231 print 'oh no'

106 return 1	232 return 1

107	233

108 if options.write_index:	234 if options.write_index:

109 writeIndex(outdir)	235 writeIndex(outdir)

110 writeFeature(outdir)	236 writeFeature(outdir)

111 print 'index is written'	237 print 'index is written'

112 return 0	238 return 0

113	239

114 driver = newDriver()	240 driver = newDriver(options.emulate_mobile)

115	241

116 feature_extractor = open('extract_features.js').read()	242 feature_extractor = open('extract_features.js').read()

117	243

118 try:	244 try:

119 jobs = list(enumerate(files))	245 jobs = list(enumerate(files))

120 random.shuffle(jobs)	246 random.shuffle(jobs)

121 for i, f in jobs:	247 for i, f in jobs:

122 prefix = '%s/%d' % (outdir, i)	248 prefix = '%s/%d' % (outdir, i)

123 info = '%s.info' % prefix	249 info = '%s.info' % prefix

	250 basedata = {'index': i, 'url': f}

124	251

125 if os.path.exists(info):	252 if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):

126 print "skip %d" % (i)	253 print "skip %d" % (i)

127 continue;	254 continue;

128	255

129 with FileLock('%s.lock' % (prefix)):	256 with FileLock('%s.lock' % (prefix)):

130 if os.path.exists(info):	257 if not shouldProcess(options.load_mhtml, options.skip_distillation, prefix):

131 print "SKIP %d" % (i)	258 print "skip %d" % (i)

132 continue;	259 continue;

133 try:	260 try:

134 ss = '%s.png' % prefix	261 ss = '%s.png' % prefix

135 dss = '%s-distilled.png' % prefix	262 dss = '%s-distilled.png' % prefix

136 fea = '%s.feature' % prefix	263 fea = '%s.feature' % prefix

	264 dfea = '%s.dfeature' % prefix

	265 mhtml = '%s.mhtml' % prefix

	266 mhtml_url = 'file://%s' % os.path.abspath(mhtml)

137	267

138 driver.set_window_size(1280, 5000)	268 if options.emulate_mobile:

139 driver.get(f)	269 driver.set_window_size(400, 800)

140 time.sleep(3) # wait for some async scripts	270 else:

141 driver.save_screenshot(ss)	271 driver.set_window_size(1280, 5000)

142 print "saved %s" % ss	272 if options.load_mhtml:

	273 if not os.path.exists(mhtml):

	274 print "SKIP %d, no mhtml" % (i)

	275 continue

	276 driver.get(mhtml_url)

	277 time.sleep(1) # wait a bit for things to stabilize

	278 else:

	279 driver.get(f)

	280 time.sleep(3) # wait for some async scripts

	281 driver.save_screenshot(ss)

	282 print "saved %s" % ss

143	283

144 features = driver.execute_script(feature_extractor)	284 url_override = None

145 data = {	285 if options.load_mhtml:

146 'index': i,	286 with open(fea) as infile:

147 'url': f,	287 # otherwise it would be file:// of mhtml

148 'features': features	288 url_override = json.load(infile)['features']['url']

149 }	289 fea = '%s.mfeature' % prefix

150 with open(fea, 'w') as outf:	290 _, derived = saveFeatures(driver, feature_extractor, basedata, url_override, fea)

151 json.dump(data, outf, indent=2)	291

152 print "saved %s" % fea	292 if options.desktop_distillable_only:

	293 if derived['native']['features']['isMobileFriendly'] or not derived['native']['distillable']:

	294 os.system('rm %s.feature %s.png' % (prefix, prefix))

	295 saveInfoFile(basedata, ss, dss, info)

	296 continue

153	297

154 if options.save_mhtml:	298 if options.save_mhtml:

155 mhtml = '%s.mhtml' % prefix	299 if not saveMHTML(mhtml):

156 cmd = (

157 'xdotool key --clearmodifiers "ctrl+s" && ' +

158 'sleep 1 && ' +

159 'xdotool key --delay 20 --clearmodifier "Alt+n" && ' +

160 'xdotool key --delay 20 --clearmodifiers "ctrl+a" "BackSpace" && ' +

161 'xdotool type --delay 10 --clearmodifiers "%s" && ' +

162 'xdotool key --delay 20 --clearmodifiers Return'

163 ) % (os.getcwd() + '/' + mhtml)

164 os.system(cmd)

165 time.sleep(3) # wait for file saving

166 if not os.path.exists(mhtml):

167 # If the file is not saved, the focus point might be lost.	300 # If the file is not saved, the focus point might be lost.

168 # Restart the whole xvfb environment to be safe.	301 # Restart the whole xvfb environment to be safe.

169 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)	302 print "[ERROR] Snapshot of [%d] %s (%s) is missing." % (i, f, mhtml)

170 break	303 break

171	304

172 driver.set_window_size(640, 5000)	305 if options.skip_distillation:

	306 continue

	307

	308 if options.emulate_mobile:

	309 driver.set_window_size(400, 800)

	310 else:

	311 driver.set_window_size(640, 5000)

	312

	313 if options.load_mhtml:

	314 driver.get(getDistillerUrl(mhtml_url))

	315 time.sleep(10)

	316 dss = '%s-mdistilled.png' % prefix

	317 driver.save_screenshot(dss)

	318 print "saved %s" % dss

	319 dfea = '%s.mdfeature' % prefix

	320 saveFeatures(driver, feature_extractor, basedata, None, dfea)

	321 continue

	322

173 driver.get(getDistillerUrl(f))	323 driver.get(getDistillerUrl(f))

174 time.sleep(20) # wait for multi-page, etc	324 for i in range(3):

175 driver.save_screenshot(dss)	325 time.sleep(20) # wait for multi-page, etc

176 print "saved %s" % dss	326 driver.save_screenshot(dss)

	327 print "saved %s" % dss

	328 feature, _ = saveFeatures(driver, feature_extractor, basedata, None, dfea)

	329 if feature['features']['innerText'] != "":

	330 break

177	331

178 data = {	332 saveInfoFile(basedata, ss, dss, info)

179 'index': i,

180 'url': f,

181 'screenshot': ss,

182 'distilled': dss,

183 }

184 with open(info, 'w') as info:

185 json.dump(data, info)

186	333

187 except Exception as e:	334 except Exception as e:

188 print e	335 print e

189 print "Index=%d URL=%s" % (i, f)	336 print "Index=%d URL=%s" % (i, f)

190 driver.quit()	337 driver.quit()

191 driver = newDriver()	338 driver = newDriver()

192 pass	339 pass

193	340

194 finally:	341 finally:

195 driver.quit()	342 driver.quit()

196	343

197 return 0	344 return 0

198	345

199 if __name__ == '__main__':	346 if __name__ == '__main__':

200 sys.exit(main(sys.argv[1:]))	347 sys.exit(main(sys.argv[1:]))

201	348

OLD	NEW

« no previous file with comments | « heuristics/distillable/extract_features.js ('k') | heuristics/distillable/server.py » ('j') | no next file with comments »