Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(833)

Side by Side Diff: heuristics/distillable/write_features_csv.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLD | NEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved. 2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 import argparse 6 import argparse
7 import csv 7 import csv
8 import json 8 import json
9 import os 9 import os
10 import shutil 10 import shutil
11 import sys 11 import sys
12 import unittest
13
def filter_fields(header, data, fields):
    """Project (header, data) onto the requested header fields.

    Columns come out in the order given by ``fields``. A requested field
    that does not appear in ``header`` is skipped without error.

    Args:
      header ([str]): The header.
      data ([[float]]): The data, with the same number of columns as header.
      fields ([str]): The fields to keep.

    Returns:
      (header ([str]), data ([[float]])) restricted to the kept columns.

    Examples:
      >>> filter_fields(['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
      (['b', 'a'], [[1, 0], [4, 3]])
    """
    # It is OK for a requested field to be missing, so only resolve the
    # names that the header actually contains.
    columns = [header.index(name) for name in fields if name in header]

    kept_header = [header[col] for col in columns]
    kept_rows = [[row[col] for col in columns] for row in data]
    return (kept_header, kept_rows)
43
def write_features(filename, header, data, fields):
    """Write (header, data) to filename in CSV format, with selected fields.

    The columns are first narrowed with filter_fields(), so fields that are
    not present in header are silently left out of the output.

    Args:
      filename (str): The output filename. If falsy (e.g. None), the CSV is
        written to sys.stdout instead, using "\\n" as the line terminator.
      header ([str]): The header.
      data ([[float]]): The data, with the same number of columns as header.
      fields ([str]): The fields that need to be written.

    Examples:
      >>> write_features(None, ['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
      b,a
      1,0
      4,3
    """
    (header, data) = filter_fields(header, data, fields)

    if filename:
        # Use a context manager so the file is flushed and closed as soon as
        # the rows are written; this function is called many times per run.
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(data)
    else:
        writer = csv.writer(sys.stdout, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(data)
69
def getGroups(header):
    """Return named groups of header fields.

    Most groups are fixed lists of feature names. The 'mozScores' and
    'noText' groups are instead selected from ``header`` by substring match.

    Args:
      header ([str]): Feature names to draw the header-dependent groups from.

    Returns:
      dict of name (str): fields ([str])
    """
    path_fields = [
        'forum',
        'index',
        'search',
        'view',
        'archive',
        'asp',
        'phpbb',
        'php',
        'pathLength',
        'domain',
        'pathComponents',
        'slugDetector',
        'pathNumbers',
        'lastSegmentLength',
    ]

    num_element_fields = [
        'numElements',
        'numAnchors',
        'anchorRatio',
        'numForms',
        'numTextInput',
        'numPasswordInput',
        'numPPRE',
    ]

    visible_element_fields = [
        'visibleElements',
        'visibleAnchors',
        'visiblePPRE',
        'visibleAnchorOverPPre',
    ]

    entry_fields = [
        'numSection',
        'numSection2',
        'numSection3',
        'numArticle',
        'numArticle2',
        'numArticle3',
        'numEntries',
        'numEntries2',
        'numEntries3',
        'numH1',
        'numH2',
        'numH3',
        'numH4',
        'headCountSum',
        'headCountMax',
        'entryCountSum',
        'entryCountMax',
    ]

    # The v1 group is 'openGraph', then the path fields, then these counts
    # and scores; v1NoPath is the same list without the path fields.
    v1_counts_and_scores = [
        'formCount',
        'anchorCount',
        'elementCount',
        'anchorRatio',
        'mozScore',
        'mozScoreAllSqrt',
        'mozScoreAllLinear',
    ]

    # Substrings that mark a feature as text-derived for the 'noText' group.
    text_markers = ('inner', 'Content', 'WordCount')

    groups = {}
    groups['path'] = path_fields
    groups['numElement'] = num_element_fields
    groups['visibleElement'] = visible_element_fields
    groups['entries'] = entry_fields
    groups['v1'] = ['openGraph'] + path_fields + v1_counts_and_scores
    groups['v1NoPath'] = ['openGraph'] + v1_counts_and_scores
    groups['allElement'] = (
        num_element_fields + visible_element_fields + entry_fields)
    groups['mozScores'] = [name for name in header if 'moz' in name]
    groups['noText'] = [
        name for name in header
        if not any(marker in name for marker in text_markers)
    ]
    return groups
12 183
def _merge_labeled(marked_path, features):
    """Return CSV rows for the features whose URL has a usable manual label.

    Args:
      marked_path (str): JSON file holding a list of entries with a 'url'
        and, optionally, a 'good' label.
      features ([dict]): Derived features; each entry has a 'url' and a flat
        'features' list alternating name, value.

    Returns:
      [[float]]: One row per matched entry: the label (0 or 1) followed by
        the feature values.
    """
    with open(marked_path) as markedin:
        marked = json.load(markedin)

    # good:
    #   -1 error
    #    0 bad
    #    1 good
    #    2 good w/error
    # Entries without a label, or labeled as an error, are dropped.
    markedMap = {}
    for m in marked:
        if 'good' not in m or m['good'] < 0:
            continue
        markedMap[m['url'].strip()] = m

    print("Loaded %d labeled entries" % len(markedMap))

    merged = []
    for f in features:
        url = f['url']
        if url not in markedMap:
            continue
        # Both "good" and "good w/error" count as the positive class.
        label = 0 if markedMap[url]['good'] == 0 else 1
        merged.append([float(v) for v in [label] + f['features'][1::2]])
    print("Merged %d entries" % len(merged))
    return merged


def _merge_distilled(distilled_path, features):
    """Return CSV rows labeled by the length of the distilled content.

    Args:
      distilled_path (str): JSON file holding a list of entries with a 'url'
        and a flat 'features' list alternating name, value, which must
        include 'innerTextLength'.
      features ([dict]): Derived features; each entry has 'url', a flat
        'features' list alternating name, value, and a 'native' dict.

    Returns:
      [[float]]: One row per matched entry: the label (1 when the distilled
        innerTextLength is at least 1000, else 0) followed by the feature
        values.
    """
    with open(distilled_path) as markedin:
        marked = json.load(markedin)

    markedMap = {}
    for m in marked:
        feature = m['features']
        # Turn the flat [name, value, name, value, ...] list into a dict.
        feature = dict(zip(feature[::2], feature[1::2]))
        if feature['innerTextLength'] == 0:
            continue
        m['features'] = feature
        markedMap[m['url'].strip()] = m

    print("Loaded %d distilled entries" % len(markedMap))

    merged = []
    for f in features:
        url = f['url']
        if url not in markedMap:
            continue
        # Only keep pages that are not mobile friendly and that the native
        # result marks as distillable.
        if f['native']['features']['isMobileFriendly'] == 1:
            continue
        if f['native']['distillable'] != 1:
            continue
        feature = markedMap[url]['features']
        label = 0 if feature['innerTextLength'] < 1000 else 1
        merged.append([float(v) for v in [label] + f['features'][1::2]])
    print("Merged %d entries" % len(merged))
    return merged


def main(argv):
    """Merge derived features with labels and write the CSV datasets.

    Writes the full dataset to --out, then one '<out base>-feature-<name>.csv'
    per feature and one '<out base>-group-<name>.csv' per group returned by
    getGroups(). Every file has 'good' as its first column.

    Args:
      argv ([str]): Command-line arguments, without the program name.
        Exactly one of --marked or --distilled must be given.

    Returns:
      int: 0 on success. Exits with status 1 if the --marked/--distilled
        combination is invalid.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True, help="filename of output")
    parser.add_argument('--marked', help="filename of marked output")
    parser.add_argument('--distilled', help="filename of derived features of distilled content")
    parser.add_argument('--features', required=True, help="filename of derived features")
    options = parser.parse_args(argv)

    if (options.marked is None) + (options.distilled is None) != 1:
        print('Use exactly one of --marked or --distilled.')
        # sys.exit, not os.exit: the os module has no exit() function.
        sys.exit(1)

    with open(options.features) as features_in:
        features = json.load(features_in)

    # The check above guarantees exactly one of the two sources is set.
    if options.marked is not None:
        merged = _merge_labeled(options.marked, features)
    else:
        merged = _merge_distilled(options.distilled, features)

    # Feature names sit at the even positions of the flat 'features' list.
    feature_headers = [str(name) for name in features[0]['features'][::2]]
    header = ['good'] + feature_headers

    write_features(options.out, header, merged, header)

    # write datasets with a single feature
    outbase = os.path.splitext(options.out)[0]
    for s in feature_headers:
        print('Single feature: %s' % s)
        write_features('%s-feature-%s.csv' % (outbase, s), header, merged, ['good', s])

    # write datasets with feature groups
    for (name, g) in getGroups(feature_headers).items():
        print('Feature group: %s' % name)
        write_features('%s-group-%s.csv' % (outbase, name), header, merged, ['good'] + g)

    return 0
57 271
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
60
OLD | NEW
« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698