heuristics/distillable/write_features_csv.py - Issue 1808503002: Update distillability modeling scripts to predict long articles

Side by Side Diff: heuristics/distillable/write_features_csv.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible

Patch Set: update docs Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 #!/usr/bin/env python	1 #!/usr/bin/env python

2 # Copyright 2016 The Chromium Authors. All rights reserved.	2 # Copyright 2016 The Chromium Authors. All rights reserved.

3 # Use of this source code is governed by a BSD-style license that can be	3 # Use of this source code is governed by a BSD-style license that can be

4 # found in the LICENSE file.	4 # found in the LICENSE file.

5	5

6 import argparse	6 import argparse

7 import csv	7 import csv

8 import json	8 import json

9 import os	9 import os

10 import shutil	10 import shutil

11 import sys	11 import sys

	12 import unittest

	13

	14 def filter_fields(header, data, fields):

	15 """Filter (header, data) with selected header fields.

	16

	17 Args:

	18 header ([str]): The header.

	19 data ([[float]]): The data, with the same number of columns as header.

	20 fields ([str]): The fields that need to be written.

	21

	22 Returns:

	23 (header ([str]), data ([[float]]))

	24

	25 Examples:

	26 >>> filter_fields(['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])

	27 (['b', 'a'], [[1, 0], [4, 3]])

	28 """

	29

	30 picked = []

	31 for f in fields:

	32 try:

	33 picked.append(header.index(f))

	34 except ValueError:

	35 # OK to have missing values

	36 pass

	37

	38 h = [header[i] for i in picked]

	39 d = []

	40 for e in data:

	41 d.append([e[i] for i in picked])

	42 return (h, d)

	43

	44 def write_features(filename, header, data, fields):

	45 """Write (header, data) to filename in CSV format, with selected fields.

	46

	47 Args:

	48 filename (str): The output filename.

	49 header ([str]): The header.

	50 data ([[float]]): The data, with the same number of columns as header.

	51 fields ([str]): The fields that need to be written.

	52

	53 Examples:

	54 >>> write_features(None, ['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])

	55 b,a

	56 1,0

	57 4,3

	58 """

	59

	60 (header, data) = filter_fields(header, data, fields)

	61

	62 if filename:

	63 writer = csv.writer(open(filename, 'w'))

	64 else:

	65 writer = csv.writer(sys.stdout, lineterminator="\n")

	66

	67 writer.writerow(header)

	68 writer.writerows(data)

	69

	70 def getGroups(header):

	71 """Return groups of header fields

	72

	73 Returns:

	74 dict of name (str): fields ([str])

	75 """

	76 groups = {}

	77 groupPath = [

	78 'forum',

	79 'index',

	80 'search',

	81 'view',

	82 'archive',

	83 'asp',

	84 'phpbb',

	85 'php',

	86 'pathLength',

	87 'domain',

	88 'pathComponents',

	89 'slugDetector',

	90 'pathNumbers',

	91 'lastSegmentLength',

	92 ]

	93 groups['path'] = groupPath

	94

	95 groupNumElement = [

	96 'numElements',

	97 'numAnchors',

	98 'anchorRatio',

	99 'numForms',

	100 'numTextInput',

	101 'numPasswordInput',

	102 'numPPRE',

	103 ]

	104 groups['numElement'] = groupNumElement

	105

	106 groupVisibleElement = [

	107 'visibleElements',

	108 'visibleAnchors',

	109 'visiblePPRE',

	110 'visibleAnchorOverPPre',

	111 ]

	112 groups['visibleElement'] = groupVisibleElement

	113

	114 groupEntries = [

	115 'numSection',

	116 'numSection2',

	117 'numSection3',

	118 'numArticle',

	119 'numArticle2',

	120 'numArticle3',

	121 'numEntries',

	122 'numEntries2',

	123 'numEntries3',

	124 'numH1',

	125 'numH2',

	126 'numH3',

	127 'numH4',

	128 'headCountSum',

	129 'headCountMax',

	130 'entryCountSum',

	131 'entryCountMax',

	132 ]

	133 groups['entries'] = groupEntries

	134

	135 groupV1 = [

	136 'openGraph',

	137

	138 'forum',

	139 'index',

	140 'search',

	141 'view',

	142 'archive',

	143 'asp',

	144 'phpbb',

	145 'php',

	146 'pathLength',

	147 'domain',

	148 'pathComponents',

	149 'slugDetector',

	150 'pathNumbers',

	151 'lastSegmentLength',

	152

	153 'formCount',

	154 'anchorCount',

	155 'elementCount',

	156 'anchorRatio',

	157

	158 'mozScore',

	159 'mozScoreAllSqrt',

	160 'mozScoreAllLinear',

	161 ]

	162 groups['v1'] = groupV1

	163

	164 groupV1NoPath = [

	165 'openGraph',

	166

	167 'formCount',

	168 'anchorCount',

	169 'elementCount',

	170 'anchorRatio',

	171

	172 'mozScore',

	173 'mozScoreAllSqrt',

	174 'mozScoreAllLinear',

	175 ]

	176 groups['v1NoPath'] = groupV1NoPath

	177

	178 groups['allElement'] = groupNumElement + groupVisibleElement + groupEntries

	179 groups['mozScores'] = [f for f in header if 'moz' in f]

	180 groups['noText'] = [f for f in header if not ('inner' in f or 'Content' in f or 'WordCount' in f)]

	181

	182 return groups

12	183

13 def main(argv):	184 def main(argv):

14 parser = argparse.ArgumentParser()	185 parser = argparse.ArgumentParser()

15 parser.add_argument('--out', required=True)	186 parser.add_argument('--out', required=True, help="filename of output")

16 parser.add_argument('--marked', required=True)	187 parser.add_argument('--marked', help="filename of marked output")

17 parser.add_argument('--features', required=True)	188 parser.add_argument('--distilled', help="filename of derived features of distilled content")

	189 parser.add_argument('--features', required=True, help="filename of derived features")

18 options = parser.parse_args(argv)	190 options = parser.parse_args(argv)

19	191

20 marked = None	192 if (options.marked is None) + (options.distilled is None) != 1:

21 with open(options.marked) as markedin:	193 print 'Use exactly one of --marked or --distilled.'

22 marked = json.load(markedin)	194 os.exit(1)

23	195

24 features = None

25 with open(options.features) as features:	196 with open(options.features) as features:

26 features = json.load(features)	197 features = json.load(features)

27	198

28 markedMap = dict()	199 if options.marked:

29 # good:	200 with open(options.marked) as markedin:

30 # -1 error	201 marked = json.load(markedin)

31 # 0 bad	202

32 # 1 good	203 markedMap = dict()

33 # 2 good w/error	204 # good:

34 for m in marked:	205 # -1 error

35 if not 'good' in m:	206 # 0 bad

36 continue	207 # 1 good

37 if m['good'] < 0:	208 # 2 good w/error

38 continue	209 for m in marked:

39 markedMap[m['url']] = m	210 if not 'good' in m:

40	211 continue

41 merged = []	212 if m['good'] < 0:

42 for f in features:	213 continue

43 url = f['url']	214 markedMap[m['url'].strip()] = m

44 if not url in markedMap:	215

45 continue	216 print "Loaded %d labeled entries" % (len(markedMap))

46 merged.append(map(float, [0 if markedMap[url]['good'] == 0 else 1] + f['features'][1::2]))	217

47	218 merged = []

48 header = ['good'] + map(str, features[0]['features'][::2])	219 for f in features:

49	220 url = f['url']

50 with open(options.out, 'w') as csvfile:	221 if not url in markedMap:

51 writer = csv.writer(csvfile)	222 continue

52 writer.writerow(header)	223 merged.append(map(float, [0 if markedMap[url]['good'] == 0 else 1] + f['features'][1::2]))

53 for e in merged:	224 print "Merged %d entries" % (len(merged))

54 writer.writerow(e)	225

	226 if options.distilled:

	227 with open(options.distilled) as markedin:

	228 marked = json.load(markedin)

	229

	230 markedMap = dict()

	231 for m in marked:

	232 feature = m['features']

	233 feature = dict(zip(feature[::2], feature[1::2]))

	234 if feature['innerTextLength'] == 0:

	235 continue

	236 m['features'] = feature

	237 markedMap[m['url'].strip()] = m

	238

	239 print "Loaded %d distilled entries" % (len(markedMap))

	240

	241 merged = []

	242 for f in features:

	243 url = f['url']

	244 if not url in markedMap:

	245 continue

	246 if f['native']['features']['isMobileFriendly'] == 1:

	247 continue

	248 if f['native']['distillable'] != 1:

	249 continue

	250 feature = markedMap[url]['features']

	251 merged.append(map(float, [0 if feature['innerTextLength'] < 1000 else 1] + f['features'][1::2]))

	252 print "Merged %d entries" % (len(merged))

	253

	254 feature_headers = map(str, features[0]['features'][::2])

	255 header = ['good'] + feature_headers

	256

	257 write_features(options.out, header, merged, header)

	258

	259 # write datasets with a single feature

	260 outbase = os.path.splitext(options.out)[0]

	261 for s in feature_headers:

	262 print 'Single feature: %s' % s

	263 write_features('%s-feature-%s.csv' % (outbase, s), header, merged, ['good', s])

	264

	265 # write datasets with feature groups

	266 for (name, g) in getGroups(feature_headers).iteritems():

	267 print 'Feature group: %s' % name

	268 write_features('%s-group-%s.csv' % (outbase, name), header, merged, ['good'] + g)

55	269

56 return 0	270 return 0

57	271

58 if __name__ == '__main__':	272 if __name__ == '__main__':

59 sys.exit(main(sys.argv[1:]))	273 sys.exit(main(sys.argv[1:]))

60

OLD	NEW

« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »