Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(273)

Unified Diff: heuristics/distillable/write_features_csv.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/write_features_csv.py
diff --git a/heuristics/distillable/write_features_csv.py b/heuristics/distillable/write_features_csv.py
index 17e1fd08e58984af7ddb07480b4673b51026feb5..a366858e929ab6f163b77c27e2177725432bd7f7 100755
--- a/heuristics/distillable/write_features_csv.py
+++ b/heuristics/distillable/write_features_csv.py
@@ -9,52 +9,265 @@ import json
import os
import shutil
import sys
+import unittest
+
def filter_fields(header, data, fields):
    """Filter (header, data) with selected header fields.

    Columns come out in the order given by ``fields``. Names in ``fields``
    that do not occur in ``header`` are skipped, so callers may pass a
    superset of the available columns. If a name occurs more than once in
    ``header``, its first column is used.

    Args:
        header ([str]): The header.
        data ([[float]]): The data, with the same number of columns as header.
        fields ([str]): The fields that need to be written.

    Returns:
        (header ([str]), data ([[float]]))

    Examples:
        >>> filter_fields(['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
        (['b', 'a'], [[1, 0], [4, 3]])
    """
    # Resolve every column name once instead of running a linear
    # header.index() scan (with exception handling) per requested field.
    # setdefault keeps the first occurrence, matching list.index().
    column_of = {}
    for position, name in enumerate(header):
        column_of.setdefault(name, position)

    # OK to have missing values: unknown fields are simply left out.
    picked = [column_of[f] for f in fields if f in column_of]

    h = [header[i] for i in picked]
    d = [[row[i] for i in picked] for row in data]
    return (h, d)
+
def write_features(filename, header, data, fields):
    """Write (header, data) to filename in CSV format, with selected fields.

    The columns are first narrowed with filter_fields(), so names in
    ``fields`` that are missing from ``header`` are dropped. When
    ``filename`` is empty or None the CSV is written to standard output
    instead of a file.

    Args:
        filename (str): The output filename.
        header ([str]): The header.
        data ([[float]]): The data, with the same number of columns as header.
        fields ([str]): The fields that need to be written.

    Examples:
        >>> write_features(None, ['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
        b,a
        1,0
        4,3
    """
    (header, data) = filter_fields(header, data, fields)

    if filename:
        # Close the file explicitly: this function is called once per
        # feature and per group, and an unclosed handle is only flushed
        # whenever the garbage collector gets to it.
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(data)
    else:
        # stdout is not ours to close; a plain "\n" terminator keeps the
        # docstring example output exact.
        writer = csv.writer(sys.stdout, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(data)
+
def getGroups(header):
    """Return groups of header fields

    Most groups are fixed lists of feature names. Two of them,
    'mozScores' and 'noText', are selected from ``header`` by substring.

    Args:
        header ([str]): Feature names used to derive the substring-based
            groups.

    Returns:
        dict of name (str): fields ([str])
    """
    path_fields = [
        'forum',
        'index',
        'search',
        'view',
        'archive',
        'asp',
        'phpbb',
        'php',
        'pathLength',
        'domain',
        'pathComponents',
        'slugDetector',
        'pathNumbers',
        'lastSegmentLength',
    ]
    num_element_fields = [
        'numElements',
        'numAnchors',
        'anchorRatio',
        'numForms',
        'numTextInput',
        'numPasswordInput',
        'numPPRE',
    ]
    visible_element_fields = [
        'visibleElements',
        'visibleAnchors',
        'visiblePPRE',
        'visibleAnchorOverPPre',
    ]
    entry_fields = (
        ['numSection', 'numSection2', 'numSection3'] +
        ['numArticle', 'numArticle2', 'numArticle3'] +
        ['numEntries', 'numEntries2', 'numEntries3'] +
        ['numH1', 'numH2', 'numH3', 'numH4'] +
        ['headCountSum', 'headCountMax', 'entryCountSum', 'entryCountMax']
    )

    # Building blocks shared by the two "v1" groups.
    v1_count_fields = ['formCount', 'anchorCount', 'elementCount', 'anchorRatio']
    v1_moz_fields = ['mozScore', 'mozScoreAllSqrt', 'mozScoreAllLinear']

    # A header field containing any of these substrings is excluded
    # from the 'noText' group.
    text_markers = ('inner', 'Content', 'WordCount')

    groups = {}
    groups['path'] = path_fields
    groups['numElement'] = num_element_fields
    groups['visibleElement'] = visible_element_fields
    groups['entries'] = entry_fields
    groups['v1'] = ['openGraph'] + path_fields + v1_count_fields + v1_moz_fields
    groups['v1NoPath'] = ['openGraph'] + v1_count_fields + v1_moz_fields
    groups['allElement'] = num_element_fields + visible_element_fields + entry_fields
    groups['mozScores'] = [name for name in header if 'moz' in name]
    groups['noText'] = [
        name for name in header
        if not any(marker in name for marker in text_markers)
    ]
    return groups
def _load_json(path):
    """Parse and return the JSON document stored at ``path``."""
    with open(path) as infile:
        return json.load(infile)


def _to_row(label, entry):
    """Build one numeric row: the label followed by the entry's feature values.

    ``entry['features']`` is a flat [name, value, name, value, ...] list, so
    the values sit at the odd positions. A real list is returned (not a lazy
    ``map`` object) because filter_fields() indexes into each row.
    """
    return [float(v) for v in [label] + entry['features'][1::2]]


def _merge_marked(features, marked):
    """Label feature entries using manually marked entries, matched by url.

    Meaning of the 'good' value in a marked entry:
      -1 error
       0 bad
       1 good
       2 good w/error
    Entries without 'good', or with a negative value, are ignored. The label
    written is 0 for 'good' == 0 and 1 otherwise.
    """
    marked_map = {}
    for m in marked:
        if 'good' not in m or m['good'] < 0:
            continue
        marked_map[m['url'].strip()] = m
    print("Loaded %d labeled entries" % len(marked_map))

    merged = []
    for f in features:
        url = f['url']
        if url not in marked_map:
            continue
        merged.append(_to_row(0 if marked_map[url]['good'] == 0 else 1, f))
    print("Merged %d entries" % len(merged))
    return merged


def _merge_distilled(features, distilled):
    """Label feature entries using features of the distilled content.

    Distilled entries whose 'innerTextLength' is 0 are ignored. A feature
    entry is kept only if its url has a distilled entry, its native
    'isMobileFriendly' feature is not 1, and its native 'distillable' value
    is 1. The label is 0 when the distilled 'innerTextLength' is below 1000
    and 1 otherwise.
    """
    distilled_map = {}
    for m in distilled:
        flat = m['features']
        # Turn the flat [name, value, ...] list into a name -> value dict.
        feature = dict(zip(flat[::2], flat[1::2]))
        if feature['innerTextLength'] == 0:
            continue
        distilled_map[m['url'].strip()] = feature
    print("Loaded %d distilled entries" % len(distilled_map))

    merged = []
    for f in features:
        url = f['url']
        if url not in distilled_map:
            continue
        if f['native']['features']['isMobileFriendly'] == 1:
            continue
        if f['native']['distillable'] != 1:
            continue
        long_enough = distilled_map[url]['innerTextLength'] >= 1000
        merged.append(_to_row(1 if long_enough else 0, f))
    print("Merged %d entries" % len(merged))
    return merged


def main(argv):
    """Merge derived features with labels and write the CSV datasets.

    Exactly one of --marked or --distilled selects where the 'good' label
    comes from. Besides the full dataset written to --out, one CSV is
    written per single feature and one per feature group, all named after
    --out with its extension removed.

    Args:
        argv ([str]): Command-line arguments, without the program name.

    Returns:
        int: 0 on success, 1 if the features file contains no entries.

    Raises:
        SystemExit: with status 1 if not exactly one of --marked or
            --distilled is given (argparse also exits on invalid arguments).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True, help="filename of output")
    parser.add_argument('--marked', help="filename of marked output")
    parser.add_argument('--distilled', help="filename of derived features of distilled content")
    parser.add_argument('--features', required=True, help="filename of derived features")
    options = parser.parse_args(argv)

    # Both missing or both present is an error.
    if (options.marked is None) == (options.distilled is None):
        print('Use exactly one of --marked or --distilled.')
        # The original called os.exit(), which does not exist.
        sys.exit(1)

    features = _load_json(options.features)
    if not features:
        # The header is taken from the first entry, so there is nothing
        # meaningful to write without at least one.
        print('No entries found in %s.' % options.features)
        return 1

    if options.marked is not None:
        merged = _merge_marked(features, _load_json(options.marked))
    else:
        merged = _merge_distilled(features, _load_json(options.distilled))

    # Feature names sit at the even positions of the flat features list.
    feature_headers = [str(name) for name in features[0]['features'][::2]]
    header = ['good'] + feature_headers

    write_features(options.out, header, merged, header)

    # write datasets with a single feature
    outbase = os.path.splitext(options.out)[0]
    for s in feature_headers:
        print('Single feature: %s' % s)
        write_features('%s-feature-%s.csv' % (outbase, s), header, merged, ['good', s])

    # write datasets with feature groups
    for (name, g) in getGroups(feature_headers).items():
        print('Feature group: %s' % name)
        write_features('%s-group-%s.csv' % (outbase, name), header, merged, ['good'] + g)

    return 0
# Script entry point: pass the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
-
« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698