heuristics/distillable/write_features_csv.py - Issue 1808503002: Update distillability modeling scripts to predict long articles

Unified Diff: heuristics/distillable/write_features_csv.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible

Patch Set: update docs Created 4 years, 7 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: heuristics/distillable/write_features_csv.py

diff --git a/heuristics/distillable/write_features_csv.py b/heuristics/distillable/write_features_csv.py

index 17e1fd08e58984af7ddb07480b4673b51026feb5..a366858e929ab6f163b77c27e2177725432bd7f7 100755

--- a/heuristics/distillable/write_features_csv.py

+++ b/heuristics/distillable/write_features_csv.py

@@ -9,52 +9,265 @@ import json

import os

import shutil

import sys

+import unittest

def filter_fields(header, data, fields):
    """Filter (header, data) down to the selected header fields.

    Columns are emitted in the order given by ``fields``, not in the order
    they appear in ``header``. Names in ``fields`` that are absent from
    ``header`` are skipped silently (missing fields are acceptable).

    Args:
        header ([str]): The header.
        data ([[float]]): The data, with the same number of columns as header.
        fields ([str]): The fields that need to be kept.

    Returns:
        (header ([str]), data ([[float]]))

    Examples:
        >>> filter_fields(['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
        (['b', 'a'], [[1, 0], [4, 3]])
    """
    # Build the name -> column lookup once instead of scanning the header for
    # every requested field. setdefault keeps the first occurrence of a
    # repeated name, which is what list.index() would return.
    column_of = {}
    for position, name in enumerate(header):
        column_of.setdefault(name, position)

    picked = [column_of[f] for f in fields if f in column_of]

    h = [header[i] for i in picked]
    d = [[row[i] for i in picked] for row in data]
    return (h, d)

def write_features(filename, header, data, fields):
    """Write (header, data) to filename in CSV format, with selected fields.

    The columns are first narrowed with filter_fields(), so fields that are
    not present in the header are dropped from the output.

    Args:
        filename (str): The output filename. A falsy value (e.g. None) writes
            to stdout instead, using "\\n" as the line terminator.
        header ([str]): The header.
        data ([[float]]): The data, with the same number of columns as header.
        fields ([str]): The fields that need to be written.

    Examples:
        >>> write_features(None, ['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
        b,a
        1,0
        4,3
    """
    (header, data) = filter_fields(header, data, fields)
    if filename:
        # Close the file deterministically so the rows are flushed to disk
        # before the function returns, instead of relying on garbage
        # collection of an anonymous file object.
        with open(filename, 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(header)
            writer.writerows(data)
    else:
        writer = csv.writer(sys.stdout, lineterminator="\n")
        writer.writerow(header)
        writer.writerows(data)

def getGroups(header):
    """Return named groups of header fields.

    Most groups are fixed lists of feature names; 'mozScores' and 'noText'
    are computed from the given header.

    Args:
        header ([str]): The feature names to select the dynamic groups from.

    Returns:
        dict of name (str): fields ([str])
    """
    groupPath = [
        'forum',
        'index',
        'search',
        'view',
        'archive',
        'asp',
        'phpbb',
        'php',
        'pathLength',
        'domain',
        'pathComponents',
        'slugDetector',
        'pathNumbers',
        'lastSegmentLength',
    ]

    groupNumElement = [
        'numElements',
        'numAnchors',
        'anchorRatio',
        'numForms',
        'numTextInput',
        'numPasswordInput',
        'numPPRE',
    ]

    groupVisibleElement = [
        'visibleElements',
        'visibleAnchors',
        'visiblePPRE',
        'visibleAnchorOverPPre',
    ]

    groupEntries = [
        'numSection',
        'numSection2',
        'numSection3',
        'numArticle',
        'numArticle2',
        'numArticle3',
        'numEntries',
        'numEntries2',
        'numEntries3',
        'numH1',
        'numH2',
        'numH3',
        'numH4',
        'headCountSum',
        'headCountMax',
        'entryCountSum',
        'entryCountMax',
    ]

    # The v1 fields that follow the path fields. Both 'v1' and 'v1NoPath' are
    # derived from this list (and from groupPath) so the two groups cannot
    # drift apart.
    groupV1Tail = [
        'formCount',
        'anchorCount',
        'elementCount',
        'anchorRatio',
        'mozScore',
        'mozScoreAllSqrt',
        'mozScoreAllLinear',
    ]

    groups = {}
    groups['path'] = groupPath
    groups['numElement'] = groupNumElement
    groups['visibleElement'] = groupVisibleElement
    groups['entries'] = groupEntries
    groups['v1'] = ['openGraph'] + groupPath + groupV1Tail
    groups['v1NoPath'] = ['openGraph'] + groupV1Tail
    groups['allElement'] = groupNumElement + groupVisibleElement + groupEntries

    # Groups selected by substring match against the actual header.
    groups['mozScores'] = [f for f in header if 'moz' in f]
    groups['noText'] = [
        f for f in header
        if not ('inner' in f or 'Content' in f or 'WordCount' in f)
    ]
    return groups

def _merge_marked(marked_filename, features):
    """Label feature rows using a file of manually marked entries.

    Returns a list of rows, each [label] + feature values, all as floats.
    """
    with open(marked_filename) as markedin:
        marked = json.load(markedin)

    markedMap = dict()
    # good:
    #   -1 error
    #    0 bad
    #    1 good
    #    2 good w/error
    for m in marked:
        if 'good' not in m:
            continue
        if m['good'] < 0:
            continue
        markedMap[m['url'].strip()] = m
    print("Loaded %d labeled entries" % (len(markedMap)))

    merged = []
    for f in features:
        # NOTE(review): keys of markedMap are stripped, but this URL is not;
        # confirm the feature URLs never carry surrounding whitespace.
        url = f['url']
        if url not in markedMap:
            continue
        label = 0 if markedMap[url]['good'] == 0 else 1
        # f['features'] alternates name, value; odd positions are the values.
        merged.append([float(v) for v in [label] + f['features'][1::2]])
    print("Merged %d entries" % (len(merged)))
    return merged


def _merge_distilled(distilled_filename, features):
    """Label feature rows using derived features of the distilled content.

    Returns a list of rows, each [label] + feature values, all as floats.
    """
    with open(distilled_filename) as distilledin:
        distilled = json.load(distilledin)

    distilledMap = dict()
    for m in distilled:
        # Turn the alternating [name, value, ...] list into a dict.
        flat = m['features']
        feature = dict(zip(flat[::2], flat[1::2]))
        if feature['innerTextLength'] == 0:
            continue
        m['features'] = feature
        distilledMap[m['url'].strip()] = m
    print("Loaded %d distilled entries" % (len(distilledMap)))

    merged = []
    for f in features:
        url = f['url']
        if url not in distilledMap:
            continue
        if f['native']['features']['isMobileFriendly'] == 1:
            continue
        if f['native']['distillable'] != 1:
            continue
        feature = distilledMap[url]['features']
        # Entries whose distilled text is shorter than 1000 get label 0.
        label = 0 if feature['innerTextLength'] < 1000 else 1
        merged.append([float(v) for v in [label] + f['features'][1::2]])
    print("Merged %d entries" % (len(merged)))
    return merged


def main(argv):
    """Merge labels with derived features and write the CSV datasets.

    Writes the full dataset to --out, then one CSV per single feature
    ('<out base>-feature-<name>.csv') and one per feature group
    ('<out base>-group-<name>.csv').

    Args:
        argv ([str]): Command-line arguments, without the program name.

    Returns:
        int: 0 on success, 1 if not exactly one of --marked / --distilled
        was given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True, help="filename of output")
    parser.add_argument('--marked', help="filename of marked output")
    parser.add_argument('--distilled', help="filename of derived features of distilled content")
    parser.add_argument('--features', required=True, help="filename of derived features")
    options = parser.parse_args(argv)

    # Both missing or both present is an error. The os module has no exit()
    # function, so report the failure through the return value, which the
    # script entry point hands to sys.exit().
    if (options.marked is None) == (options.distilled is None):
        print('Use exactly one of --marked or --distilled.')
        return 1

    with open(options.features) as featuresin:
        features = json.load(featuresin)

    if options.marked is not None:
        merged = _merge_marked(options.marked, features)
    else:
        merged = _merge_distilled(options.distilled, features)

    # Even positions of the alternating list hold the feature names.
    feature_headers = [str(name) for name in features[0]['features'][::2]]
    header = ['good'] + feature_headers
    write_features(options.out, header, merged, header)

    # write datasets with a single feature
    outbase = os.path.splitext(options.out)[0]
    for s in feature_headers:
        print('Single feature: %s' % s)
        write_features('%s-feature-%s.csv' % (outbase, s), header, merged, ['good', s])

    # write datasets with feature groups
    for (name, g) in getGroups(feature_headers).items():
        print('Feature group: %s' % name)
        write_features('%s-group-%s.csv' % (outbase, name), header, merged, ['good'] + g)

    return 0

if __name__ == '__main__':
    # Run the tool with the command-line arguments (program name dropped) and
    # use main()'s return value as the process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

« no previous file with comments | « heuristics/distillable/server.py ('k') | install-build-deps.sh » ('j') | no next file with comments »