| Index: heuristics/distillable/write_features_csv.py
|
| diff --git a/heuristics/distillable/write_features_csv.py b/heuristics/distillable/write_features_csv.py
|
| index 17e1fd08e58984af7ddb07480b4673b51026feb5..a366858e929ab6f163b77c27e2177725432bd7f7 100755
|
| --- a/heuristics/distillable/write_features_csv.py
|
| +++ b/heuristics/distillable/write_features_csv.py
|
| @@ -9,52 +9,265 @@ import json
|
| import os
|
| import shutil
|
| import sys
|
| +import unittest
|
| +
|
| +def filter_fields(header, data, fields):
|
| + """Filter (header, data) with selected header fields.
|
| +
|
| + Args:
|
| + header ([str]): The header.
|
| + data ([[float]]): The data, with the same number of columns as header.
|
| + fields ([str]): The fields that need to be written.
|
| +
|
| + Returns:
|
| + (header ([str]), data ([[float]]))
|
| +
|
| + Examples:
|
| + >>> filter_fields(['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
|
| + (['b', 'a'], [[1, 0], [4, 3]])
|
| + """
|
| +
|
| + picked = []
|
| + for f in fields:
|
| + try:
|
| + picked.append(header.index(f))
|
| + except ValueError:
|
| + # OK to have missing values
|
| + pass
|
| +
|
| + h = [header[i] for i in picked]
|
| + d = []
|
| + for e in data:
|
| + d.append([e[i] for i in picked])
|
| + return (h, d)
|
| +
|
| +def write_features(filename, header, data, fields):
|
| + """Write (header, data) to filename in CSV format, with selected fields.
|
| +
|
| + Args:
|
| + filename (str): The output filename.
|
| + header ([str]): The header.
|
| + data ([[float]]): The data, with the same number of columns as header.
|
| + fields ([str]): The fields that need to be written.
|
| +
|
| + Examples:
|
| + >>> write_features(None, ['a','b','c'], [[0,1,2], [3,4,5]], ['d','b','a'])
|
| + b,a
|
| + 1,0
|
| + 4,3
|
| + """
|
| +
|
| + (header, data) = filter_fields(header, data, fields)
|
| +
|
| + if filename:
|
| + writer = csv.writer(open(filename, 'w'))
|
| + else:
|
| + writer = csv.writer(sys.stdout, lineterminator="\n")
|
| +
|
| + writer.writerow(header)
|
| + writer.writerows(data)
|
| +
|
| +def getGroups(header):
|
| + """Return groups of header fields
|
| +
|
| + Returns:
|
| + dict of name (str): fields ([str])
|
| + """
|
| + groups = {}
|
| + groupPath = [
|
| + 'forum',
|
| + 'index',
|
| + 'search',
|
| + 'view',
|
| + 'archive',
|
| + 'asp',
|
| + 'phpbb',
|
| + 'php',
|
| + 'pathLength',
|
| + 'domain',
|
| + 'pathComponents',
|
| + 'slugDetector',
|
| + 'pathNumbers',
|
| + 'lastSegmentLength',
|
| + ]
|
| + groups['path'] = groupPath
|
| +
|
| + groupNumElement = [
|
| + 'numElements',
|
| + 'numAnchors',
|
| + 'anchorRatio',
|
| + 'numForms',
|
| + 'numTextInput',
|
| + 'numPasswordInput',
|
| + 'numPPRE',
|
| + ]
|
| + groups['numElement'] = groupNumElement
|
| +
|
| + groupVisibleElement = [
|
| + 'visibleElements',
|
| + 'visibleAnchors',
|
| + 'visiblePPRE',
|
| + 'visibleAnchorOverPPre',
|
| + ]
|
| + groups['visibleElement'] = groupVisibleElement
|
| +
|
| + groupEntries = [
|
| + 'numSection',
|
| + 'numSection2',
|
| + 'numSection3',
|
| + 'numArticle',
|
| + 'numArticle2',
|
| + 'numArticle3',
|
| + 'numEntries',
|
| + 'numEntries2',
|
| + 'numEntries3',
|
| + 'numH1',
|
| + 'numH2',
|
| + 'numH3',
|
| + 'numH4',
|
| + 'headCountSum',
|
| + 'headCountMax',
|
| + 'entryCountSum',
|
| + 'entryCountMax',
|
| + ]
|
| + groups['entries'] = groupEntries
|
| +
|
| + groupV1 = [
|
| + 'openGraph',
|
| +
|
| + 'forum',
|
| + 'index',
|
| + 'search',
|
| + 'view',
|
| + 'archive',
|
| + 'asp',
|
| + 'phpbb',
|
| + 'php',
|
| + 'pathLength',
|
| + 'domain',
|
| + 'pathComponents',
|
| + 'slugDetector',
|
| + 'pathNumbers',
|
| + 'lastSegmentLength',
|
| +
|
| + 'formCount',
|
| + 'anchorCount',
|
| + 'elementCount',
|
| + 'anchorRatio',
|
| +
|
| + 'mozScore',
|
| + 'mozScoreAllSqrt',
|
| + 'mozScoreAllLinear',
|
| + ]
|
| + groups['v1'] = groupV1
|
| +
|
| + groupV1NoPath = [
|
| + 'openGraph',
|
| +
|
| + 'formCount',
|
| + 'anchorCount',
|
| + 'elementCount',
|
| + 'anchorRatio',
|
| +
|
| + 'mozScore',
|
| + 'mozScoreAllSqrt',
|
| + 'mozScoreAllLinear',
|
| + ]
|
| + groups['v1NoPath'] = groupV1NoPath
|
| +
|
| + groups['allElement'] = groupNumElement + groupVisibleElement + groupEntries
|
| + groups['mozScores'] = [f for f in header if 'moz' in f]
|
| + groups['noText'] = [f for f in header if not ('inner' in f or 'Content' in f or 'WordCount' in f)]
|
| +
|
| + return groups
|
|
|
| def main(argv):
|
| parser = argparse.ArgumentParser()
|
| - parser.add_argument('--out', required=True)
|
| - parser.add_argument('--marked', required=True)
|
| - parser.add_argument('--features', required=True)
|
| + parser.add_argument('--out', required=True, help="filename of output")
|
| + parser.add_argument('--marked', help="filename of marked output")
|
| + parser.add_argument('--distilled', help="filename of derived features of distilled content")
|
| + parser.add_argument('--features', required=True, help="filename of derived features")
|
| options = parser.parse_args(argv)
|
|
|
| - marked = None
|
| - with open(options.marked) as markedin:
|
| - marked = json.load(markedin)
|
| + if (options.marked is None) + (options.distilled is None) != 1:
|
| + print 'Use exactly one of --marked or --distilled.'
|
| + os.exit(1)
|
|
|
| - features = None
|
| with open(options.features) as features:
|
| features = json.load(features)
|
|
|
| - markedMap = dict()
|
| - # good:
|
| - # -1 error
|
| - # 0 bad
|
| - # 1 good
|
| - # 2 good w/error
|
| - for m in marked:
|
| - if not 'good' in m:
|
| - continue
|
| - if m['good'] < 0:
|
| - continue
|
| - markedMap[m['url']] = m
|
| -
|
| - merged = []
|
| - for f in features:
|
| - url = f['url']
|
| - if not url in markedMap:
|
| - continue
|
| - merged.append(map(float, [0 if markedMap[url]['good'] == 0 else 1] + f['features'][1::2]))
|
| -
|
| - header = ['good'] + map(str, features[0]['features'][::2])
|
| -
|
| - with open(options.out, 'w') as csvfile:
|
| - writer = csv.writer(csvfile)
|
| - writer.writerow(header)
|
| - for e in merged:
|
| - writer.writerow(e)
|
| + if options.marked:
|
| + with open(options.marked) as markedin:
|
| + marked = json.load(markedin)
|
| +
|
| + markedMap = dict()
|
| + # good:
|
| + # -1 error
|
| + # 0 bad
|
| + # 1 good
|
| + # 2 good w/error
|
| + for m in marked:
|
| + if not 'good' in m:
|
| + continue
|
| + if m['good'] < 0:
|
| + continue
|
| + markedMap[m['url'].strip()] = m
|
| +
|
| + print "Loaded %d labeled entries" % (len(markedMap))
|
| +
|
| + merged = []
|
| + for f in features:
|
| + url = f['url']
|
| + if not url in markedMap:
|
| + continue
|
| + merged.append(map(float, [0 if markedMap[url]['good'] == 0 else 1] + f['features'][1::2]))
|
| + print "Merged %d entries" % (len(merged))
|
| +
|
| + if options.distilled:
|
| + with open(options.distilled) as markedin:
|
| + marked = json.load(markedin)
|
| +
|
| + markedMap = dict()
|
| + for m in marked:
|
| + feature = m['features']
|
| + feature = dict(zip(feature[::2], feature[1::2]))
|
| + if feature['innerTextLength'] == 0:
|
| + continue
|
| + m['features'] = feature
|
| + markedMap[m['url'].strip()] = m
|
| +
|
| + print "Loaded %d distilled entries" % (len(markedMap))
|
| +
|
| + merged = []
|
| + for f in features:
|
| + url = f['url']
|
| + if not url in markedMap:
|
| + continue
|
| + if f['native']['features']['isMobileFriendly'] == 1:
|
| + continue
|
| + if f['native']['distillable'] != 1:
|
| + continue
|
| + feature = markedMap[url]['features']
|
| + merged.append(map(float, [0 if feature['innerTextLength'] < 1000 else 1] + f['features'][1::2]))
|
| + print "Merged %d entries" % (len(merged))
|
| +
|
| + feature_headers = map(str, features[0]['features'][::2])
|
| + header = ['good'] + feature_headers
|
| +
|
| + write_features(options.out, header, merged, header)
|
| +
|
| + # write datasets with a single feature
|
| + outbase = os.path.splitext(options.out)[0]
|
| + for s in feature_headers:
|
| + print 'Single feature: %s' % s
|
| + write_features('%s-feature-%s.csv' % (outbase, s), header, merged, ['good', s])
|
| +
|
| + # write datasets with feature groups
|
| + for (name, g) in getGroups(feature_headers).iteritems():
|
| + print 'Feature group: %s' % name
|
| + write_features('%s-group-%s.csv' % (outbase, name), header, merged, ['good'] + g)
|
|
|
| return 0
|
|
|
| if __name__ == '__main__':
|
| sys.exit(main(sys.argv[1:]))
|
| -
|
|
|