Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(42)

Side by Side Diff: heuristics/distillable/calculate_derived_features.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/check_derived_features.py » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 #!/usr/bin/env python 1 #!/usr/bin/env python
2 # Copyright 2016 The Chromium Authors. All rights reserved. 2 # Copyright 2016 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be 3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file. 4 # found in the LICENSE file.
5 5
6 import argparse 6 import argparse
7 import csv 7 import csv
8 import json 8 import json
9 import marshal 9 import marshal
10 import os 10 import os
11 import shutil 11 import shutil
12 import math 12 import math
13 import sys 13 import sys
14 import re 14 import re
15 import urlparse 15 import urlparse
16 16
17 def CountMatches(s, p): 17 def CountMatches(s, p):
18 return len(re.findall(p, s)) 18 return len(re.findall(p, s))
19 19
20 def WordCount(s): 20 def WordCount(s):
21 return CountMatches(s, r'\w+') 21 return CountMatches(s, r'\w+')
22 22
23 def GetLastSegment(path): 23 def GetLastSegment(path):
24 return re.search('[^/]*\/?$', path).group(0) 24 return re.search('[^/]*\/?$', path).group(0)
25 25
26 def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE, 26 def CalcDerivedFeatures(index, raw):
27 innerText, textContent, innerHTML, numText, numPassword, mozScores): 27 return _CalcDerivedFeatures(
28 index,
29 raw,
30 raw['opengraph'],
31 raw['url'],
32 raw['title'],
33 raw['numElements'],
34 raw['numAnchors'],
35 raw['numForms'],
36 raw['numPPRE'],
37 raw['visibleElements'],
38 raw['visiblePPRE'],
39 raw['innerText'],
40 raw['textContent'],
41 raw['innerHTML'],
42 raw['numTextInput'],
43 raw['numPasswordInput']
44 )
45
46 def _CalcDerivedFeatures(index, raw, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
47 innerText, textContent, innerHTML, numText, numPassword):
28 path = urlparse.urlparse(url).path 48 path = urlparse.urlparse(url).path
29 49
30 path = path.encode('utf-8') 50 path = path.encode('utf-8')
31 innerText = innerText.encode('utf-8') 51 innerText = innerText.encode('utf-8')
32 textContent = textContent.encode('utf-8') 52 textContent = textContent.encode('utf-8')
33 innerHTML = innerHTML.encode('utf-8') 53 innerHTML = innerHTML.encode('utf-8')
34 54
35 innerTextWords = WordCount(innerText) 55 innerTextWords = WordCount(innerText)
36 textContentWords = WordCount(textContent) 56 textContentWords = WordCount(textContent)
37 innerHTMLWords = WordCount(innerHTML) 57 innerHTMLWords = WordCount(innerHTML)
38 return [ 58 features = [
59 'id', index,
60 'sin', math.sin(index),
39 'openGraph', opengraph, 61 'openGraph', opengraph,
40 62
41 'forum', 'forum' in path, 63 'forum', 'forum' in path,
42 'index', 'index' in path, 64 'index', 'index' in path,
43 'search', 'search' in path, 65 'search', 'search' in path,
44 'view', 'view' in path, 66 'view', 'view' in path,
45 'archive', 'archive' in path, 67 'archive', 'archive' in path,
46 'asp', '.asp' in path, 68 'asp', '.asp' in path,
47 'phpbb', 'phpbb' in path, 69 'phpbb', 'phpbb' in path,
48 'php', path.endswith('.php'), 70 'php', path.endswith('.php'),
49 'pathLength', len(path), 71 'pathLength', len(path),
50 'domain', len(path) < 2, 72 'domain', len(path) < 2,
51 'pathComponents', CountMatches(path, r'\/.'), 73 'pathComponents', CountMatches(path, r'\/.'),
52 'slugDetector', CountMatches(path, r'[^\w/]'), 74 'slugDetector', CountMatches(path, r'[^\w/]'),
53 'pathNumbers', CountMatches(path, r'\d+'), 75 'pathNumbers', CountMatches(path, r'\d+'),
54 'lastSegmentLength', len(GetLastSegment(path)), 76 'lastSegmentLength', len(GetLastSegment(path)),
55 77
78 'visibleRatio', float(visibleElements) / max(1, numElements),
79 'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
80 'PPRERatio', float(numPPRE) / max(1, numElements),
81 'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),
82
83 'innerTextLength', len(innerText),
84 'textContentLength', len(textContent),
85 'innerHtmlLength', len(innerHTML),
86 'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
87 'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
88 'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),
89
90 'innerTextWordCount', innerTextWords,
91 'textContentWordCount', textContentWords,
92 'innerhtmlWordCount', innerHTMLWords,
93 'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
94 'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
95 'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),
96
97 'textCount', numText,
98 'passwordCount', numPassword,
56 'formCount', numForms, 99 'formCount', numForms,
57 'anchorCount', numAnchors, 100 'anchorCount', numAnchors,
58 'elementCount', numElements, 101 'elementCount', numElements,
59 'anchorRatio', float(numAnchors) / max(1, numElements), 102 'anchorRatio', float(numAnchors) / max(1, numElements),
103 ]
60 104
61 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)), 105 for k in sorted(raw):
62 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)), 106 if 'mozScore' in k or 'num' in k:
63 'mozScoreAllLinear', min(mozScores[5], 6000), 107 features += [k, raw[k]]
64 ] 108
109 return features
65 110
66 def main(argv): 111 def main(argv):
67 parser = argparse.ArgumentParser() 112 parser = argparse.ArgumentParser()
68 parser.add_argument('--out', required=True) 113 parser.add_argument('--out', required=True)
69 parser.add_argument('--core', required=True) 114 parser.add_argument('--core', required=True)
70 options = parser.parse_args(argv) 115 options = parser.parse_args(argv)
71 116
72 if os.path.exists(options.out): 117 if os.path.exists(options.out):
73 raise Exception('exists: ' + options.out) 118 raise Exception('exists: ' + options.out)
74 119
75 core = None 120 core = None
76 with open(options.core) as core_file: 121 with open(options.core) as core_file:
77 core = marshal.load(core_file) 122 core = marshal.load(core_file)
78 123
79 for entry in core: 124 for entry in core:
80 features = entry['features'] 125 features = entry['features']
81 print 'processing %d' % (entry['index']) 126 print 'processing %d' % (entry['index'])
82 entry['features'] = CalcDerivedFeatures( 127
83 entry['index'], 128 entry['features'] = CalcDerivedFeatures(entry['index'], features)
84 features['opengraph'],
85 features['url'],
86 features['title'],
87 features['numElements'],
88 features['numAnchors'],
89 features['numForms'],
90 features['numPPRE'],
91 features['visibleElements'],
92 features['visiblePPRE'],
93 features['innerText'],
94 features['textContent'],
95 features['innerHTML'],
96 features['numTextInput'],
97 features['numPasswordInput'],
98 [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
99 features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
100 )
101 129
102 with open(options.out, 'w') as outfile: 130 with open(options.out, 'w') as outfile:
103 json.dump(core, outfile, indent=1) 131 json.dump(core, outfile, indent=1)
104 132
105 return 0 133 return 0
106 134
107 if __name__ == '__main__': 135 if __name__ == '__main__':
108 sys.exit(main(sys.argv[1:])) 136 sys.exit(main(sys.argv[1:]))
109 137
OLDNEW
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/check_derived_features.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698