OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright 2016 The Chromium Authors. All rights reserved. | 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 import argparse | 6 import argparse |
7 import csv | 7 import csv |
8 import json | 8 import json |
9 import marshal | 9 import marshal |
10 import os | 10 import os |
11 import shutil | 11 import shutil |
12 import math | 12 import math |
13 import sys | 13 import sys |
14 import re | 14 import re |
15 import urlparse | 15 import urlparse |
16 | 16 |
def CountMatches(s, p):
  """Return how many non-overlapping matches of regex |p| occur in |s|.

  Args:
    s: The string to scan.
    p: A regular expression pattern accepted by re.findall().

  Returns:
    The number of matches as an int (0 when nothing matches).
  """
  return len(re.findall(p, s))
19 | 19 |
def WordCount(s):
  r"""Return the number of words in |s|.

  A word is any maximal run of characters matching the regex \w+.
  """
  return CountMatches(s, r'\w+')
22 | 22 |
def GetLastSegment(path):
  """Return the final '/'-separated segment of |path|.

  A single trailing slash is kept as part of the segment, so '/a/b/' yields
  'b/' and '/a/b/c' yields 'c'. An empty |path| yields ''.
  """
  # The pattern can match the empty string at the very end of |path|, so
  # re.search() always finds a match and .group(0) is safe to call.
  return re.search(r'[^/]*/?$', path).group(0)
25 | 25 |
def CalcDerivedFeatures(index, raw):
  """Compute the derived feature list for one entry from its raw features.

  Unpacks the individual raw fields and forwards them, together with the
  whole |raw| dict, to _CalcDerivedFeatures().

  Args:
    index: Index of the entry; forwarded unchanged.
    raw: Dict of raw features. It must contain every key read below; a
        missing key raises KeyError.

  Returns:
    The value returned by _CalcDerivedFeatures().
  """
  return _CalcDerivedFeatures(
      index,
      raw,
      raw['opengraph'],
      raw['url'],
      raw['title'],
      raw['numElements'],
      raw['numAnchors'],
      raw['numForms'],
      raw['numPPRE'],
      raw['visibleElements'],
      raw['visiblePPRE'],
      raw['innerText'],
      raw['textContent'],
      raw['innerHTML'],
      raw['numTextInput'],
      raw['numPasswordInput'],
  )
| 45 |
def _CalcDerivedFeatures(index, raw, opengraph, url, title, numElements,
                         numAnchors, numForms, numPPRE, visibleElements,
                         visiblePPRE, innerText, textContent, innerHTML,
                         numText, numPassword):
  """Build the flat derived-feature list for one entry.

  Args:
    index: Index of the entry; emitted as 'id' and fed to math.sin().
    raw: Dict of raw features; every key containing 'mozScore' or 'num' is
        appended verbatim after the derived features.
    opengraph: Value emitted as the 'openGraph' feature.
    url: Page URL; only its path component is inspected.
    title: Accepted for interface compatibility; not used.
    numElements, numAnchors, numForms, numPPRE: Element counts.
    visibleElements, visiblePPRE: Visible-element counts.
    innerText, textContent, innerHTML: Page text in three forms.
    numText, numPassword: Emitted as 'textCount' and 'passwordCount'.

  Returns:
    A flat list alternating feature name and feature value:
    [name0, value0, name1, value1, ...].
  """
  path = urlparse.urlparse(url).path

  # NOTE(review): presumably these are unicode objects under Python 2, so
  # after encoding all lengths and regex counts below operate on UTF-8
  # bytes -- confirm against the producer of the raw features.
  path = path.encode('utf-8')
  innerText = innerText.encode('utf-8')
  textContent = textContent.encode('utf-8')
  innerHTML = innerHTML.encode('utf-8')

  innerTextWords = WordCount(innerText)
  textContentWords = WordCount(textContent)
  innerHTMLWords = WordCount(innerHTML)

  # Every ratio divides by max(1, denominator) to avoid division by zero.
  features = [
    'id', index,
    'sin', math.sin(index),
    'openGraph', opengraph,

    'forum', 'forum' in path,
    'index', 'index' in path,
    'search', 'search' in path,
    'view', 'view' in path,
    'archive', 'archive' in path,
    'asp', '.asp' in path,
    'phpbb', 'phpbb' in path,
    'php', path.endswith('.php'),
    'pathLength', len(path),
    'domain', len(path) < 2,
    'pathComponents', CountMatches(path, r'\/.'),
    'slugDetector', CountMatches(path, r'[^\w/]'),
    'pathNumbers', CountMatches(path, r'\d+'),
    'lastSegmentLength', len(GetLastSegment(path)),

    'visibleRatio', float(visibleElements) / max(1, numElements),
    'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
    'PPRERatio', float(numPPRE) / max(1, numElements),
    'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),

    'innerTextLength', len(innerText),
    'textContentLength', len(textContent),
    'innerHtmlLength', len(innerHTML),
    'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
    'textContentLengthRatio',
        float(len(textContent)) / max(1, len(innerHTML)),
    'innerTexttextContentLengthRatio',
        float(len(innerText)) / max(1, len(textContent)),

    'innerTextWordCount', innerTextWords,
    'textContentWordCount', textContentWords,
    'innerhtmlWordCount', innerHTMLWords,
    'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
    'textContentWordCountRatio',
        float(textContentWords) / max(1, innerHTMLWords),
    'innerTexttextContentWordCountRatio',
        float(innerTextWords) / max(1, textContentWords),

    'textCount', numText,
    'passwordCount', numPassword,
    'formCount', numForms,
    'anchorCount', numAnchors,
    'elementCount', numElements,
    'anchorRatio', float(numAnchors) / max(1, numElements),
  ]

  # Pass selected raw features through unchanged; sorting the keys keeps
  # the output order independent of dict iteration order.
  for k in sorted(raw):
    if 'mozScore' in k or 'num' in k:
      features.extend([k, raw[k]])

  return features
65 | 110 |
def main(argv):
  """Replace each entry's raw features with derived features; write JSON.

  Args:
    argv: Command-line arguments without the program name. Requires
        --core (path of the marshal-encoded input) and --out (path of the
        JSON file to create).

  Returns:
    0 on success.

  Raises:
    Exception: if the --out path already exists, so that a previous
        result is never overwritten.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument('--out', required=True)
  parser.add_argument('--core', required=True)
  options = parser.parse_args(argv)

  if os.path.exists(options.out):
    raise Exception('exists: ' + options.out)

  # marshal is a binary format, so the file must be opened in binary mode.
  # NOTE: marshal.load() is not safe on untrusted data; only feed this
  # script files produced by a trusted tool.
  with open(options.core, 'rb') as core_file:
    core = marshal.load(core_file)

  for entry in core:
    # The parenthesized form prints the same text under Python 2.
    print('processing %d' % (entry['index']))
    entry['features'] = CalcDerivedFeatures(entry['index'], entry['features'])

  with open(options.out, 'w') as outfile:
    json.dump(core, outfile, indent=1)

  return 0
106 | 134 |
# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))
109 | 137 |
OLD | NEW |