OLD | NEW |
1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
5 | 5 |
6 import argparse | 6 import argparse |
7 import csv | 7 import csv |
8 import json | 8 import json |
| 9 import marshal |
9 import os | 10 import os |
10 import shutil | 11 import shutil |
| 12 import math |
11 import sys | 13 import sys |
12 import re | 14 import re |
13 import urlparse | 15 import urlparse |
14 | 16 |
15 def CountMatches(s, p): | 17 def CountMatches(s, p): |
16 return len(re.findall(p, s)) | 18 return len(re.findall(p, s)) |
17 | 19 |
18 def WordCount(s): | 20 def WordCount(s): |
19 return CountMatches(s, r'\w+') | 21 return CountMatches(s, r'\w+') |
20 | 22 |
21 def GetLastSegment(path): | 23 def GetLastSegment(path): |
22 return re.search('[^/]*\/?$', path).group(0) | 24 return re.search('[^/]*\/?$', path).group(0) |
23 | 25 |
24 def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, inner
Text, textContent, innerHTML): | 26 def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
umForms, numPPRE, visibleElements, visiblePPRE, |
| 27 innerText, textContent, innerHTML, numText, numPassword, mozScores): |
25 path = urlparse.urlparse(url).path | 28 path = urlparse.urlparse(url).path |
26 | 29 |
27 path = path.encode('utf-8') | 30 path = path.encode('utf-8') |
28 innerText = innerText.encode('utf-8') | 31 innerText = innerText.encode('utf-8') |
29 textContent = textContent.encode('utf-8') | 32 textContent = textContent.encode('utf-8') |
30 innerHTML = innerHTML.encode('utf-8') | 33 innerHTML = innerHTML.encode('utf-8') |
31 | 34 |
32 innerTextWords = WordCount(innerText) | 35 innerTextWords = WordCount(innerText) |
33 textContentWords = WordCount(textContent) | 36 textContentWords = WordCount(textContent) |
34 innerHTMLWords = WordCount(innerHTML) | 37 innerHTMLWords = WordCount(innerHTML) |
35 return [ | 38 return [ |
36 'opengraph', opengraph, | 39 'openGraph', opengraph, |
| 40 |
37 'forum', 'forum' in path, | 41 'forum', 'forum' in path, |
38 'index', 'index' in path, | 42 'index', 'index' in path, |
| 43 'search', 'search' in path, |
39 'view', 'view' in path, | 44 'view', 'view' in path, |
| 45 'archive', 'archive' in path, |
40 'asp', '.asp' in path, | 46 'asp', '.asp' in path, |
41 'phpbb', 'phpbb' in path, | 47 'phpbb', 'phpbb' in path, |
42 'php', path.endswith('.php'), | 48 'php', path.endswith('.php'), |
43 'pathlength', len(path), | 49 'pathLength', len(path), |
44 'domain', len(path) < 2, | 50 'domain', len(path) < 2, |
45 'pathcomponents', CountMatches(path, r'\/.'), | 51 'pathComponents', CountMatches(path, r'\/.'), |
46 'slugdetector', CountMatches(path, r'[^\w/]'), | 52 'slugDetector', CountMatches(path, r'[^\w/]'), |
47 'pathnumbers', CountMatches(path, r'\d+'), | 53 'pathNumbers', CountMatches(path, r'\d+'), |
48 'lastSegmentLength', len(GetLastSegment(path)), | 54 'lastSegmentLength', len(GetLastSegment(path)), |
49 'formcount', numForms, | 55 |
50 'anchorcount', numAnchors, | 56 'formCount', numForms, |
51 'elementcount', numElements, | 57 'anchorCount', numAnchors, |
52 'anchorratio', float(numAnchors) / max(1, numElements), | 58 'elementCount', numElements, |
53 'innertextlength', len(innerText), | 59 'anchorRatio', float(numAnchors) / max(1, numElements), |
54 'textcontentlength', len(textContent), | 60 |
55 'innerhtmllength', len(innerHTML), | 61 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)), |
56 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), | 62 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)), |
57 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), | 63 'mozScoreAllLinear', min(mozScores[5], 6000), |
58 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textCon
tent)), | |
59 'innertextwordcount', innerTextWords, | |
60 'textcontentwordcount', textContentWords, | |
61 'innerhtmlwordcount', innerHTMLWords, | |
62 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), | |
63 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords
), | |
64 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textCon
tentWords), | |
65 ] | 64 ] |
66 | 65 |
67 def main(argv): | 66 def main(argv): |
68 parser = argparse.ArgumentParser() | 67 parser = argparse.ArgumentParser() |
69 parser.add_argument('--out', required=True) | 68 parser.add_argument('--out', required=True) |
70 parser.add_argument('--core', required=True) | 69 parser.add_argument('--core', required=True) |
71 options = parser.parse_args(argv) | 70 options = parser.parse_args(argv) |
72 | 71 |
73 if os.path.exists(options.out): | 72 if os.path.exists(options.out): |
74 raise Exception('exists: ' + options.out) | 73 raise Exception('exists: ' + options.out) |
75 | 74 |
76 core = None | 75 core = None |
77 with open(options.core) as core_file: | 76 with open(options.core) as core_file: |
78 core = json.load(core_file) | 77 core = marshal.load(core_file) |
79 | 78 |
80 for entry in core: | 79 for entry in core: |
81 features = entry['features'] | 80 features = entry['features'] |
| 81 print 'processing %d' % (entry['index']) |
82 entry['features'] = CalcDerivedFeatures( | 82 entry['features'] = CalcDerivedFeatures( |
| 83 entry['index'], |
83 features['opengraph'], | 84 features['opengraph'], |
84 features['url'], | 85 features['url'], |
| 86 features['title'], |
85 features['numElements'], | 87 features['numElements'], |
86 features['numAnchors'], | 88 features['numAnchors'], |
87 features['numForms'], | 89 features['numForms'], |
| 90 features['numPPRE'], |
| 91 features['visibleElements'], |
| 92 features['visiblePPRE'], |
88 features['innerText'], | 93 features['innerText'], |
89 features['textContent'], | 94 features['textContent'], |
90 features['innerHTML']) | 95 features['innerHTML'], |
| 96 features['numTextInput'], |
| 97 features['numPasswordInput'], |
| 98 [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllL
inear'], |
| 99 features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozS
coreFastAllLinear']] |
| 100 ) |
91 | 101 |
92 with open(options.out, 'w') as outfile: | 102 with open(options.out, 'w') as outfile: |
93 json.dump(core, outfile, indent=1) | 103 json.dump(core, outfile, indent=1) |
94 | 104 |
95 return 0 | 105 return 0 |
96 | 106 |
97 if __name__ == '__main__': | 107 if __name__ == '__main__': |
98 sys.exit(main(sys.argv[1:])) | 108 sys.exit(main(sys.argv[1:])) |
99 | 109 |
OLD | NEW |