OLD | NEW |
| (Empty) |
1 #!/usr/bin/env python | |
2 # Copyright 2014 The Chromium Authors. All rights reserved. | |
3 # Use of this source code is governed by a BSD-style license that can be | |
4 # found in the LICENSE file. | |
5 | |
6 import argparse | |
7 import csv | |
8 import json | |
9 import os | |
10 import shutil | |
11 import sys | |
12 import re | |
13 import urlparse | |
14 | |
def CountMatches(s, p):
    """Return how many non-overlapping matches of regex p occur in s."""
    return sum(1 for _ in re.finditer(p, s))
17 | |
def WordCount(s):
    """Return the number of runs of word characters (regex \\w+) in s."""
    words = re.findall(r'\w+', s)
    return len(words)
20 | |
def GetLastSegment(path):
    """Return the final segment of a slash-separated path.

    The segment is the trailing run of non-slash characters, plus one
    trailing slash when the path ends with one (e.g. '/a/b/' -> 'b/').
    An empty path yields an empty string.

    Args:
        path: The path string to inspect.

    Returns:
        The matched trailing segment as a string.
    """
    # Raw string: '\/' is not a valid string escape, so a plain literal only
    # works by accident and triggers warnings on newer interpreters. The
    # pattern handed to the regex engine is unchanged.
    # Every part of the pattern is optional, so the search always matches.
    return re.search(r'[^/]*\/?$', path).group(0)
23 | |
def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
    """Derive a flat list of named features from raw page measurements.

    The path component of the URL and the three text inputs are UTF-8 encoded
    first; all substring checks, lengths and word counts below are taken on
    those encoded strings.

    Returns:
        A flat list alternating feature name and feature value, i.e.
        [name0, value0, name1, value1, ...], in a fixed order.
    """
    def Ratio(numerator, denominator):
        # The denominator is clamped to at least 1 so an empty page or empty
        # text never causes a division by zero.
        return float(numerator) / max(1, denominator)

    encodedPath = urlparse.urlparse(url).path.encode('utf-8')
    encodedInnerText = innerText.encode('utf-8')
    encodedTextContent = textContent.encode('utf-8')
    encodedInnerHTML = innerHTML.encode('utf-8')

    innerTextWords = WordCount(encodedInnerText)
    textContentWords = WordCount(encodedTextContent)
    innerHTMLWords = WordCount(encodedInnerHTML)

    innerTextLength = len(encodedInnerText)
    textContentLength = len(encodedTextContent)
    innerHTMLLength = len(encodedInnerHTML)

    namedFeatures = [
        ('opengraph', opengraph),
        # URL path keyword and shape features.
        ('forum', 'forum' in encodedPath),
        ('index', 'index' in encodedPath),
        ('view', 'view' in encodedPath),
        ('asp', '.asp' in encodedPath),
        ('phpbb', 'phpbb' in encodedPath),
        ('php', encodedPath.endswith('.php')),
        ('pathlength', len(encodedPath)),
        ('domain', len(encodedPath) < 2),
        ('pathcomponents', CountMatches(encodedPath, r'\/.')),
        ('slugdetector', CountMatches(encodedPath, r'[^\w/]')),
        ('pathnumbers', CountMatches(encodedPath, r'\d+')),
        ('lastSegmentLength', len(GetLastSegment(encodedPath))),
        # Element counts passed in by the caller.
        ('formcount', numForms),
        ('anchorcount', numAnchors),
        ('elementcount', numElements),
        ('anchorratio', Ratio(numAnchors, numElements)),
        # Length-based text features.
        ('innertextlength', innerTextLength),
        ('textcontentlength', textContentLength),
        ('innerhtmllength', innerHTMLLength),
        ('innertextlengthratio', Ratio(innerTextLength, innerHTMLLength)),
        ('textcontentlengthratio', Ratio(textContentLength, innerHTMLLength)),
        ('innertexttextcontentlengthratio',
         Ratio(innerTextLength, textContentLength)),
        # Word-count-based text features.
        ('innertextwordcount', innerTextWords),
        ('textcontentwordcount', textContentWords),
        ('innerhtmlwordcount', innerHTMLWords),
        ('innertextwordcountratio', Ratio(innerTextWords, innerHTMLWords)),
        ('textcontentwordcountratio', Ratio(textContentWords, innerHTMLWords)),
        ('innertexttextcontentwordcountratio',
         Ratio(innerTextWords, textContentWords)),
    ]

    # Flatten the (name, value) pairs into the alternating list form.
    flattened = []
    for featureName, featureValue in namedFeatures:
        flattened.append(featureName)
        flattened.append(featureValue)
    return flattened
66 | |
def main(argv):
    """Read raw features from --core, derive features, and write them to --out.

    Args:
        argv: Command-line arguments, excluding the program name.

    Returns:
        0 on success.

    Raises:
        Exception: If the --out path already exists.
    """
    argParser = argparse.ArgumentParser()
    argParser.add_argument('--out', required=True)
    argParser.add_argument('--core', required=True)
    options = argParser.parse_args(argv)

    # Refuse to overwrite an existing output file.
    if os.path.exists(options.out):
        raise Exception('exists: ' + options.out)

    with open(options.core) as coreFile:
        entries = json.load(coreFile)

    # Raw feature keys, in the positional order CalcDerivedFeatures expects.
    rawKeys = (
        'opengraph',
        'url',
        'numElements',
        'numAnchors',
        'numForms',
        'innerText',
        'textContent',
        'innerHTML',
    )

    # Replace each entry's raw feature dict with the derived feature list.
    for entry in entries:
        raw = entry['features']
        entry['features'] = CalcDerivedFeatures(*[raw[key] for key in rawKeys])

    with open(options.out, 'w') as outFile:
        json.dump(entries, outFile, indent=1)

    return 0
96 | |
if __name__ == '__main__':
    # Script entry point: forward the CLI arguments (without the program
    # name) to main() and use its return value as the process exit status.
    exitStatus = main(sys.argv[1:])
    sys.exit(exitStatus)
99 | |
OLD | NEW |