Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(720)

Side by Side Diff: calculate_derived_features.py

Issue 1289123002: Merge branch 'master' into heuristics Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 #!/usr/bin/env python
2 # Copyright 2014 The Chromium Authors. All rights reserved.
3 # Use of this source code is governed by a BSD-style license that can be
4 # found in the LICENSE file.
5
6 import argparse
7 import csv
8 import json
9 import os
10 import shutil
11 import sys
12 import re
13 import urlparse
14
def CountMatches(s, p):
    """Return the number of non-overlapping matches of pattern `p` in `s`.

    Args:
      s: The string to search.
      p: A regular expression accepted by `re.findall`.

    Returns:
      The length of the list produced by `re.findall(p, s)`.
    """
    return len(re.findall(p, s))
17
def WordCount(s):
    """Return how many maximal runs of regex word characters `s` contains.

    Args:
      s: The string to scan.

    Returns:
      The number of matches of the word-run pattern in `s`, as counted by
      CountMatches.
    """
    return CountMatches(s, r'\w+')
20
def GetLastSegment(path):
    """Return the final '/'-delimited segment of `path`.

    A single trailing slash, if present, is kept as part of the result
    (for example '/foo/bar/' yields 'bar/'); an empty `path` yields ''.

    Args:
      path: The path string to examine.

    Returns:
      The matched trailing segment of `path`.
    """
    # The pattern can match the empty string at the end of the input, so
    # re.search always finds a match and .group(0) is safe to call.
    return re.search(r'[^/]*/?$', path).group(0)
23
def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
    """Compute the derived feature list for one page.

    Args:
      opengraph: Passed through unchanged as the 'opengraph' feature value.
      url: The page URL; only its path component is examined.
      numElements: Element count, reported as-is and used as the
        denominator of 'anchorratio'.
      numAnchors: Anchor count, reported as-is and used as the numerator of
        'anchorratio'.
      numForms: Form count, reported as-is.
      innerText: Text whose UTF-8 encoded length and word count are measured.
      textContent: Text whose UTF-8 encoded length and word count are
        measured.
      innerHTML: Text whose UTF-8 encoded length and word count are measured.

    Returns:
      A flat list that alternates feature name and feature value:
      [name0, value0, name1, value1, ...].
    """
    path = urlparse.urlparse(url).path

    # Everything below operates on the UTF-8 encoded form, so the length
    # features are lengths of the encoded values.
    # NOTE(review): the file imports the Python 2 module name `urlparse`, so
    # this presumably runs under Python 2, where the encoded values can be
    # searched with the plain string patterns used below -- confirm before
    # porting to Python 3.
    path = path.encode('utf-8')
    innerText = innerText.encode('utf-8')
    textContent = textContent.encode('utf-8')
    innerHTML = innerHTML.encode('utf-8')

    innerTextLength = len(innerText)
    textContentLength = len(textContent)
    innerHTMLLength = len(innerHTML)

    innerTextWords = WordCount(innerText)
    textContentWords = WordCount(textContent)
    innerHTMLWords = WordCount(innerHTML)

    # Ratio denominators are clamped to at least 1 to avoid division by zero.
    elementsDenominator = max(1, numElements)
    innerHTMLLengthDenominator = max(1, innerHTMLLength)
    textContentLengthDenominator = max(1, textContentLength)
    innerHTMLWordsDenominator = max(1, innerHTMLWords)
    textContentWordsDenominator = max(1, textContentWords)

    return [
        'opengraph', opengraph,
        'forum', 'forum' in path,
        'index', 'index' in path,
        'view', 'view' in path,
        'asp', '.asp' in path,
        'phpbb', 'phpbb' in path,
        'php', path.endswith('.php'),
        'pathlength', len(path),
        # True when the path is empty or a single character such as '/'.
        'domain', len(path) < 2,
        'pathcomponents', CountMatches(path, r'\/.'),
        # Counts characters that are neither word characters nor '/'.
        'slugdetector', CountMatches(path, r'[^\w/]'),
        'pathnumbers', CountMatches(path, r'\d+'),
        'lastSegmentLength', len(GetLastSegment(path)),
        'formcount', numForms,
        'anchorcount', numAnchors,
        'elementcount', numElements,
        'anchorratio', float(numAnchors) / elementsDenominator,
        'innertextlength', innerTextLength,
        'textcontentlength', textContentLength,
        'innerhtmllength', innerHTMLLength,
        'innertextlengthratio',
        float(innerTextLength) / innerHTMLLengthDenominator,
        'textcontentlengthratio',
        float(textContentLength) / innerHTMLLengthDenominator,
        'innertexttextcontentlengthratio',
        float(innerTextLength) / textContentLengthDenominator,
        'innertextwordcount', innerTextWords,
        'textcontentwordcount', textContentWords,
        'innerhtmlwordcount', innerHTMLWords,
        'innertextwordcountratio',
        float(innerTextWords) / innerHTMLWordsDenominator,
        'textcontentwordcountratio',
        float(textContentWords) / innerHTMLWordsDenominator,
        'innertexttextcontentwordcountratio',
        float(innerTextWords) / textContentWordsDenominator,
    ]
66
def main(argv):
    """Rewrite each entry's raw features as derived features.

    Loads the JSON file given by --core, replaces every entry's 'features'
    value with the list returned by CalcDerivedFeatures, and writes the
    result as JSON to the path given by --out.

    Args:
      argv: Command-line arguments, excluding the program name.

    Returns:
      0 on success.

    Raises:
      Exception: If the --out path already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--out', required=True)
    parser.add_argument('--core', required=True)
    options = parser.parse_args(argv)

    # Refuse to overwrite an existing output file.
    if os.path.exists(options.out):
        raise Exception('exists: ' + options.out)

    with open(options.core) as core_file:
        core = json.load(core_file)

    for entry in core:
        features = entry['features']
        entry['features'] = CalcDerivedFeatures(
            features['opengraph'],
            features['url'],
            features['numElements'],
            features['numAnchors'],
            features['numForms'],
            features['innerText'],
            features['textContent'],
            features['innerHTML'])

    with open(options.out, 'w') as outfile:
        json.dump(core, outfile, indent=1)

    return 0
96
if __name__ == '__main__':
    # Pass only the arguments after the program name and use main's return
    # value as the process exit status.
    sys.exit(main(sys.argv[1:]))
99
OLDNEW
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698