Index: calculate_derived_features.py |
diff --git a/calculate_derived_features.py b/calculate_derived_features.py |
new file mode 100644 |
index 0000000000000000000000000000000000000000..7b22e0139a37fa9abef1009630a4d34df21f41d5 |
--- /dev/null |
+++ b/calculate_derived_features.py |
@@ -0,0 +1,99 @@ |
+#!/usr/bin/env python |
+# Copyright 2014 The Chromium Authors. All rights reserved. |
+# Use of this source code is governed by a BSD-style license that can be |
+# found in the LICENSE file. |
+ |
import argparse
import csv
import json
import os
import re
import shutil
import sys

try:
  import urlparse  # Python 2
except ImportError:
  from urllib import parse as urlparse  # Python 3
+ |
def CountMatches(s, p):
  """Return the number of non-overlapping matches of regex |p| in |s|.

  |p| and |s| must be the same kind of string (both text or both bytes); the
  re module refuses to mix them on Python 3.
  """
  return len(re.findall(p, s))


def WordCount(s):
  """Return the number of maximal runs of regex word characters in |s|.

  |s| may be text or bytes; the pattern is chosen to match its type.
  """
  pattern = br'\w+' if isinstance(s, bytes) else r'\w+'
  return CountMatches(s, pattern)


def GetLastSegment(path):
  """Return the last segment of |path|, keeping a trailing '/' if present.

  For example '/a/b' gives 'b', '/a/b/' gives 'b/' and '' gives ''. |path| may
  be text or bytes; the result has the same type.
  """
  pattern = br'[^/]*\/?$' if isinstance(path, bytes) else r'[^/]*\/?$'
  # The pattern can match the empty string at the very end of |path|, so
  # search() always finds something and never returns None.
  return re.search(pattern, path).group(0)


def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
  """Compute the derived feature list for one page.

  Args:
    opengraph: value stored unchanged under the 'opengraph' feature.
    url: text URL of the page; only its path component is inspected.
    numElements: element count, used as-is and as the anchor ratio denominator.
    numAnchors: anchor count.
    numForms: form count.
    innerText: text string; measured after UTF-8 encoding.
    textContent: text string; measured after UTF-8 encoding.
    innerHTML: text string; measured after UTF-8 encoding.

  Returns:
    A flat list that alternates feature name and feature value:
    [name0, value0, name1, value1, ...].
  """
  # Everything below works on UTF-8 bytes, so lengths are byte counts and \w
  # only matches ASCII word characters. Literals and patterns applied to these
  # values are therefore bytes as well; text ones raise TypeError on Python 3.
  path = urlparse.urlparse(url).path.encode('utf-8')
  innerText = innerText.encode('utf-8')
  textContent = textContent.encode('utf-8')
  innerHTML = innerHTML.encode('utf-8')

  innerTextWords = WordCount(innerText)
  textContentWords = WordCount(textContent)
  innerHTMLWords = WordCount(innerHTML)

  # Every ratio clamps its denominator to at least 1 so that empty pages do
  # not divide by zero.
  return [
      'opengraph', opengraph,
      'forum', b'forum' in path,
      'index', b'index' in path,
      'view', b'view' in path,
      'asp', b'.asp' in path,
      'phpbb', b'phpbb' in path,
      'php', path.endswith(b'.php'),
      'pathlength', len(path),
      'domain', len(path) < 2,
      'pathcomponents', CountMatches(path, br'\/.'),
      'slugdetector', CountMatches(path, br'[^\w/]'),
      'pathnumbers', CountMatches(path, br'\d+'),
      'lastSegmentLength', len(GetLastSegment(path)),
      'formcount', numForms,
      'anchorcount', numAnchors,
      'elementcount', numElements,
      'anchorratio', float(numAnchors) / max(1, numElements),
      'innertextlength', len(innerText),
      'textcontentlength', len(textContent),
      'innerhtmllength', len(innerHTML),
      'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
      'textcontentlengthratio',
      float(len(textContent)) / max(1, len(innerHTML)),
      'innertexttextcontentlengthratio',
      float(len(innerText)) / max(1, len(textContent)),
      'innertextwordcount', innerTextWords,
      'textcontentwordcount', textContentWords,
      'innerhtmlwordcount', innerHTMLWords,
      'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
      'textcontentwordcountratio',
      float(textContentWords) / max(1, innerHTMLWords),
      'innertexttextcontentwordcountratio',
      float(innerTextWords) / max(1, textContentWords),
  ]
+ |
def main(argv):
  """Replace the raw features of every entry in a JSON file by derived ones.

  Args:
    argv: command-line arguments without the program name. Both '--core'
      (path of the JSON file to read) and '--out' (path of the JSON file to
      write) are required.

  Returns:
    0 once the output file has been written.

  Raises:
    Exception: if the '--out' path already exists.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--out', required=True)
  arg_parser.add_argument('--core', required=True)
  options = arg_parser.parse_args(argv)

  # Checked before any work is done so an earlier result is never overwritten.
  out_path = options.out
  if os.path.exists(out_path):
    raise Exception('exists: ' + out_path)

  with open(options.core) as core_file:
    entries = json.load(core_file)

  # Listed in the positional order CalcDerivedFeatures expects.
  raw_keys = (
      'opengraph',
      'url',
      'numElements',
      'numAnchors',
      'numForms',
      'innerText',
      'textContent',
      'innerHTML',
  )
  for entry in entries:
    raw = entry['features']
    entry['features'] = CalcDerivedFeatures(*[raw[key] for key in raw_keys])

  with open(out_path, 'w') as outfile:
    json.dump(entries, outfile, indent=1)

  return 0
+ |
# Only runs when the file is executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
+ |