Index: calculate_derived_features.py |
diff --git a/calculate_derived_features.py b/calculate_derived_features.py |
deleted file mode 100644 |
index 7b22e0139a37fa9abef1009630a4d34df21f41d5..0000000000000000000000000000000000000000 |
--- a/calculate_derived_features.py |
+++ /dev/null |
@@ -1,99 +0,0 @@ |
-#!/usr/bin/env python |
-# Copyright 2014 The Chromium Authors. All rights reserved. |
-# Use of this source code is governed by a BSD-style license that can be |
-# found in the LICENSE file. |
- |
-import argparse |
-import csv |
-import json |
-import os |
-import shutil |
-import sys |
-import re |
-import urlparse |
- |
-def CountMatches(s, p): |
- return len(re.findall(p, s)) |
- |
-def WordCount(s): |
- return CountMatches(s, r'\w+') |
- |
-def GetLastSegment(path): |
- return re.search('[^/]*\/?$', path).group(0) |
- |
-def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML): |
- path = urlparse.urlparse(url).path |
- |
- path = path.encode('utf-8') |
- innerText = innerText.encode('utf-8') |
- textContent = textContent.encode('utf-8') |
- innerHTML = innerHTML.encode('utf-8') |
- |
- innerTextWords = WordCount(innerText) |
- textContentWords = WordCount(textContent) |
- innerHTMLWords = WordCount(innerHTML) |
- return [ |
- 'opengraph', opengraph, |
- 'forum', 'forum' in path, |
- 'index', 'index' in path, |
- 'view', 'view' in path, |
- 'asp', '.asp' in path, |
- 'phpbb', 'phpbb' in path, |
- 'php', path.endswith('.php'), |
- 'pathlength', len(path), |
- 'domain', len(path) < 2, |
- 'pathcomponents', CountMatches(path, r'\/.'), |
- 'slugdetector', CountMatches(path, r'[^\w/]'), |
- 'pathnumbers', CountMatches(path, r'\d+'), |
- 'lastSegmentLength', len(GetLastSegment(path)), |
- 'formcount', numForms, |
- 'anchorcount', numAnchors, |
- 'elementcount', numElements, |
- 'anchorratio', float(numAnchors) / max(1, numElements), |
- 'innertextlength', len(innerText), |
- 'textcontentlength', len(textContent), |
- 'innerhtmllength', len(innerHTML), |
- 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), |
- 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), |
- 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)), |
- 'innertextwordcount', innerTextWords, |
- 'textcontentwordcount', textContentWords, |
- 'innerhtmlwordcount', innerHTMLWords, |
- 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), |
- 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords), |
- 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords), |
- ] |
- |
-def main(argv): |
- parser = argparse.ArgumentParser() |
- parser.add_argument('--out', required=True) |
- parser.add_argument('--core', required=True) |
- options = parser.parse_args(argv) |
- |
- if os.path.exists(options.out): |
- raise Exception('exists: ' + options.out) |
- |
- core = None |
- with open(options.core) as core_file: |
- core = json.load(core_file) |
- |
- for entry in core: |
- features = entry['features'] |
- entry['features'] = CalcDerivedFeatures( |
- features['opengraph'], |
- features['url'], |
- features['numElements'], |
- features['numAnchors'], |
- features['numForms'], |
- features['innerText'], |
- features['textContent'], |
- features['innerHTML']) |
- |
- with open(options.out, 'w') as outfile: |
- json.dump(core, outfile, indent=1) |
- |
- return 0 |
- |
-if __name__ == '__main__': |
- sys.exit(main(sys.argv[1:])) |
- |