| Index: calculate_derived_features.py
|
| diff --git a/calculate_derived_features.py b/calculate_derived_features.py
|
| deleted file mode 100644
|
| index 7b22e0139a37fa9abef1009630a4d34df21f41d5..0000000000000000000000000000000000000000
|
| --- a/calculate_derived_features.py
|
| +++ /dev/null
|
| @@ -1,99 +0,0 @@
|
| -#!/usr/bin/env python
|
| -# Copyright 2014 The Chromium Authors. All rights reserved.
|
| -# Use of this source code is governed by a BSD-style license that can be
|
| -# found in the LICENSE file.
|
| -
|
| -import argparse
|
| -import csv
|
| -import json
|
| -import os
|
| -import shutil
|
| -import sys
|
| -import re
|
| -import urlparse
|
| -
|
| -def CountMatches(s, p):
|
| - return len(re.findall(p, s))
|
| -
|
| -def WordCount(s):
|
| - return CountMatches(s, r'\w+')
|
| -
|
| -def GetLastSegment(path):
|
| - return re.search('[^/]*\/?$', path).group(0)
|
| -
|
| -def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
|
| - path = urlparse.urlparse(url).path
|
| -
|
| - path = path.encode('utf-8')
|
| - innerText = innerText.encode('utf-8')
|
| - textContent = textContent.encode('utf-8')
|
| - innerHTML = innerHTML.encode('utf-8')
|
| -
|
| - innerTextWords = WordCount(innerText)
|
| - textContentWords = WordCount(textContent)
|
| - innerHTMLWords = WordCount(innerHTML)
|
| - return [
|
| - 'opengraph', opengraph,
|
| - 'forum', 'forum' in path,
|
| - 'index', 'index' in path,
|
| - 'view', 'view' in path,
|
| - 'asp', '.asp' in path,
|
| - 'phpbb', 'phpbb' in path,
|
| - 'php', path.endswith('.php'),
|
| - 'pathlength', len(path),
|
| - 'domain', len(path) < 2,
|
| - 'pathcomponents', CountMatches(path, r'\/.'),
|
| - 'slugdetector', CountMatches(path, r'[^\w/]'),
|
| - 'pathnumbers', CountMatches(path, r'\d+'),
|
| - 'lastSegmentLength', len(GetLastSegment(path)),
|
| - 'formcount', numForms,
|
| - 'anchorcount', numAnchors,
|
| - 'elementcount', numElements,
|
| - 'anchorratio', float(numAnchors) / max(1, numElements),
|
| - 'innertextlength', len(innerText),
|
| - 'textcontentlength', len(textContent),
|
| - 'innerhtmllength', len(innerHTML),
|
| - 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
|
| - 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
|
| - 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
|
| - 'innertextwordcount', innerTextWords,
|
| - 'textcontentwordcount', textContentWords,
|
| - 'innerhtmlwordcount', innerHTMLWords,
|
| - 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
|
| - 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
|
| - 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
|
| - ]
|
| -
|
| -def main(argv):
|
| - parser = argparse.ArgumentParser()
|
| - parser.add_argument('--out', required=True)
|
| - parser.add_argument('--core', required=True)
|
| - options = parser.parse_args(argv)
|
| -
|
| - if os.path.exists(options.out):
|
| - raise Exception('exists: ' + options.out)
|
| -
|
| - core = None
|
| - with open(options.core) as core_file:
|
| - core = json.load(core_file)
|
| -
|
| - for entry in core:
|
| - features = entry['features']
|
| - entry['features'] = CalcDerivedFeatures(
|
| - features['opengraph'],
|
| - features['url'],
|
| - features['numElements'],
|
| - features['numAnchors'],
|
| - features['numForms'],
|
| - features['innerText'],
|
| - features['textContent'],
|
| - features['innerHTML'])
|
| -
|
| - with open(options.out, 'w') as outfile:
|
| - json.dump(core, outfile, indent=1)
|
| -
|
| - return 0
|
| -
|
| -if __name__ == '__main__':
|
| - sys.exit(main(sys.argv[1:]))
|
| -
|
|
|