| Index: calculate_derived_features.py
|
| diff --git a/calculate_derived_features.py b/calculate_derived_features.py
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..7b22e0139a37fa9abef1009630a4d34df21f41d5
|
| --- /dev/null
|
| +++ b/calculate_derived_features.py
|
| @@ -0,0 +1,99 @@
|
| +#!/usr/bin/env python
|
| +# Copyright 2014 The Chromium Authors. All rights reserved.
|
| +# Use of this source code is governed by a BSD-style license that can be
|
| +# found in the LICENSE file.
|
| +
|
import argparse
import csv
import json
import os
import re
import shutil
import sys

try:
  import urlparse  # Python 2
except ImportError:
  import urllib.parse as urlparse  # Python 3: same urlparse() API
|
| +
|
def CountMatches(s, p):
  """Returns the number of non-overlapping matches of regex p in s.

  s and p must be of the same kind: both text or both byte strings.
  """
  return len(re.findall(p, s))


def WordCount(s):
  """Returns the number of runs of regex word characters in s.

  Accepts text or a byte string; the pattern is chosen to match the kind of
  s, because the re module refuses to mix text patterns with byte subjects.
  """
  pattern = br'\w+' if isinstance(s, bytes) else r'\w+'
  return CountMatches(s, pattern)


def GetLastSegment(path):
  """Returns the last segment of path, keeping a trailing slash if present.

  For example '/a/b/' gives 'b/' and '/a/b' gives 'b'. Accepts text or a
  byte string and returns the same kind.
  """
  # Raw literals, and no backslash before '/': the original '\/' in a
  # non-raw string is an invalid escape sequence that newer Python versions
  # warn about. The pattern can match the empty string at the end of the
  # input, so re.search() never returns None here.
  pattern = br'[^/]*/?$' if isinstance(path, bytes) else r'[^/]*/?$'
  return re.search(pattern, path).group(0)


def _ToUtf8(value):
  """Returns value as a UTF-8 byte string; byte strings pass through."""
  if isinstance(value, bytes):
    return value
  return value.encode('utf-8')


def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
  """Computes derived features from the raw features of one page.

  The URL path and the three text inputs are converted to UTF-8 byte strings
  before measuring, so every length is a byte count and every regex is
  applied to bytes. All literals and patterns below are byte strings so that
  the same code runs, with the same results, on Python 2 and Python 3.

  Args:
    opengraph: passed through unchanged as the 'opengraph' value.
    url: URL of the page; only its path component is inspected.
    numElements: element count, reported as 'elementcount'.
    numAnchors: anchor count, reported as 'anchorcount'.
    numForms: form count, reported as 'formcount'.
    innerText: text or byte string, measured by length and word count.
    textContent: text or byte string, measured by length and word count.
    innerHTML: text or byte string, measured by length and word count.

  Returns:
    A flat list alternating feature name and feature value:
    [name0, value0, name1, value1, ...].
  """
  path = _ToUtf8(urlparse.urlparse(url).path)
  innerText = _ToUtf8(innerText)
  textContent = _ToUtf8(textContent)
  innerHTML = _ToUtf8(innerHTML)

  innerTextWords = WordCount(innerText)
  textContentWords = WordCount(textContent)
  innerHTMLWords = WordCount(innerHTML)

  # Every ratio uses max(1, denominator) to avoid dividing by zero.
  return [
      'opengraph', opengraph,
      'forum', b'forum' in path,
      'index', b'index' in path,
      'view', b'view' in path,
      'asp', b'.asp' in path,
      'phpbb', b'phpbb' in path,
      'php', path.endswith(b'.php'),
      'pathlength', len(path),
      # A path shorter than two bytes is '' or a single character such as '/'.
      'domain', len(path) < 2,
      'pathcomponents', CountMatches(path, br'\/.'),
      # Counts bytes that are neither word characters nor '/'.
      'slugdetector', CountMatches(path, br'[^\w/]'),
      'pathnumbers', CountMatches(path, br'\d+'),
      'lastSegmentLength', len(GetLastSegment(path)),
      'formcount', numForms,
      'anchorcount', numAnchors,
      'elementcount', numElements,
      'anchorratio', float(numAnchors) / max(1, numElements),
      'innertextlength', len(innerText),
      'textcontentlength', len(textContent),
      'innerhtmllength', len(innerHTML),
      'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
      'textcontentlengthratio',
      float(len(textContent)) / max(1, len(innerHTML)),
      'innertexttextcontentlengthratio',
      float(len(innerText)) / max(1, len(textContent)),
      'innertextwordcount', innerTextWords,
      'textcontentwordcount', textContentWords,
      'innerhtmlwordcount', innerHTMLWords,
      'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
      'textcontentwordcountratio',
      float(textContentWords) / max(1, innerHTMLWords),
      'innertexttextcontentwordcountratio',
      float(innerTextWords) / max(1, textContentWords),
  ]
|
| +
|
def main(argv):
  """Replaces each entry's raw features with derived features.

  Loads the JSON document named by --core, replaces the 'features' value of
  every entry with the list returned by CalcDerivedFeatures, and writes the
  result as JSON to the path named by --out.

  Args:
    argv: command-line arguments, without the program name.

  Returns:
    0 on success.

  Raises:
    Exception: if the --out path already exists.
  """
  arg_parser = argparse.ArgumentParser()
  arg_parser.add_argument('--out', required=True)
  arg_parser.add_argument('--core', required=True)
  options = arg_parser.parse_args(argv)

  out_path = options.out
  if os.path.exists(out_path):
    # Refuse to overwrite an existing output file.
    raise Exception('exists: ' + out_path)

  with open(options.core) as core_file:
    entries = json.load(core_file)

  # Raw feature keys, in the positional order CalcDerivedFeatures expects.
  raw_keys = ('opengraph', 'url', 'numElements', 'numAnchors', 'numForms',
              'innerText', 'textContent', 'innerHTML')
  for entry in entries:
    raw = entry['features']
    entry['features'] = CalcDerivedFeatures(*[raw[key] for key in raw_keys])

  with open(out_path, 'w') as outfile:
    json.dump(entries, outfile, indent=1)

  return 0
|
| +
|
if __name__ == '__main__':
  # Run as a script: pass the arguments (minus the program name) to main()
  # and use its return value as the process exit status.
  exit_status = main(sys.argv[1:])
  sys.exit(exit_status)
|
| +
|
|
|