OLD | NEW |
(Empty) | |
| 1 #!/usr/bin/env python |
| 2 # Copyright 2014 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. |
| 5 |
import argparse
import csv
import json
import os
import re
import shutil
import sys

try:
    import urlparse
except ImportError:
    # Python 3 moved the urlparse module to urllib.parse.
    import urllib.parse as urlparse
| 14 |
def CountMatches(s, p):
    """Return how many non-overlapping matches of regex `p` occur in `s`."""
    total = 0
    for _ in re.finditer(p, s):
        total += 1
    return total
| 17 |
def WordCount(s):
    """Return the number of maximal runs of word characters in `s`."""
    words = re.findall(r'\w+', s)
    return len(words)
| 20 |
def GetLastSegment(path):
    """Return the final segment of `path`, keeping a trailing slash if any.

    For example '/a/b/c.html' yields 'c.html' and '/a/b/' yields 'b/'.

    Args:
        path: The path string to inspect.

    Returns:
        The text after the last interior '/', including a trailing '/'
        when the path ends with one. An empty path yields ''.
    """
    # A raw string keeps the pattern free of invalid string escapes ('\/').
    # The pattern can match the empty string at the end of the input, so
    # re.search() always finds a match here and never returns None.
    return re.search(r'[^/]*/?$', path).group(0)
| 23 |
def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
    """Compute the derived feature list for one page.

    The URL path and the three text blobs are encoded to UTF-8 first, so
    every length is a byte count and every regex runs on bytes. Byte
    literals and byte patterns are used throughout so the function
    behaves the same on Python 2 and Python 3.

    Args:
        opengraph: Value reported unchanged as the 'opengraph' feature.
        url: Page URL as text; only its path component is inspected.
        numElements: Reported as 'elementcount' and used as the
            denominator of 'anchorratio'.
        numAnchors: Reported as 'anchorcount' and used as the numerator
            of 'anchorratio'.
        numForms: Reported as 'formcount'.
        innerText: Text whose UTF-8 length and word count are measured.
        textContent: Text whose UTF-8 length and word count are measured.
        innerHTML: Text whose UTF-8 length and word count are measured.

    Returns:
        A flat list alternating feature names and values:
        [name0, value0, name1, value1, ...].
    """
    path_text = urlparse.urlparse(url).path

    path = path_text.encode('utf-8')
    innerText = innerText.encode('utf-8')
    textContent = textContent.encode('utf-8')
    innerHTML = innerHTML.encode('utf-8')

    # Extract the last segment from the text path and encode afterwards:
    # '/' is a single ASCII byte in UTF-8, so the segment boundaries (and
    # hence the byte length) are the same as when matching on the bytes.
    lastSegmentLength = len(GetLastSegment(path_text).encode('utf-8'))

    # Byte patterns: on bytes, \w and \d match ASCII characters only.
    innerTextWords = CountMatches(innerText, br'\w+')
    textContentWords = CountMatches(textContent, br'\w+')
    innerHTMLWords = CountMatches(innerHTML, br'\w+')

    # Every ratio clamps its denominator to at least 1 to avoid dividing
    # by zero on empty pages.
    return [
        'opengraph', opengraph,
        'forum', b'forum' in path,
        'index', b'index' in path,
        'view', b'view' in path,
        'asp', b'.asp' in path,
        'phpbb', b'phpbb' in path,
        'php', path.endswith(b'.php'),
        'pathlength', len(path),
        'domain', len(path) < 2,
        'pathcomponents', CountMatches(path, br'\/.'),
        'slugdetector', CountMatches(path, br'[^\w/]'),
        'pathnumbers', CountMatches(path, br'\d+'),
        'lastSegmentLength', lastSegmentLength,
        'formcount', numForms,
        'anchorcount', numAnchors,
        'elementcount', numElements,
        'anchorratio', float(numAnchors) / max(1, numElements),
        'innertextlength', len(innerText),
        'textcontentlength', len(textContent),
        'innerhtmllength', len(innerHTML),
        'innertextlengthratio',
        float(len(innerText)) / max(1, len(innerHTML)),
        'textcontentlengthratio',
        float(len(textContent)) / max(1, len(innerHTML)),
        'innertexttextcontentlengthratio',
        float(len(innerText)) / max(1, len(textContent)),
        'innertextwordcount', innerTextWords,
        'textcontentwordcount', textContentWords,
        'innerhtmlwordcount', innerHTMLWords,
        'innertextwordcountratio',
        float(innerTextWords) / max(1, innerHTMLWords),
        'textcontentwordcountratio',
        float(textContentWords) / max(1, innerHTMLWords),
        'innertexttextcontentwordcountratio',
        float(innerTextWords) / max(1, textContentWords),
    ]
| 66 |
def main(argv):
    """Rewrite each entry's raw features as derived features.

    Reads the JSON list named by --core, replaces every entry's
    'features' value with the output of CalcDerivedFeatures, and writes
    the result as JSON to the path named by --out.

    Args:
        argv: Command-line arguments, excluding the program name.

    Returns:
        0 on success.

    Raises:
        Exception: If the --out path already exists.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--out', '--core'):
        arg_parser.add_argument(flag, required=True)
    options = arg_parser.parse_args(argv)

    out_path = options.out
    if os.path.exists(out_path):
        # Refuse to overwrite an existing output file.
        raise Exception('exists: ' + out_path)

    with open(options.core) as core_file:
        entries = json.load(core_file)

    # Raw feature keys, in the positional order CalcDerivedFeatures expects.
    raw_keys = (
        'opengraph',
        'url',
        'numElements',
        'numAnchors',
        'numForms',
        'innerText',
        'textContent',
        'innerHTML',
    )
    for entry in entries:
        raw = entry['features']
        entry['features'] = CalcDerivedFeatures(*[raw[key] for key in raw_keys])

    with open(out_path, 'w') as outfile:
        json.dump(entries, outfile, indent=1)

    return 0
| 96 |
# Script entry point: run main() on the command-line arguments and use its
# return value as the process exit status.
if __name__ == '__main__':
    exit_code = main(sys.argv[1:])
    sys.exit(exit_code)
| 99 |
OLD | NEW |