| OLD | NEW |
| 1 #!/usr/bin/env python | 1 #!/usr/bin/env python |
| 2 # Copyright 2016 The Chromium Authors. All rights reserved. | 2 # Copyright 2016 The Chromium Authors. All rights reserved. |
| 3 # Use of this source code is governed by a BSD-style license that can be | 3 # Use of this source code is governed by a BSD-style license that can be |
| 4 # found in the LICENSE file. | 4 # found in the LICENSE file. |
| 5 | 5 |
| 6 import argparse | 6 import argparse |
| 7 import csv | 7 import csv |
| 8 import json | 8 import json |
| 9 import marshal | 9 import marshal |
| 10 import os | 10 import os |
| 11 import shutil | 11 import shutil |
| 12 import math | 12 import math |
| 13 import sys | 13 import sys |
| 14 import re | 14 import re |
| 15 import urlparse | 15 import urlparse |
| 16 | 16 |
| 17 def CountMatches(s, p): | 17 def CountMatches(s, p): |
| 18 return len(re.findall(p, s)) | 18 return len(re.findall(p, s)) |
| 19 | 19 |
| 20 def WordCount(s): | 20 def WordCount(s): |
| 21 return CountMatches(s, r'\w+') | 21 return CountMatches(s, r'\w+') |
| 22 | 22 |
| 23 def GetLastSegment(path): | 23 def GetLastSegment(path): |
| 24 return re.search('[^/]*\/?$', path).group(0) | 24 return re.search('[^/]*\/?$', path).group(0) |
| 25 | 25 |
| 26 def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
umForms, numPPRE, visibleElements, visiblePPRE, | 26 def CalcDerivedFeatures(index, raw): |
| 27 innerText, textContent, innerHTML, numText, numPassword, mozScores): | 27 return _CalcDerivedFeatures( |
| 28 index, |
| 29 raw, |
| 30 raw['opengraph'], |
| 31 raw['url'], |
| 32 raw['title'], |
| 33 raw['numElements'], |
| 34 raw['numAnchors'], |
| 35 raw['numForms'], |
| 36 raw['numPPRE'], |
| 37 raw['visibleElements'], |
| 38 raw['visiblePPRE'], |
| 39 raw['innerText'], |
| 40 raw['textContent'], |
| 41 raw['innerHTML'], |
| 42 raw['numTextInput'], |
| 43 raw['numPasswordInput'] |
| 44 ) |
| 45 |
| 46 def _CalcDerivedFeatures(index, raw, opengraph, url, title, numElements, numAnch
ors, numForms, numPPRE, visibleElements, visiblePPRE, |
| 47 innerText, textContent, innerHTML, numText, numPassword): |
| 28 path = urlparse.urlparse(url).path | 48 path = urlparse.urlparse(url).path |
| 29 | 49 |
| 30 path = path.encode('utf-8') | 50 path = path.encode('utf-8') |
| 31 innerText = innerText.encode('utf-8') | 51 innerText = innerText.encode('utf-8') |
| 32 textContent = textContent.encode('utf-8') | 52 textContent = textContent.encode('utf-8') |
| 33 innerHTML = innerHTML.encode('utf-8') | 53 innerHTML = innerHTML.encode('utf-8') |
| 34 | 54 |
| 35 innerTextWords = WordCount(innerText) | 55 innerTextWords = WordCount(innerText) |
| 36 textContentWords = WordCount(textContent) | 56 textContentWords = WordCount(textContent) |
| 37 innerHTMLWords = WordCount(innerHTML) | 57 innerHTMLWords = WordCount(innerHTML) |
| 38 return [ | 58 features = [ |
| 59 'id', index, |
| 60 'sin', math.sin(index), |
| 39 'openGraph', opengraph, | 61 'openGraph', opengraph, |
| 40 | 62 |
| 41 'forum', 'forum' in path, | 63 'forum', 'forum' in path, |
| 42 'index', 'index' in path, | 64 'index', 'index' in path, |
| 43 'search', 'search' in path, | 65 'search', 'search' in path, |
| 44 'view', 'view' in path, | 66 'view', 'view' in path, |
| 45 'archive', 'archive' in path, | 67 'archive', 'archive' in path, |
| 46 'asp', '.asp' in path, | 68 'asp', '.asp' in path, |
| 47 'phpbb', 'phpbb' in path, | 69 'phpbb', 'phpbb' in path, |
| 48 'php', path.endswith('.php'), | 70 'php', path.endswith('.php'), |
| 49 'pathLength', len(path), | 71 'pathLength', len(path), |
| 50 'domain', len(path) < 2, | 72 'domain', len(path) < 2, |
| 51 'pathComponents', CountMatches(path, r'\/.'), | 73 'pathComponents', CountMatches(path, r'\/.'), |
| 52 'slugDetector', CountMatches(path, r'[^\w/]'), | 74 'slugDetector', CountMatches(path, r'[^\w/]'), |
| 53 'pathNumbers', CountMatches(path, r'\d+'), | 75 'pathNumbers', CountMatches(path, r'\d+'), |
| 54 'lastSegmentLength', len(GetLastSegment(path)), | 76 'lastSegmentLength', len(GetLastSegment(path)), |
| 55 | 77 |
| 78 'visibleRatio', float(visibleElements) / max(1, numElements), |
| 79 'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE), |
| 80 'PPRERatio', float(numPPRE) / max(1, numElements), |
| 81 'anchorPPRERatio', float(numAnchors) / max(1, numPPRE), |
| 82 |
| 83 'innerTextLength', len(innerText), |
| 84 'textContentLength', len(textContent), |
| 85 'innerHtmlLength', len(innerHTML), |
| 86 'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)), |
| 87 'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)), |
| 88 'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textCon
tent)), |
| 89 |
| 90 'innerTextWordCount', innerTextWords, |
| 91 'textContentWordCount', textContentWords, |
| 92 'innerhtmlWordCount', innerHTMLWords, |
| 93 'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords), |
| 94 'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords
), |
| 95 'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textCon
tentWords), |
| 96 |
| 97 'textCount', numText, |
| 98 'passwordCount', numPassword, |
| 56 'formCount', numForms, | 99 'formCount', numForms, |
| 57 'anchorCount', numAnchors, | 100 'anchorCount', numAnchors, |
| 58 'elementCount', numElements, | 101 'elementCount', numElements, |
| 59 'anchorRatio', float(numAnchors) / max(1, numElements), | 102 'anchorRatio', float(numAnchors) / max(1, numElements), |
| 103 ] |
| 60 | 104 |
| 61 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)), | 105 for k in sorted(raw): |
| 62 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)), | 106 if 'mozScore' in k or 'num' in k: |
| 63 'mozScoreAllLinear', min(mozScores[5], 6000), | 107 features += [k, raw[k]] |
| 64 ] | 108 |
| 109 return features |
| 65 | 110 |
| 66 def main(argv): | 111 def main(argv): |
| 67 parser = argparse.ArgumentParser() | 112 parser = argparse.ArgumentParser() |
| 68 parser.add_argument('--out', required=True) | 113 parser.add_argument('--out', required=True) |
| 69 parser.add_argument('--core', required=True) | 114 parser.add_argument('--core', required=True) |
| 70 options = parser.parse_args(argv) | 115 options = parser.parse_args(argv) |
| 71 | 116 |
| 72 if os.path.exists(options.out): | 117 if os.path.exists(options.out): |
| 73 raise Exception('exists: ' + options.out) | 118 raise Exception('exists: ' + options.out) |
| 74 | 119 |
| 75 core = None | 120 core = None |
| 76 with open(options.core) as core_file: | 121 with open(options.core) as core_file: |
| 77 core = marshal.load(core_file) | 122 core = marshal.load(core_file) |
| 78 | 123 |
| 79 for entry in core: | 124 for entry in core: |
| 80 features = entry['features'] | 125 features = entry['features'] |
| 81 print 'processing %d' % (entry['index']) | 126 print 'processing %d' % (entry['index']) |
| 82 entry['features'] = CalcDerivedFeatures( | 127 |
| 83 entry['index'], | 128 entry['features'] = CalcDerivedFeatures(entry['index'], features) |
| 84 features['opengraph'], | |
| 85 features['url'], | |
| 86 features['title'], | |
| 87 features['numElements'], | |
| 88 features['numAnchors'], | |
| 89 features['numForms'], | |
| 90 features['numPPRE'], | |
| 91 features['visibleElements'], | |
| 92 features['visiblePPRE'], | |
| 93 features['innerText'], | |
| 94 features['textContent'], | |
| 95 features['innerHTML'], | |
| 96 features['numTextInput'], | |
| 97 features['numPasswordInput'], | |
| 98 [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllL
inear'], | |
| 99 features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozS
coreFastAllLinear']] | |
| 100 ) | |
| 101 | 129 |
| 102 with open(options.out, 'w') as outfile: | 130 with open(options.out, 'w') as outfile: |
| 103 json.dump(core, outfile, indent=1) | 131 json.dump(core, outfile, indent=1) |
| 104 | 132 |
| 105 return 0 | 133 return 0 |
| 106 | 134 |
| 107 if __name__ == '__main__': | 135 if __name__ == '__main__': |
| 108 sys.exit(main(sys.argv[1:])) | 136 sys.exit(main(sys.argv[1:])) |
| 109 | 137 |
| OLD | NEW |