| Index: heuristics/distillable/calculate_derived_features.py
|
| diff --git a/calculate_derived_features.py b/heuristics/distillable/calculate_derived_features.py
|
| old mode 100644
|
| new mode 100755
|
| similarity index 55%
|
| rename from calculate_derived_features.py
|
| rename to heuristics/distillable/calculate_derived_features.py
|
| index 7b22e0139a37fa9abef1009630a4d34df21f41d5..1ae52967464a57a2c5a730d77860f86dcdf3d738
|
| --- a/calculate_derived_features.py
|
| +++ b/heuristics/distillable/calculate_derived_features.py
|
| @@ -1,13 +1,15 @@
|
| #!/usr/bin/env python
|
| -# Copyright 2014 The Chromium Authors. All rights reserved.
|
| +# Copyright 2016 The Chromium Authors. All rights reserved.
|
| # Use of this source code is governed by a BSD-style license that can be
|
| # found in the LICENSE file.
|
|
|
| import argparse
|
| import csv
|
| import json
|
| +import marshal
|
| import os
|
| import shutil
|
| +import math
|
| import sys
|
| import re
|
| import urlparse
|
| @@ -21,7 +23,8 @@ def WordCount(s):
|
| def GetLastSegment(path):
|
| return re.search('[^/]*\/?$', path).group(0)
|
|
|
| -def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
|
| +def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
|
| + innerText, textContent, innerHTML, numText, numPassword, mozScores):
|
| path = urlparse.urlparse(url).path
|
|
|
| path = path.encode('utf-8')
|
| @@ -33,35 +36,31 @@ def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, inner
|
| textContentWords = WordCount(textContent)
|
| innerHTMLWords = WordCount(innerHTML)
|
| return [
|
| - 'opengraph', opengraph,
|
| + 'openGraph', opengraph,
|
| +
|
| 'forum', 'forum' in path,
|
| 'index', 'index' in path,
|
| + 'search', 'search' in path,
|
| 'view', 'view' in path,
|
| + 'archive', 'archive' in path,
|
| 'asp', '.asp' in path,
|
| 'phpbb', 'phpbb' in path,
|
| 'php', path.endswith('.php'),
|
| - 'pathlength', len(path),
|
| + 'pathLength', len(path),
|
| 'domain', len(path) < 2,
|
| - 'pathcomponents', CountMatches(path, r'\/.'),
|
| - 'slugdetector', CountMatches(path, r'[^\w/]'),
|
| - 'pathnumbers', CountMatches(path, r'\d+'),
|
| + 'pathComponents', CountMatches(path, r'\/.'),
|
| + 'slugDetector', CountMatches(path, r'[^\w/]'),
|
| + 'pathNumbers', CountMatches(path, r'\d+'),
|
| 'lastSegmentLength', len(GetLastSegment(path)),
|
| - 'formcount', numForms,
|
| - 'anchorcount', numAnchors,
|
| - 'elementcount', numElements,
|
| - 'anchorratio', float(numAnchors) / max(1, numElements),
|
| - 'innertextlength', len(innerText),
|
| - 'textcontentlength', len(textContent),
|
| - 'innerhtmllength', len(innerHTML),
|
| - 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
|
| - 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
|
| - 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
|
| - 'innertextwordcount', innerTextWords,
|
| - 'textcontentwordcount', textContentWords,
|
| - 'innerhtmlwordcount', innerHTMLWords,
|
| - 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
|
| - 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
|
| - 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
|
| +
|
| + 'formCount', numForms,
|
| + 'anchorCount', numAnchors,
|
| + 'elementCount', numElements,
|
| + 'anchorRatio', float(numAnchors) / max(1, numElements),
|
| +
|
| + 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)),
|
| + 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)),
|
| + 'mozScoreAllLinear', min(mozScores[5], 6000),
|
| ]
|
|
|
| def main(argv):
|
| @@ -75,19 +74,30 @@ def main(argv):
|
|
|
| core = None
|
| with open(options.core) as core_file:
|
| - core = json.load(core_file)
|
| + core = marshal.load(core_file)
|
|
|
| for entry in core:
|
| features = entry['features']
|
| + print 'processing %d' % (entry['index'])
|
| entry['features'] = CalcDerivedFeatures(
|
| + entry['index'],
|
| features['opengraph'],
|
| features['url'],
|
| + features['title'],
|
| features['numElements'],
|
| features['numAnchors'],
|
| features['numForms'],
|
| + features['numPPRE'],
|
| + features['visibleElements'],
|
| + features['visiblePPRE'],
|
| features['innerText'],
|
| features['textContent'],
|
| - features['innerHTML'])
|
| + features['innerHTML'],
|
| + features['numTextInput'],
|
| + features['numPasswordInput'],
|
| + [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
|
| + features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
|
| + )
|
|
|
| with open(options.out, 'w') as outfile:
|
| json.dump(core, outfile, indent=1)
|
|
|