Index: heuristics/distillable/calculate_derived_features.py |
diff --git a/calculate_derived_features.py b/heuristics/distillable/calculate_derived_features.py |
old mode 100644 |
new mode 100755 |
similarity index 55% |
rename from calculate_derived_features.py |
rename to heuristics/distillable/calculate_derived_features.py |
index 7b22e0139a37fa9abef1009630a4d34df21f41d5..1ae52967464a57a2c5a730d77860f86dcdf3d738 |
--- a/calculate_derived_features.py |
+++ b/heuristics/distillable/calculate_derived_features.py |
@@ -1,13 +1,15 @@ |
#!/usr/bin/env python |
-# Copyright 2014 The Chromium Authors. All rights reserved. |
+# Copyright 2016 The Chromium Authors. All rights reserved. |
# Use of this source code is governed by a BSD-style license that can be |
# found in the LICENSE file. |
import argparse |
import csv |
import json |
+import marshal |
import os |
import shutil |
+import math |
import sys |
import re |
import urlparse |
@@ -21,7 +23,8 @@ def WordCount(s): |
def GetLastSegment(path): |
return re.search('[^/]*\/?$', path).group(0) |
-def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML): |
+def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE, |
+ innerText, textContent, innerHTML, numText, numPassword, mozScores): |
path = urlparse.urlparse(url).path |
path = path.encode('utf-8') |
@@ -33,35 +36,31 @@ def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, inner |
textContentWords = WordCount(textContent) |
innerHTMLWords = WordCount(innerHTML) |
return [ |
- 'opengraph', opengraph, |
+ 'openGraph', opengraph, |
+ |
'forum', 'forum' in path, |
'index', 'index' in path, |
+ 'search', 'search' in path, |
'view', 'view' in path, |
+ 'archive', 'archive' in path, |
'asp', '.asp' in path, |
'phpbb', 'phpbb' in path, |
'php', path.endswith('.php'), |
- 'pathlength', len(path), |
+ 'pathLength', len(path), |
'domain', len(path) < 2, |
- 'pathcomponents', CountMatches(path, r'\/.'), |
- 'slugdetector', CountMatches(path, r'[^\w/]'), |
- 'pathnumbers', CountMatches(path, r'\d+'), |
+ 'pathComponents', CountMatches(path, r'\/.'), |
+ 'slugDetector', CountMatches(path, r'[^\w/]'), |
+ 'pathNumbers', CountMatches(path, r'\d+'), |
'lastSegmentLength', len(GetLastSegment(path)), |
- 'formcount', numForms, |
- 'anchorcount', numAnchors, |
- 'elementcount', numElements, |
- 'anchorratio', float(numAnchors) / max(1, numElements), |
- 'innertextlength', len(innerText), |
- 'textcontentlength', len(textContent), |
- 'innerhtmllength', len(innerHTML), |
- 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)), |
- 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)), |
- 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)), |
- 'innertextwordcount', innerTextWords, |
- 'textcontentwordcount', textContentWords, |
- 'innerhtmlwordcount', innerHTMLWords, |
- 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords), |
- 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords), |
- 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords), |
+ |
+ 'formCount', numForms, |
+ 'anchorCount', numAnchors, |
+ 'elementCount', numElements, |
+ 'anchorRatio', float(numAnchors) / max(1, numElements), |
+ |
+ 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)), |
+ 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)), |
+ 'mozScoreAllLinear', min(mozScores[5], 6000), |
] |
def main(argv): |
@@ -75,19 +74,30 @@ def main(argv): |
core = None |
with open(options.core) as core_file: |
- core = json.load(core_file) |
+ core = marshal.load(core_file) |
for entry in core: |
features = entry['features'] |
+ print 'processing %d' % (entry['index']) |
entry['features'] = CalcDerivedFeatures( |
+ entry['index'], |
features['opengraph'], |
features['url'], |
+ features['title'], |
features['numElements'], |
features['numAnchors'], |
features['numForms'], |
+ features['numPPRE'], |
+ features['visibleElements'], |
+ features['visiblePPRE'], |
features['innerText'], |
features['textContent'], |
- features['innerHTML']) |
+ features['innerHTML'], |
+ features['numTextInput'], |
+ features['numPasswordInput'], |
+ [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'], |
+ features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']] |
+ ) |
with open(options.out, 'w') as outfile: |
json.dump(core, outfile, indent=1) |