Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(301)

Unified Diff: heuristics/distillable/calculate_derived_features.py

Issue 1620043002: Add scripts for distillability modelling (Closed) | Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 | Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/calculate_derived_features.py
diff --git a/calculate_derived_features.py b/heuristics/distillable/calculate_derived_features.py
old mode 100644
new mode 100755
similarity index 55%
rename from calculate_derived_features.py
rename to heuristics/distillable/calculate_derived_features.py
index 7b22e0139a37fa9abef1009630a4d34df21f41d5..1ae52967464a57a2c5a730d77860f86dcdf3d738
--- a/calculate_derived_features.py
+++ b/heuristics/distillable/calculate_derived_features.py
@@ -1,13 +1,15 @@
#!/usr/bin/env python
-# Copyright 2014 The Chromium Authors. All rights reserved.
+# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import argparse
import csv
import json
+import marshal
import os
import shutil
+import math
import sys
import re
import urlparse
@@ -21,7 +23,8 @@ def WordCount(s):
def GetLastSegment(path):
return re.search('[^/]*\/?$', path).group(0)
-def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
+def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
+ innerText, textContent, innerHTML, numText, numPassword, mozScores):
path = urlparse.urlparse(url).path
path = path.encode('utf-8')
@@ -33,35 +36,31 @@ def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, inner
textContentWords = WordCount(textContent)
innerHTMLWords = WordCount(innerHTML)
return [
- 'opengraph', opengraph,
+ 'openGraph', opengraph,
+
'forum', 'forum' in path,
'index', 'index' in path,
+ 'search', 'search' in path,
'view', 'view' in path,
+ 'archive', 'archive' in path,
'asp', '.asp' in path,
'phpbb', 'phpbb' in path,
'php', path.endswith('.php'),
- 'pathlength', len(path),
+ 'pathLength', len(path),
'domain', len(path) < 2,
- 'pathcomponents', CountMatches(path, r'\/.'),
- 'slugdetector', CountMatches(path, r'[^\w/]'),
- 'pathnumbers', CountMatches(path, r'\d+'),
+ 'pathComponents', CountMatches(path, r'\/.'),
+ 'slugDetector', CountMatches(path, r'[^\w/]'),
+ 'pathNumbers', CountMatches(path, r'\d+'),
'lastSegmentLength', len(GetLastSegment(path)),
- 'formcount', numForms,
- 'anchorcount', numAnchors,
- 'elementcount', numElements,
- 'anchorratio', float(numAnchors) / max(1, numElements),
- 'innertextlength', len(innerText),
- 'textcontentlength', len(textContent),
- 'innerhtmllength', len(innerHTML),
- 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
- 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
- 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
- 'innertextwordcount', innerTextWords,
- 'textcontentwordcount', textContentWords,
- 'innerhtmlwordcount', innerHTMLWords,
- 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
- 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
- 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
+
+ 'formCount', numForms,
+ 'anchorCount', numAnchors,
+ 'elementCount', numElements,
+ 'anchorRatio', float(numAnchors) / max(1, numElements),
+
+ 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)),
+ 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)),
+ 'mozScoreAllLinear', min(mozScores[5], 6000),
]
def main(argv):
@@ -75,19 +74,30 @@ def main(argv):
core = None
with open(options.core) as core_file:
- core = json.load(core_file)
+ core = marshal.load(core_file)
for entry in core:
features = entry['features']
+ print 'processing %d' % (entry['index'])
entry['features'] = CalcDerivedFeatures(
+ entry['index'],
features['opengraph'],
features['url'],
+ features['title'],
features['numElements'],
features['numAnchors'],
features['numForms'],
+ features['numPPRE'],
+ features['visibleElements'],
+ features['visiblePPRE'],
features['innerText'],
features['textContent'],
- features['innerHTML'])
+ features['innerHTML'],
+ features['numTextInput'],
+ features['numPasswordInput'],
+ [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
+ features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
+ )
with open(options.out, 'w') as outfile:
json.dump(core, outfile, indent=1)
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698