Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(670)

Unified Diff: calculate_derived_features.py

Issue 1289123002: Merge branch 'master' into heuristics Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: calculate_derived_features.py
diff --git a/calculate_derived_features.py b/calculate_derived_features.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b22e0139a37fa9abef1009630a4d34df21f41d5
--- /dev/null
+++ b/calculate_derived_features.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python
+# Copyright 2014 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+import argparse
+import csv
+import json
+import os
+import shutil
+import sys
+import re
+import urlparse
+
+def CountMatches(s, p):
+ return len(re.findall(p, s))
+
+def WordCount(s):
+ return CountMatches(s, r'\w+')
+
+def GetLastSegment(path):
+ return re.search('[^/]*\/?$', path).group(0)
+
+def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
+ path = urlparse.urlparse(url).path
+
+ path = path.encode('utf-8')
+ innerText = innerText.encode('utf-8')
+ textContent = textContent.encode('utf-8')
+ innerHTML = innerHTML.encode('utf-8')
+
+ innerTextWords = WordCount(innerText)
+ textContentWords = WordCount(textContent)
+ innerHTMLWords = WordCount(innerHTML)
+ return [
+ 'opengraph', opengraph,
+ 'forum', 'forum' in path,
+ 'index', 'index' in path,
+ 'view', 'view' in path,
+ 'asp', '.asp' in path,
+ 'phpbb', 'phpbb' in path,
+ 'php', path.endswith('.php'),
+ 'pathlength', len(path),
+ 'domain', len(path) < 2,
+ 'pathcomponents', CountMatches(path, r'\/.'),
+ 'slugdetector', CountMatches(path, r'[^\w/]'),
+ 'pathnumbers', CountMatches(path, r'\d+'),
+ 'lastSegmentLength', len(GetLastSegment(path)),
+ 'formcount', numForms,
+ 'anchorcount', numAnchors,
+ 'elementcount', numElements,
+ 'anchorratio', float(numAnchors) / max(1, numElements),
+ 'innertextlength', len(innerText),
+ 'textcontentlength', len(textContent),
+ 'innerhtmllength', len(innerHTML),
+ 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
+ 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
+ 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
+ 'innertextwordcount', innerTextWords,
+ 'textcontentwordcount', textContentWords,
+ 'innerhtmlwordcount', innerHTMLWords,
+ 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
+ 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
+ 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
+ ]
+
+def main(argv):
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--out', required=True)
+ parser.add_argument('--core', required=True)
+ options = parser.parse_args(argv)
+
+ if os.path.exists(options.out):
+ raise Exception('exists: ' + options.out)
+
+ core = None
+ with open(options.core) as core_file:
+ core = json.load(core_file)
+
+ for entry in core:
+ features = entry['features']
+ entry['features'] = CalcDerivedFeatures(
+ features['opengraph'],
+ features['url'],
+ features['numElements'],
+ features['numAnchors'],
+ features['numForms'],
+ features['innerText'],
+ features['textContent'],
+ features['innerHTML'])
+
+ with open(options.out, 'w') as outfile:
+ json.dump(core, outfile, indent=1)
+
+ return 0
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
+
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698