Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3219)

Unified Diff: calculate_derived_features.py

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 | Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
Index: calculate_derived_features.py
diff --git a/calculate_derived_features.py b/calculate_derived_features.py
deleted file mode 100644
index 7b22e0139a37fa9abef1009630a4d34df21f41d5..0000000000000000000000000000000000000000
--- a/calculate_derived_features.py
+++ /dev/null
@@ -1,99 +0,0 @@
-#!/usr/bin/env python
-# Copyright 2014 The Chromium Authors. All rights reserved.
-# Use of this source code is governed by a BSD-style license that can be
-# found in the LICENSE file.
-
-import argparse
-import csv
-import json
-import os
-import shutil
-import sys
-import re
-import urlparse
-
-def CountMatches(s, p):
- return len(re.findall(p, s))
-
-def WordCount(s):
- return CountMatches(s, r'\w+')
-
-def GetLastSegment(path):
- return re.search('[^/]*\/?$', path).group(0)
-
-def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
- path = urlparse.urlparse(url).path
-
- path = path.encode('utf-8')
- innerText = innerText.encode('utf-8')
- textContent = textContent.encode('utf-8')
- innerHTML = innerHTML.encode('utf-8')
-
- innerTextWords = WordCount(innerText)
- textContentWords = WordCount(textContent)
- innerHTMLWords = WordCount(innerHTML)
- return [
- 'opengraph', opengraph,
- 'forum', 'forum' in path,
- 'index', 'index' in path,
- 'view', 'view' in path,
- 'asp', '.asp' in path,
- 'phpbb', 'phpbb' in path,
- 'php', path.endswith('.php'),
- 'pathlength', len(path),
- 'domain', len(path) < 2,
- 'pathcomponents', CountMatches(path, r'\/.'),
- 'slugdetector', CountMatches(path, r'[^\w/]'),
- 'pathnumbers', CountMatches(path, r'\d+'),
- 'lastSegmentLength', len(GetLastSegment(path)),
- 'formcount', numForms,
- 'anchorcount', numAnchors,
- 'elementcount', numElements,
- 'anchorratio', float(numAnchors) / max(1, numElements),
- 'innertextlength', len(innerText),
- 'textcontentlength', len(textContent),
- 'innerhtmllength', len(innerHTML),
- 'innertextlengthratio', float(len(innerText)) / max(1, len(innerHTML)),
- 'textcontentlengthratio', float(len(textContent)) / max(1, len(innerHTML)),
- 'innertexttextcontentlengthratio',float(len(innerText)) / max(1, len(textContent)),
- 'innertextwordcount', innerTextWords,
- 'textcontentwordcount', textContentWords,
- 'innerhtmlwordcount', innerHTMLWords,
- 'innertextwordcountratio', float(innerTextWords) / max(1, innerHTMLWords),
- 'textcontentwordcountratio', float(textContentWords) / max(1, innerHTMLWords),
- 'innertexttextcontentwordcountratio', float(innerTextWords) / max(1, textContentWords),
- ]
-
-def main(argv):
- parser = argparse.ArgumentParser()
- parser.add_argument('--out', required=True)
- parser.add_argument('--core', required=True)
- options = parser.parse_args(argv)
-
- if os.path.exists(options.out):
- raise Exception('exists: ' + options.out)
-
- core = None
- with open(options.core) as core_file:
- core = json.load(core_file)
-
- for entry in core:
- features = entry['features']
- entry['features'] = CalcDerivedFeatures(
- features['opengraph'],
- features['url'],
- features['numElements'],
- features['numAnchors'],
- features['numForms'],
- features['innerText'],
- features['textContent'],
- features['innerHTML'])
-
- with open(options.out, 'w') as outfile:
- json.dump(core, outfile, indent=1)
-
- return 0
-
-if __name__ == '__main__':
- sys.exit(main(sys.argv[1:]))
-
« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698