calculate_derived_features.py - Issue 1289123002: Merge branch 'master' into heuristics

Unified Diff: calculate_derived_features.py

Issue 1289123002: Merge branch 'master' into heuristics Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: Created 5 years, 4 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: calculate_derived_features.py

diff --git a/calculate_derived_features.py b/calculate_derived_features.py

new file mode 100644

index 0000000000000000000000000000000000000000..7b22e0139a37fa9abef1009630a4d34df21f41d5

--- /dev/null

+++ b/calculate_derived_features.py

@@ -0,0 +1,99 @@

+#!/usr/bin/env python

+# Use of this source code is governed by a BSD-style license that can be

+# found in the LICENSE file.

import argparse
import csv
import json
import os
import re
import shutil
import sys

try:
    import urlparse  # Python 2 module name.
except ImportError:
    import urllib.parse as urlparse  # Python 3 location of the same API.

def CountMatches(s, p):
    """Return the number of non-overlapping matches of pattern p in s.

    s and p must be of the same kind (both text or both bytes); the re module
    rejects a text pattern applied to bytes and vice versa.
    """
    return len(re.findall(p, s))


def WordCount(s):
    """Return the number of runs of word characters (\\w+) in s.

    s may be text or bytes. A pattern of the matching kind is selected so the
    call works for either. With bytes input only ASCII word characters match.
    """
    pattern = br'\w+' if isinstance(s, bytes) else r'\w+'
    return CountMatches(s, pattern)


def GetLastSegment(path):
    """Return the trailing segment of path, keeping a final '/' if present.

    path may be text or bytes; the result is of the same kind. Examples:
    '/a/b' -> 'b', '/a/b/' -> 'b/', '' -> ''.
    """
    # Raw literals avoid the invalid '\/' escape; '/' needs no escaping in a
    # regular expression, so the pattern matches exactly the same strings.
    pattern = br'[^/]*/?$' if isinstance(path, bytes) else r'[^/]*/?$'
    return re.search(pattern, path).group(0)


def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms,
                        innerText, textContent, innerHTML):
    """Compute the derived feature list for one page.

    Args:
      opengraph: value stored unchanged under the 'opengraph' feature.
      url: page URL as text; only its path component is inspected.
      numElements: element count, used as-is and as the anchor-ratio divisor.
      numAnchors: anchor count.
      numForms: form count.
      innerText: text; measured after UTF-8 encoding.
      textContent: text; measured after UTF-8 encoding.
      innerHTML: text; measured after UTF-8 encoding.

    Returns:
      A flat list alternating feature name and feature value:
      [name0, value0, name1, value1, ...].

    All length and pattern features are computed on the UTF-8 encoded bytes,
    so lengths are byte counts and \\w / \\d match ASCII characters only.
    Byte-string literals are used for every membership test and pattern so
    that the same values are produced whether str is bytes or text.
    """
    path = urlparse.urlparse(url).path.encode('utf-8')
    inner_text = innerText.encode('utf-8')
    text_content = textContent.encode('utf-8')
    inner_html = innerHTML.encode('utf-8')

    inner_text_words = WordCount(inner_text)
    text_content_words = WordCount(text_content)
    inner_html_words = WordCount(inner_html)

    # Every ratio clamps its denominator to at least 1 so empty pages yield
    # 0.0 instead of raising ZeroDivisionError.
    return [
        'opengraph', opengraph,
        'forum', b'forum' in path,
        'index', b'index' in path,
        'view', b'view' in path,
        'asp', b'.asp' in path,
        'phpbb', b'phpbb' in path,
        'php', path.endswith(b'.php'),
        'pathlength', len(path),
        'domain', len(path) < 2,
        'pathcomponents', CountMatches(path, br'/.'),
        'slugdetector', CountMatches(path, br'[^\w/]'),
        'pathnumbers', CountMatches(path, br'\d+'),
        'lastSegmentLength', len(GetLastSegment(path)),
        'formcount', numForms,
        'anchorcount', numAnchors,
        'elementcount', numElements,
        'anchorratio', float(numAnchors) / max(1, numElements),
        'innertextlength', len(inner_text),
        'textcontentlength', len(text_content),
        'innerhtmllength', len(inner_html),
        'innertextlengthratio',
        float(len(inner_text)) / max(1, len(inner_html)),
        'textcontentlengthratio',
        float(len(text_content)) / max(1, len(inner_html)),
        'innertexttextcontentlengthratio',
        float(len(inner_text)) / max(1, len(text_content)),
        'innertextwordcount', inner_text_words,
        'textcontentwordcount', text_content_words,
        'innerhtmlwordcount', inner_html_words,
        'innertextwordcountratio',
        float(inner_text_words) / max(1, inner_html_words),
        'textcontentwordcountratio',
        float(text_content_words) / max(1, inner_html_words),
        'innertexttextcontentwordcountratio',
        float(inner_text_words) / max(1, text_content_words),
    ]

def main(argv):
    """Convert a JSON file of raw page features into derived features.

    Args:
      argv: command-line arguments without the program name. Requires
        --core (input JSON path) and --out (output JSON path).

    Returns:
      0 on success.

    Raises:
      Exception: if the --out path already exists; nothing is overwritten.

    The input is iterated as a sequence of entries, each holding a 'features'
    mapping. That mapping is replaced by the list CalcDerivedFeatures returns,
    and the whole structure is written to --out as JSON with indent=1.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--out', required=True)
    arg_parser.add_argument('--core', required=True)
    options = arg_parser.parse_args(argv)

    # Refuse to clobber an existing output file before doing any work.
    if os.path.exists(options.out):
        raise Exception('exists: ' + options.out)

    with open(options.core) as core_file:
        entries = json.load(core_file)

    # Raw feature keys, in the positional order CalcDerivedFeatures expects.
    raw_feature_keys = (
        'opengraph',
        'url',
        'numElements',
        'numAnchors',
        'numForms',
        'innerText',
        'textContent',
        'innerHTML',
    )
    for entry in entries:
        raw = entry['features']
        entry['features'] = CalcDerivedFeatures(
            *[raw[key] for key in raw_feature_keys])

    with open(options.out, 'w') as outfile:
        json.dump(entries, outfile, indent=1)

    return 0

# Script entry point: run main on the command-line arguments (program name
# stripped) and use its return value as the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »