calculate_derived_features.py - Issue 1620043002: Add scripts for distillability modelling

Unified Diff: calculate_derived_features.py

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master

Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: calculate_derived_features.py

diff --git a/calculate_derived_features.py b/calculate_derived_features.py

deleted file mode 100644

index 7b22e0139a37fa9abef1009630a4d34df21f41d5..0000000000000000000000000000000000000000

--- a/calculate_derived_features.py

+++ /dev/null

@@ -1,99 +0,0 @@

-#!/usr/bin/env python

-# Use of this source code is governed by a BSD-style license that can be

-# found in the LICENSE file.

-import argparse

-import csv

-import json

-import os

-import shutil

-import sys

-import re

-import urlparse

def CountMatches(s, p):
    """Return the number of non-overlapping matches of regex `p` in `s`."""
    # Tally the matches lazily instead of building the full result list.
    return sum(1 for _ in re.finditer(p, s))

def WordCount(s):
    """Return how many runs of word characters (`\\w+`) occur in `s`."""
    words = re.findall(r'\w+', s)
    return len(words)

def GetLastSegment(path):
    """Return the final segment of `path`, keeping a trailing slash if present.

    The segment is the longest run of non-slash characters at the end of
    `path`, optionally followed by one '/'. An empty path yields '' and a
    bare '/' yields '/'.
    """
    # Raw string: the previous non-raw pattern contained the invalid escape
    # sequence '\/' which newer Python versions warn about. '/' needs no
    # escaping in a regex, so this pattern matches exactly the same text.
    # The pattern can match the empty string, so search() never returns None.
    return re.search(r'[^/]*/?$', path).group(0)

def CalcDerivedFeatures(opengraph, url, numElements, numAnchors, numForms, innerText, textContent, innerHTML):
    """Build the derived feature list for one page.

    Args:
      opengraph: value passed through unchanged as the 'opengraph' feature.
      url: page URL; only its path component is inspected.
      numElements: element count of the page.
      numAnchors: anchor count of the page.
      numForms: form count of the page.
      innerText: text whose length and word count are measured.
      textContent: text whose length and word count are measured.
      innerHTML: markup whose length and word count are measured.

    Returns:
      A flat list alternating feature name and feature value
      (['opengraph', <value>, 'forum', <bool>, ...]).
    """
    def ratio(numerator, denominator):
        # Clamp the denominator to at least 1 so empty inputs cannot divide by 0.
        return float(numerator) / max(1, denominator)

    # All string measurements are taken on the UTF-8 encoded values.
    path = urlparse.urlparse(url).path.encode('utf-8')
    inner_text = innerText.encode('utf-8')
    text_content = textContent.encode('utf-8')
    inner_html = innerHTML.encode('utf-8')

    inner_text_len = len(inner_text)
    text_content_len = len(text_content)
    inner_html_len = len(inner_html)

    inner_text_words = WordCount(inner_text)
    text_content_words = WordCount(text_content)
    inner_html_words = WordCount(inner_html)

    path_len = len(path)

    return [
        # URL path features.
        'opengraph', opengraph,
        'forum', 'forum' in path,
        'index', 'index' in path,
        'view', 'view' in path,
        'asp', '.asp' in path,
        'phpbb', 'phpbb' in path,
        'php', path.endswith('.php'),
        'pathlength', path_len,
        'domain', path_len < 2,
        'pathcomponents', CountMatches(path, r'\/.'),
        'slugdetector', CountMatches(path, r'[^\w/]'),
        'pathnumbers', CountMatches(path, r'\d+'),
        'lastSegmentLength', len(GetLastSegment(path)),
        # DOM structure features.
        'formcount', numForms,
        'anchorcount', numAnchors,
        'elementcount', numElements,
        'anchorratio', ratio(numAnchors, numElements),
        # Length features.
        'innertextlength', inner_text_len,
        'textcontentlength', text_content_len,
        'innerhtmllength', inner_html_len,
        'innertextlengthratio', ratio(inner_text_len, inner_html_len),
        'textcontentlengthratio', ratio(text_content_len, inner_html_len),
        'innertexttextcontentlengthratio', ratio(inner_text_len, text_content_len),
        # Word-count features.
        'innertextwordcount', inner_text_words,
        'textcontentwordcount', text_content_words,
        'innerhtmlwordcount', inner_html_words,
        'innertextwordcountratio', ratio(inner_text_words, inner_html_words),
        'textcontentwordcountratio', ratio(text_content_words, inner_html_words),
        'innertexttextcontentwordcountratio', ratio(inner_text_words, text_content_words),
    ]

def main(argv):
    """Replace each entry's raw features with derived features and save them.

    Args:
      argv: command-line arguments (without the program name). Both
        `--core` (input JSON path) and `--out` (output JSON path) are required.

    Returns:
      0 on success.

    Raises:
      Exception: if the `--out` path already exists.
    """
    parser = argparse.ArgumentParser()
    for flag in ('--out', '--core'):
        parser.add_argument(flag, required=True)
    options = parser.parse_args(argv)

    # Refuse to overwrite an existing output file.
    if os.path.exists(options.out):
        raise Exception('exists: ' + options.out)

    with open(options.core) as core_file:
        core = json.load(core_file)

    # Raw feature keys, in the positional order CalcDerivedFeatures expects.
    raw_keys = (
        'opengraph',
        'url',
        'numElements',
        'numAnchors',
        'numForms',
        'innerText',
        'textContent',
        'innerHTML',
    )
    for entry in core:
        raw = entry['features']
        entry['features'] = CalcDerivedFeatures(*[raw[key] for key in raw_keys])

    with open(options.out, 'w') as outfile:
        json.dump(core, outfile, indent=1)

    return 0

if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    exit_status = main(sys.argv[1:])
    sys.exit(exit_status)

« no previous file with comments | « no previous file | extract_features.js » ('j') | no next file with comments »