Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(327)

Unified Diff: heuristics/distillable/calculate_derived_features.py

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/check_derived_features.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
Index: heuristics/distillable/calculate_derived_features.py
diff --git a/heuristics/distillable/calculate_derived_features.py b/heuristics/distillable/calculate_derived_features.py
index 1ae52967464a57a2c5a730d77860f86dcdf3d738..dcff117ccfdb697663290571355651b5bfe92a74 100755
--- a/heuristics/distillable/calculate_derived_features.py
+++ b/heuristics/distillable/calculate_derived_features.py
@@ -23,8 +23,28 @@ def WordCount(s):
def GetLastSegment(path):
return re.search('[^/]*\/?$', path).group(0)
-def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
- innerText, textContent, innerHTML, numText, numPassword, mozScores):
+def CalcDerivedFeatures(index, raw):
+ return _CalcDerivedFeatures(
+ index,
+ raw,
+ raw['opengraph'],
+ raw['url'],
+ raw['title'],
+ raw['numElements'],
+ raw['numAnchors'],
+ raw['numForms'],
+ raw['numPPRE'],
+ raw['visibleElements'],
+ raw['visiblePPRE'],
+ raw['innerText'],
+ raw['textContent'],
+ raw['innerHTML'],
+ raw['numTextInput'],
+ raw['numPasswordInput']
+ )
+
+def _CalcDerivedFeatures(index, raw, opengraph, url, title, numElements, numAnchors, numForms, numPPRE, visibleElements, visiblePPRE,
+ innerText, textContent, innerHTML, numText, numPassword):
path = urlparse.urlparse(url).path
path = path.encode('utf-8')
@@ -35,7 +55,9 @@ def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
innerTextWords = WordCount(innerText)
textContentWords = WordCount(textContent)
innerHTMLWords = WordCount(innerHTML)
- return [
+ features = [
+ 'id', index,
+ 'sin', math.sin(index),
'openGraph', opengraph,
'forum', 'forum' in path,
@@ -53,16 +75,39 @@ def CalcDerivedFeatures(index, opengraph, url, title, numElements, numAnchors, n
'pathNumbers', CountMatches(path, r'\d+'),
'lastSegmentLength', len(GetLastSegment(path)),
+ 'visibleRatio', float(visibleElements) / max(1, numElements),
+ 'visiblePPRERatio', float(visiblePPRE) / max(1, numPPRE),
+ 'PPRERatio', float(numPPRE) / max(1, numElements),
+ 'anchorPPRERatio', float(numAnchors) / max(1, numPPRE),
+
+ 'innerTextLength', len(innerText),
+ 'textContentLength', len(textContent),
+ 'innerHtmlLength', len(innerHTML),
+ 'innerTextLengthRatio', float(len(innerText)) / max(1, len(innerHTML)),
+ 'textContentLengthRatio', float(len(textContent)) / max(1, len(innerHTML)),
+ 'innerTexttextContentLengthRatio',float(len(innerText)) / max(1, len(textContent)),
+
+ 'innerTextWordCount', innerTextWords,
+ 'textContentWordCount', textContentWords,
+ 'innerhtmlWordCount', innerHTMLWords,
+ 'innerTextWordCountRatio', float(innerTextWords) / max(1, innerHTMLWords),
+ 'textContentWordCountRatio', float(textContentWords) / max(1, innerHTMLWords),
+ 'innerTexttextContentWordCountRatio', float(innerTextWords) / max(1, textContentWords),
+
+ 'textCount', numText,
+ 'passwordCount', numPassword,
'formCount', numForms,
'anchorCount', numAnchors,
'elementCount', numElements,
'anchorRatio', float(numAnchors) / max(1, numElements),
-
- 'mozScore', min(mozScores[3], 6 * math.sqrt(1000-140)),
- 'mozScoreAllSqrt', min(mozScores[4], 6 * math.sqrt(1000)),
- 'mozScoreAllLinear', min(mozScores[5], 6000),
]
+ for k in sorted(raw):
+ if 'mozScore' in k or 'num' in k:
+ features += [k, raw[k]]
+
+ return features
+
def main(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--out', required=True)
@@ -79,25 +124,8 @@ def main(argv):
for entry in core:
features = entry['features']
print 'processing %d' % (entry['index'])
- entry['features'] = CalcDerivedFeatures(
- entry['index'],
- features['opengraph'],
- features['url'],
- features['title'],
- features['numElements'],
- features['numAnchors'],
- features['numForms'],
- features['numPPRE'],
- features['visibleElements'],
- features['visiblePPRE'],
- features['innerText'],
- features['textContent'],
- features['innerHTML'],
- features['numTextInput'],
- features['numPasswordInput'],
- [features['mozScore'], features['mozScoreAllSqrt'], features['mozScoreAllLinear'],
- features['mozScoreFast'], features['mozScoreFastAllSqrt'], features['mozScoreFastAllLinear']]
- )
+
+ entry['features'] = CalcDerivedFeatures(entry['index'], features)
with open(options.out, 'w') as outfile:
json.dump(core, outfile, indent=1)
« no previous file with comments | « heuristics/distillable/README.md ('k') | heuristics/distillable/check_derived_features.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698