Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(369)

Unified Diff: heuristics/distillable/extract_features.js

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « heuristics/distillable/calculate_derived_features.py ('k') | heuristics/distillable/get_screenshots.py » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: heuristics/distillable/extract_features.js
diff --git a/heuristics/distillable/extract_features.js b/heuristics/distillable/extract_features.js
new file mode 100644
index 0000000000000000000000000000000000000000..5392aa052fa65ee65d337f5d3cfe8af03c343183
--- /dev/null
+++ b/heuristics/distillable/extract_features.js
@@ -0,0 +1,98 @@
+return (function() {
+ function hasOGArticle() {
+ var elems = document.head.querySelectorAll(
+ 'meta[property="og:type"],meta[name="og:type"]');
+ for (var i in elems) {
+ if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ function isVisible(e) {
+ var bounds = e.getBoundingClientRect()
+ var style = window.getComputedStyle(e);
+ return !(
+ bounds.height == 0 || bounds.width == 0 ||
+ style.display == "none" ||
+ style.visibility == "hidden" ||
+ style.opacity == 0
+ )
+ }
+
+ function countVisible(nodes) {
+ var count = 0;
+ for (var i = 0; i < nodes.length; i++) {
+ var node = nodes[i];
+ if (!isVisible(node)) {
+ continue;
+ }
+ count++;
+ }
+ return count;
+ }
+
+ // Tested in _mozScore() against the string "<className> <id>" of each
+ // node: a node matching this pattern is skipped unless the same string
+ // also matches okMaybeItsACandidate. Case-insensitive.
+ // NOTE(review): the names suggest these mirror Mozilla's Readability
+ // heuristics -- confirm the origin before editing the word lists.
+ var unlikelyCandidates = /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i;
+ // Overrides unlikelyCandidates: a class/id string that matches this
+ // pattern keeps the node as a scoring candidate. Case-insensitive.
+ var okMaybeItsACandidate = /and|article|body|column|main|shadow/i;
+
+ function mozScore() {
+ return _mozScore(true, 0.5, 140, true, 1e100);
+ }
+
+ function _mozScore(trim, power, cut, excludeLi, saturate) {
+ var score = 0;
+
+ var nodes = document.querySelectorAll('p,pre')
+ for (var i = 0; i < nodes.length; i++) {
+ var node = nodes[i];
+ if (!isVisible(node)) {
+ continue;
+ }
+ var matchString = node.className + " " + node.id;
+ if (unlikelyCandidates.test(matchString) &&
+ !okMaybeItsACandidate.test(matchString)) {
+ continue;
+ }
+
+ if (excludeLi && node.matches && node.matches("li p")) {
+ continue;
+ }
+
+ var textContent = node.textContent;
+ if (trim) textContent = textContent.trim();
+ var textContentLength = textContent.length;
+ textContentLength = Math.min(saturate, textContentLength)
+ if (textContentLength < cut) {
+ continue;
+ }
+
+ score += Math.pow(textContentLength - cut, power);
+ }
+ return score;
+ }
+
+ var body = document.body;
+ return {
+ 'opengraph': hasOGArticle(),
+ 'url': document.location.href,
+ 'title': document.title,
+ 'numElements': body.querySelectorAll('*').length,
+ 'numAnchors': body.querySelectorAll('a').length,
+ 'numForms': body.querySelectorAll('form').length,
+ 'numTextInput': body.querySelectorAll('input[type="text"]').length,
+ 'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
+ 'numPPRE': body.querySelectorAll('p,pre').length,
+ 'innerText': body.innerText,
+ 'textContent': body.textContent,
+ 'innerHTML': body.innerHTML,
+ 'mozScore': _mozScore(true, 0.5, 140, true, 1e100),
+ 'mozScoreAllSqrt': _mozScore(true, 0.5, 0, true, 1e100),
+ 'mozScoreAllLinear': _mozScore(true, 1, 0, true, 1e100),
+ 'mozScoreFast': _mozScore(false, 0.5, 140, true, 1000),
+ 'mozScoreFastAllSqrt': _mozScore(false, 0.5, 0, true, 1000),
+ 'mozScoreFastAllLinear': _mozScore(false, 1, 0, true, 1000),
+ 'visibleElements': countVisible(body.querySelectorAll('*')),
+ 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),
+ }
+})()
« no previous file with comments | « heuristics/distillable/calculate_derived_features.py ('k') | heuristics/distillable/get_screenshots.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698