Index: heuristics/distillable/extract_features.js |
diff --git a/heuristics/distillable/extract_features.js b/heuristics/distillable/extract_features.js |
new file mode 100644 |
index 0000000000000000000000000000000000000000..5392aa052fa65ee65d337f5d3cfe8af03c343183 |
--- /dev/null |
+++ b/heuristics/distillable/extract_features.js |
@@ -0,0 +1,98 @@ |
+return (function() { |
+ function hasOGArticle() { |
+ var elems = document.head.querySelectorAll( |
+ 'meta[property="og:type"],meta[name="og:type"]'); |
+ for (var i in elems) { |
+ if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') { |
+ return true; |
+ } |
+ } |
+ return false; |
+ } |
+ |
+ function isVisible(e) { |
+ var bounds = e.getBoundingClientRect() |
+ var style = window.getComputedStyle(e); |
+ return !( |
+ bounds.height == 0 || bounds.width == 0 || |
+ style.display == "none" || |
+ style.visibility == "hidden" || |
+ style.opacity == 0 |
+ ) |
+ } |
+ |
+ function countVisible(nodes) { |
+ var count = 0; |
+ for (var i = 0; i < nodes.length; i++) { |
+ var node = nodes[i]; |
+ if (!isVisible(node)) { |
+ continue; |
+ } |
+ count++; |
+ } |
+ return count; |
+ } |
+ |
+ var unlikelyCandidates = /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i; |
+ var okMaybeItsACandidate = /and|article|body|column|main|shadow/i; |
+ |
+ function mozScore() { |
+ return _mozScore(true, 0.5, 140, true, 1e100); |
+ } |
+ |
+ function _mozScore(trim, power, cut, excludeLi, saturate) { |
+ var score = 0; |
+ |
+ var nodes = document.querySelectorAll('p,pre') |
+ for (var i = 0; i < nodes.length; i++) { |
+ var node = nodes[i]; |
+ if (!isVisible(node)) { |
+ continue; |
+ } |
+ var matchString = node.className + " " + node.id; |
+ if (unlikelyCandidates.test(matchString) && |
+ !okMaybeItsACandidate.test(matchString)) { |
+ continue; |
+ } |
+ |
+ if (excludeLi && node.matches && node.matches("li p")) { |
+ continue; |
+ } |
+ |
+ var textContent = node.textContent; |
+ if (trim) textContent = textContent.trim(); |
+ var textContentLength = textContent.length; |
+ textContentLength = Math.min(saturate, textContentLength) |
+ if (textContentLength < cut) { |
+ continue; |
+ } |
+ |
+ score += Math.pow(textContentLength - cut, power); |
+ } |
+ return score; |
+ } |
+ |
+ var body = document.body; |
+ return { |
+ 'opengraph': hasOGArticle(), |
+ 'url': document.location.href, |
+ 'title': document.title, |
+ 'numElements': body.querySelectorAll('*').length, |
+ 'numAnchors': body.querySelectorAll('a').length, |
+ 'numForms': body.querySelectorAll('form').length, |
+ 'numTextInput': body.querySelectorAll('input[type="text"]').length, |
+ 'numPasswordInput': body.querySelectorAll('input[type="password"]').length, |
+ 'numPPRE': body.querySelectorAll('p,pre').length, |
+ 'innerText': body.innerText, |
+ 'textContent': body.textContent, |
+ 'innerHTML': body.innerHTML, |
+ 'mozScore': _mozScore(true, 0.5, 140, true, 1e100), |
+ 'mozScoreAllSqrt': _mozScore(true, 0.5, 0, true, 1e100), |
+ 'mozScoreAllLinear': _mozScore(true, 1, 0, true, 1e100), |
+ 'mozScoreFast': _mozScore(false, 0.5, 140, true, 1000), |
+ 'mozScoreFastAllSqrt': _mozScore(false, 0.5, 0, true, 1000), |
+ 'mozScoreFastAllLinear': _mozScore(false, 1, 0, true, 1000), |
+ 'visibleElements': countVisible(body.querySelectorAll('*')), |
+ 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')), |
+ } |
+})() |