Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: heuristics/distillable/extract_features.js

Issue 1620043002: Add scripts for distillability modelling (Closed) Base URL: git@github.com:chromium/dom-distiller.git@master
Patch Set: set upstream patchset, identical to patch set 2 Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLD | NEW
(Empty)
1 return (function() {
/**
 * Reports whether the page declares itself an Open Graph article.
 *
 * Searches <head> for <meta> elements whose `property` or `name` attribute is
 * "og:type" and checks whether any of them has a `content` value equal to
 * "article", compared case-insensitively.
 *
 * @return {boolean} true if at least one matching tag says "article";
 *     false otherwise, including when the document has no <head>.
 */
function hasOGArticle() {
  // Without a <head> there is nothing to query.
  if (!document.head) {
    return false;
  }
  var elems = document.head.querySelectorAll(
      'meta[property="og:type"],meta[name="og:type"]');
  // Index loop instead of for...in: for...in over a NodeList also visits
  // enumerable non-element keys such as "length" and "item".
  for (var i = 0; i < elems.length; i++) {
    var content = elems[i].content;
    if (content && content.toUpperCase() === 'ARTICLE') {
      return true;
    }
  }
  return false;
}
12
/**
 * Decides whether an element counts as visible.
 *
 * An element is treated as invisible when its bounding box has zero width or
 * height, or when its computed style has display "none", visibility "hidden",
 * or an opacity that is numerically zero.
 *
 * @param {Element} e Element to inspect.
 * @return {boolean} true if none of the invisibility conditions hold.
 */
function isVisible(e) {
  var bounds = e.getBoundingClientRect();
  var style = window.getComputedStyle(e);
  return !(
      bounds.height === 0 || bounds.width === 0 ||
      style.display === 'none' ||
      style.visibility === 'hidden' ||
      // Computed opacity is presumably reported as a string ("0", "0.5");
      // convert it explicitly rather than leaning on == coercion.
      Number(style.opacity) === 0
  );
}
23
/**
 * Tallies how many entries of an indexable collection isVisible() accepts.
 *
 * @param {ArrayLike<Element>} nodes Collection with a `length` and numeric
 *     indices.
 * @return {number} Number of entries for which isVisible() returned true.
 */
function countVisible(nodes) {
  var total = 0;
  for (var idx = 0; idx < nodes.length; idx += 1) {
    if (isVisible(nodes[idx])) {
      total += 1;
    }
  }
  return total;
}
35
// Class/id fragments that mark a node as unlikely article content. A node
// whose "className id" string matches this pattern is skipped by _mozScore
// unless it also matches okMaybeItsACandidate.
var unlikelyCandidates = /banner|combx|comment|community|disqus|extra|foot|header|menu|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i;
// Class/id fragments that rescue a node from the unlikelyCandidates filter.
var okMaybeItsACandidate = /and|article|body|column|main|shadow/i;
38
/**
 * Runs _mozScore with a fixed default parameter set: trimmed text, a
 * square-root exponent, a 140-character cut, "li p" nodes excluded, and a
 * length cap of 1e100.
 *
 * @return {number} Whatever _mozScore returns for those arguments.
 */
function mozScore() {
  var trimText = true;
  var exponent = 0.5;
  var minLength = 140;
  var skipListParagraphs = true;
  var lengthCap = 1e100;
  return _mozScore(
      trimText, exponent, minLength, skipListParagraphs, lengthCap);
}
42
/**
 * Scores the document by the amount of text in its visible <p>/<pre> nodes.
 *
 * Every <p> or <pre> element that passes the filters below contributes
 * (length - cut) ^ power to the total, where length is the size of the
 * node's textContent after optional trimming, capped at `saturate`.
 *
 * @param {boolean} trim Whether to trim textContent before measuring it.
 * @param {number} power Exponent applied to each node's excess length.
 * @param {number} cut Minimum length; shorter nodes are skipped and this
 *     value is subtracted from the length of the remaining ones.
 * @param {boolean} excludeLi Whether to skip nodes matching "li p".
 * @param {number} saturate Upper bound applied to each node's length.
 * @return {number} Sum of the per-node contributions; 0 if no node qualifies.
 */
function _mozScore(trim, power, cut, excludeLi, saturate) {
  var score = 0;

  var nodes = document.querySelectorAll('p,pre');
  for (var i = 0; i < nodes.length; i++) {
    var node = nodes[i];
    if (!isVisible(node)) {
      continue;
    }

    // Drop nodes whose class/id looks like page chrome, unless the same
    // string also carries a fragment that marks it as possible content.
    var matchString = node.className + ' ' + node.id;
    if (unlikelyCandidates.test(matchString) &&
        !okMaybeItsACandidate.test(matchString)) {
      continue;
    }

    // `node.matches` is probed first so nodes lacking it are kept.
    if (excludeLi && node.matches && node.matches('li p')) {
      continue;
    }

    var textContent = node.textContent;
    if (trim) {
      textContent = textContent.trim();
    }
    // Cap the length before comparing against the cut.
    var textContentLength = Math.min(saturate, textContent.length);
    if (textContentLength < cut) {
      continue;
    }

    score += Math.pow(textContentLength - cut, power);
  }
  return score;
}
74
75 var body = document.body;
76 return {
77 'opengraph': hasOGArticle(),
78 'url': document.location.href,
79 'title': document.title,
80 'numElements': body.querySelectorAll('*').length,
81 'numAnchors': body.querySelectorAll('a').length,
82 'numForms': body.querySelectorAll('form').length,
83 'numTextInput': body.querySelectorAll('input[type="text"]').length,
84 'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
85 'numPPRE': body.querySelectorAll('p,pre').length,
86 'innerText': body.innerText,
87 'textContent': body.textContent,
88 'innerHTML': body.innerHTML,
89 'mozScore': _mozScore(true, 0.5, 140, true, 1e100),
90 'mozScoreAllSqrt': _mozScore(true, 0.5, 0, true, 1e100),
91 'mozScoreAllLinear': _mozScore(true, 1, 0, true, 1e100),
92 'mozScoreFast': _mozScore(false, 0.5, 140, true, 1000),
93 'mozScoreFastAllSqrt': _mozScore(false, 0.5, 0, true, 1000),
94 'mozScoreFastAllLinear': _mozScore(false, 1, 0, true, 1000),
95 'visibleElements': countVisible(body.querySelectorAll('*')),
96 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),
97 }
98 })()
OLD | NEW
« no previous file with comments | « heuristics/distillable/calculate_derived_features.py ('k') | heuristics/distillable/get_screenshots.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698