Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(163)

Side by Side Diff: heuristics/distillable/extract_features.js

Issue 1808503002: Update distillability modeling scripts to predict long articles (Closed) Base URL: git@github.com:chromium/dom-distiller.git@ml-visible
Patch Set: update docs Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 return (function() { 1 return (function() {
2 function hasOGArticle() { 2 function hasOGArticle() {
3 var elems = document.head.querySelectorAll( 3 var elems = document.head.querySelectorAll(
4 'meta[property="og:type"],meta[name="og:type"]'); 4 'meta[property="og:type"],meta[name="og:type"]');
5 for (var i in elems) { 5 for (var i in elems) {
6 if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') { 6 if (elems[i].content && elems[i].content.toUpperCase() == 'ARTICLE') {
7 return true; 7 return true;
8 } 8 }
9 } 9 }
10 return false; 10 return false;
(...skipping 55 matching lines...) Expand 10 before | Expand all | Expand 10 after
66 if (textContentLength < cut) { 66 if (textContentLength < cut) {
67 continue; 67 continue;
68 } 68 }
69 69
70 score += Math.pow(textContentLength - cut, power); 70 score += Math.pow(textContentLength - cut, power);
71 } 71 }
72 return score; 72 return score;
73 } 73 }
74 74
75 var body = document.body; 75 var body = document.body;
76 return { 76 var features = {
77 'opengraph': hasOGArticle(), 77 'opengraph': hasOGArticle(),
78 'url': document.location.href, 78 'url': document.location.href,
79 'title': document.title, 79 'title': document.title,
80 'numElements': body.querySelectorAll('*').length, 80 'numElements': body.querySelectorAll('*').length,
81 'numAnchors': body.querySelectorAll('a').length, 81 'numAnchors': body.querySelectorAll('a').length,
82 'numForms': body.querySelectorAll('form').length, 82 'numForms': body.querySelectorAll('form').length,
83 'numTextInput': body.querySelectorAll('input[type="text"]').length, 83 'numTextInput': body.querySelectorAll('input[type="text"]').length,
84 'numPasswordInput': body.querySelectorAll('input[type="password"]').length, 84 'numPasswordInput': body.querySelectorAll('input[type="password"]').length,
85 'numPPRE': body.querySelectorAll('p,pre').length, 85 'numPPRE': body.querySelectorAll('p,pre').length,
86 'innerText': body.innerText, 86 'innerText': body.innerText,
87 'textContent': body.textContent, 87 'textContent': body.textContent,
88 'innerHTML': body.innerHTML, 88 'innerHTML': body.innerHTML,
89 'mozScore': _mozScore(true, 0.5, 140, true, 1e100), 89 'mozScore': Math.min(6 * Math.sqrt(1000 - 140), _mozScore(false, 0.5, 140, true, 1000)),
90 'mozScoreAllSqrt': _mozScore(true, 0.5, 0, true, 1e100), 90 'mozScoreAllSqrt': Math.min(6 * Math.sqrt(1000), _mozScore(false, 0.5, 0, true, 1000)),
91 'mozScoreAllLinear': _mozScore(true, 1, 0, true, 1e100), 91 'mozScoreAllLinear': Math.min(6 * 1000, _mozScore(false, 1, 0, true, 1000)),
92 'mozScoreFast': _mozScore(false, 0.5, 140, true, 1000),
93 'mozScoreFastAllSqrt': _mozScore(false, 0.5, 0, true, 1000),
94 'mozScoreFastAllLinear': _mozScore(false, 1, 0, true, 1000),
95 'visibleElements': countVisible(body.querySelectorAll('*')), 92 'visibleElements': countVisible(body.querySelectorAll('*')),
96 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')), 93 'visiblePPRE': countVisible(body.querySelectorAll('p,pre')),
97 } 94 }
95 return features;
98 })() 96 })()
OLDNEW
« no previous file with comments | « heuristics/distillable/check_distilled_mhtml.py ('k') | heuristics/distillable/get_screenshots.py » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698