Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: update model, functionally correct locally Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/css/CSSComputedStyleDeclaration.h"
11 #include "core/dom/ElementTraversal.h"
12 #include "core/dom/NodeComputedStyle.h"
13 #include "core/editing/iterators/TextIterator.h"
14 #include "core/html/HTMLHeadElement.h"
15 #include "core/html/HTMLInputElement.h"
16 #include "core/html/HTMLMetaElement.h"
17 #include "public/platform/WebDistillability.h"
18 #include "wtf/text/StringBuilder.h"
19 #include "wtf/text/StringImpl.h"
20
21 namespace blink {
22
23 using namespace HTMLNames;
24
25 namespace {
26
27 // Saturate the length of a paragraph to save time.
28 const int kTextContentLengthSaturation = 1000;
29
30 unsigned textContentLengthSaturated(Element& root)
31 {
32 unsigned length = 0;
33 // This skips shadow dom intentionally.
34 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
35 if (!node.isTextNode()) {
36 continue;
37 }
38 length += toText(node).length();
39 if (length > kTextContentLengthSaturation) {
40 return kTextContentLengthSaturation;
41 }
42 }
43 return length;
44 }
45
46 bool isVisible(Element& element)
47 {
48 const ComputedStyle* style = element.computedStyle();
49 if (!style) {
50 return false;
51 }
52 ASSERT(style->display() != NONE);
53 return (
54 style->visibility() != HIDDEN
55 && style->opacity() != 0
56 );
57 }
58
59 bool matchAttributes(Element& element, const Vector<String>& words)
60 {
61 const String& classes = element.getClassAttribute();
62 const String& id = element.getIdAttribute();
63 for (const String& word : words) {
64 if (classes.findIgnoringCase(word) != WTF::kNotFound
65 || id.findIgnoringCase(word) != WTF::kNotFound) {
66 return true;
67 }
68 }
69 return false;
70 }
71
72 // underListItem denotes that at least one of the ancesters is <li> element.
73 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false)
74 {
75 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
76 if (unlikelyCandidates.isEmpty()) {
77 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) {
mdjones 2015/11/03 02:54:46 Do webkit files not have the 80 char line limit?
esprehn 2015/11/03 04:12:10 They do not, you should wrap where you think it's
wychen 2015/11/03 07:56:39 Fixed here: https://codereview.chromium.org/141903
78 unlikelyCandidates.append(word);
79 }
80 }
81 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
82 if (highlyLikelyCandidates.isEmpty()) {
83 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
84 highlyLikelyCandidates.append(word);
85 }
86 }
87 // Filter out short P elements. The threshold is set to around 2 English sen tences.
88 const unsigned kParagraphLengthThreshold = 140;
89
90 // Saturate the scores to save time. The max is the score of 6 long paragrap hs.
91 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold);
92 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion);
93 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ;
94
95 for (Node& node : NodeTraversal::childrenOf(root)) {
96 if (node.isTextNode()) {
97 features.textContentLength += toText(node).length();
98 continue;
99 }
100 if (!node.isElementNode()) {
101 continue;
102 }
103
104 features.elementCount++;
105 Element& element = toElement(node);
106 if (element.hasTagName(aTag)) {
107 features.anchorCount++;
108 } else if (element.hasTagName(formTag)) {
109 features.formCount++;
110 } else if (element.hasTagName(inputTag)) {
111 const HTMLInputElement& input = toHTMLInputElement(element);
112 if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
113 features.textInputCount++;
114 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) {
115 features.passwordInputCount++;
116 }
117 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
118 if (element.hasTagName(pTag)) {
119 features.pCount++;
120 } else {
121 features.preCount++;
122 }
123 if (!underListItem
124 && (features.mozScore < kMozScoreSaturation
125 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
126 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation )
127 && isVisible(element)
128 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates))
129 ) {
130 unsigned length = textContentLengthSaturated(element);
131 if (length >= kParagraphLengthThreshold) {
132 features.mozScore += sqrt(length - kParagraphLengthThreshold );
133 features.mozScore = std::min(features.mozScore, kMozScoreSat uration);
134 }
135 features.mozScoreAllSqrt += sqrt(length);
136 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation);
137
138 features.mozScoreAllLinear += length;
139 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation);
140 }
141 } else if (element.hasTagName(liTag)) {
142 underListItem = true;
143 }
144 collectFeatures(element, features, underListItem);
145 }
146 }
147
148 bool hasOpenGraphArticle(const Element& head)
149 {
150 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
151 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
152 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
153 if (!isHTMLMetaElement(*child))
154 continue;
155 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
156
157 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
158 if (equalIgnoringCase(meta.content(), "article")) {
159 return true;
160 }
161 }
162 }
163 return false;
164 }
165
166 } // namespace
167
168 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
169 {
170 WebDistillabilityFeatures features = WebDistillabilityFeatures();
171
172 if (!document.frame() || !document.frame()->isMainFrame())
173 return features;
174
175 ASSERT(document.hasFinishedParsing());
176
177 if (!document.body() || !document.head())
178 return features;
179
180 // Traverse the DOM tree and collect statistics.
181 collectFeatures(*document.body(), features);
182 features.openGraph = hasOpenGraphArticle(*document.head());
183
184 return features;
185 }
186
187 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698