Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(466)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: merge master Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/dom/ElementTraversal.h"
11 #include "core/dom/NodeComputedStyle.h"
12 #include "core/dom/Text.h"
13 #include "core/frame/FrameHost.h"
14 #include "core/html/HTMLHeadElement.h"
15 #include "core/html/HTMLInputElement.h"
16 #include "core/html/HTMLMetaElement.h"
17 #include "public/platform/Platform.h"
18 #include "public/platform/WebDistillability.h"
19
20 namespace blink {
21
22 using namespace HTMLNames;
23
24 namespace {
25
26 // Saturate the length of a paragraph to save time: textContentLengthSaturated() stops counting once this is exceeded.
27 const int kTextContentLengthSaturation = 1000;
28
29 // Filter out short P elements. The threshold is set to around 2 English sentences.
30 const unsigned kParagraphLengthThreshold = 140;
31
32 // Saturate the scores to save time. The max is the score of 6 long paragraphs.
33 const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold)
34 const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation)
35 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
36
// Returns the summed length of the text nodes found among |root| and its
// descendants, capped at kTextContentLengthSaturation. The walk returns early
// as soon as the running total exceeds the cap.
37 unsigned textContentLengthSaturated(Element& root)
38 {
39 unsigned length = 0;
40 // This skips shadow DOM intentionally, to match the JavaScript implementation.
41 // We would like to use the same statistics extracted by the JavaScript implementation
42 // on iOS, and JavaScript cannot peek deeply into shadow DOM except on modern Chrome
43 // versions.
44 // Given shadow DOM rarely appears in <P> elements in long-form articles, the overall
45 // accuracy should not be largely affected.
46 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
47 if (!node.isTextNode()) {
48 continue;
49 }
50 length += toText(node).length();
51 if (length > kTextContentLengthSaturation) {
52 return kTextContentLengthSaturation;
53 }
54 }
55 return length;
56 }
57
// Returns true when |element| has a computed style whose display is not NONE,
// whose visibility is not HIDDEN and whose opacity is not 0. An element without
// a computed style is reported as not visible. Only the style of |element|
// itself is consulted here.
58 bool isVisible(const Element& element)
59 {
60 const ComputedStyle* style = element.computedStyle();
esprehn 2015/11/05 01:21:59 you need ASSERT(!element.document().needsLayoutTr
wychen 2015/11/05 01:47:40 Skipped.
61 if (!style)
62 return false;
63 return (
64 style->display() != NONE
65 && style->visibility() != HIDDEN
66 && style->opacity() != 0
67 );
68 }
69
// Returns true if at least one entry of |words| is found, ignoring case, in the
// class attribute or in the id attribute of |element|; false otherwise
// (including when |words| is empty).
70 bool matchAttributes(const Element& element, const Vector<String>& words)
71 {
72 const String& classes = element.getClassAttribute();
73 const String& id = element.getIdAttribute();
74 for (const String& word : words) {
75 if (classes.findIgnoringCase(word) != WTF::kNotFound
76 || id.findIgnoringCase(word) != WTF::kNotFound) {
77 return true;
78 }
79 }
80 return false;
81 }
82
// Decides whether |element| should contribute to the scores accumulated in
// |features|. Returns false when any of the following holds:
//  - |underListItem| is true;
//  - |element| is not visible (see isVisible());
//  - all three scores (mozScore, mozScoreAllSqrt, mozScoreAllLinear) have
//    already reached their saturation values, so more work cannot change them;
//  - the class/id of |element| matches an "unlikely" word and does not match
//    any "highly likely" word.
// Otherwise returns true. Both word lists are function-local statics that are
// filled on the first call (detected through isEmpty()).
83 bool isGoodForScoring(bool underListItem, const WebDistillabilityFeatures& features, const Element& element)
84 {
85 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
86 if (unlikelyCandidates.isEmpty()) {
87 auto words = {
88 "banner",
89 "combx",
90 "comment",
91 "community",
92 "disqus",
93 "extra",
94 "foot",
95 "header",
96 "menu",
97 "related",
98 "remark",
99 "rss",
100 "share",
101 "shoutbox",
102 "sidebar",
103 "skyscraper",
104 "sponsor",
105 "ad-break",
106 "agegate",
107 "pagination",
108 "pager",
109 "popup"
110 };
111 for (auto word : words) {
112 unlikelyCandidates.append(word);
113 }
114 }
115 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
116 if (highlyLikelyCandidates.isEmpty()) {
117 auto words = {
118 "and",
119 "article",
120 "body",
121 "column",
122 "main",
123 "shadow"
124 };
125 for (auto word : words) {
126 highlyLikelyCandidates.append(word);
127 }
128 }
129
130 if (underListItem)
131 return false;
132 if (!isVisible(element))
133 return false;
// The three checks are ANDed on purpose: scoring only stops once every score is saturated.
134 if (features.mozScore >= kMozScoreSaturation
135 && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation
136 && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation)
137 return false;
138 if (matchAttributes(element, unlikelyCandidates) && !matchAttributes(element, highlyLikelyCandidates))
esprehn 2015/11/05 01:21:58 I'd wrap at the && like you did above
wychen 2015/11/05 01:47:40 Done.
139 return false;
140 return true;
141 }
142
// Recursively walks the element children of |root| and accumulates statistics
// into |features|:
//  - elementCount for every element;
//  - anchorCount / formCount for <a> / <form>;
//  - textInputCount / passwordInputCount for <input> of type text / password;
//  - pCount / preCount for <p> / <pre>, and, when isGoodForScoring() accepts
//    the element, the three scores, each clamped to its saturation constant:
//      mozScore          += sqrt(length - kParagraphLengthThreshold), only when
//                           length >= kParagraphLengthThreshold;
//      mozScoreAllSqrt   += sqrt(length);
//      mozScoreAllLinear += length.
// The <li> state is tracked per child (isListItem) so that it applies only to
// the subtree of that child and does not carry over to the siblings that follow
// it in this loop.
143 // underListItem denotes that at least one of the ancestors is <li> element.
144 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
145 {
146 for (Node& node : NodeTraversal::childrenOf(root)) {
147 if (!node.isElementNode()) {
148 continue;
149 }
150 bool isListItem = false;
151 features.elementCount++;
152 Element& element = toElement(node);
153 if (element.hasTagName(aTag)) {
154 features.anchorCount++;
155 } else if (element.hasTagName(formTag)) {
156 features.formCount++;
157 } else if (element.hasTagName(inputTag)) {
158 const HTMLInputElement& input = toHTMLInputElement(element);
159 if (input.type() == InputTypeNames::text) {
160 features.textInputCount++;
161 } else if (input.type() == InputTypeNames::password) {
162 features.passwordInputCount++;
163 }
164 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
165 if (element.hasTagName(pTag)) {
166 features.pCount++;
167 } else {
168 features.preCount++;
169 }
170 if (isGoodForScoring(underListItem, features, element)) {
esprehn 2015/11/05 01:21:58 I'd probably move the underListItem check out so t
wychen 2015/11/05 01:47:40 Done.
171 unsigned length = textContentLengthSaturated(element);
172 if (length >= kParagraphLengthThreshold) {
173 features.mozScore += sqrt(length - kParagraphLengthThreshold);
174 features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
175 }
176 features.mozScoreAllSqrt += sqrt(length);
177 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
178
179 features.mozScoreAllLinear += length;
180 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
181 }
182 } else if (element.hasTagName(liTag)) {
183 isListItem = true;
184 }
185 collectFeatures(element, features, underListItem || isListItem);
186 }
187 }
188
// Returns true if one of the direct element children of |head| is a <meta>
// element that declares "og:type" (either through its name or through its
// "property" attribute) and whose content equals "article", ignoring case.
// Only direct children are visited; the loop steps from sibling to sibling.
189 bool hasOpenGraphArticle(const Element& head)
190 {
191 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
192 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
193 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
194 if (!isHTMLMetaElement(*child))
195 continue;
196 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
197
198 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
199 if (equalIgnoringCase(meta.content(), "article")) {
200 return true;
201 }
202 }
203 }
204 return false;
205 }
206
// Reports whether |document| counts as mobile friendly: returns false when the
// document has no FrameHost, otherwise forwards the answer of
// visualViewport().shouldDisableDesktopWorkarounds() on that FrameHost.
207 bool isMobileFriendly(Document& document)
208 {
209 FrameHost* frameHost = document.frameHost();
210 if (!frameHost)
211 return false;
212 return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
esprehn 2015/11/05 01:21:59 in blink we'd usually write: if (FrameHost* frame
wychen 2015/11/05 01:47:40 Done. Is this style for performance? Like what LIK
esprehn 2015/11/05 01:54:17 Not for performance, it just makes the scope of yo
wychen 2015/11/05 02:00:00 I see. Thanks
213 }
214
215 } // namespace
216
// Computes the distillability features of |document|.
//  - Returns value-initialized features when the document has no frame, is not
//    in the main frame, or lacks a body or a head element.
//  - When isMobileFriendly() is true, only features.isMobileFriendly is set and
//    the DOM is not traversed.
//  - Otherwise collectFeatures() walks the body, features.openGraph is taken
//    from hasOpenGraphArticle() on the head, and the elapsed time multiplied by
//    1e6 is recorded in the "WebCore.DistillabilityUs" histogram (microseconds,
//    assuming monotonicallyIncreasingTime() returns seconds -- TODO confirm).
// The document is expected to have finished parsing (asserted below).
217 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
218 {
219 TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");
220
221 WebDistillabilityFeatures features = WebDistillabilityFeatures();
222
223 if (!document.frame() || !document.frame()->isMainFrame())
224 return features;
225
226 ASSERT(document.hasFinishedParsing());
227
228 HTMLElement* body = document.body();
229 HTMLElement* head = document.head();
230
231 if (!body || !head)
232 return features;
233
234 if (isMobileFriendly(document)) {
235 features.isMobileFriendly = true;
esprehn 2015/11/05 01:21:58 so if it's mobile friendly we don't need to collec
wychen 2015/11/05 01:47:40 Yes. We currently only trigger Reader Mode on non-
236 return features;
237 }
238
239 double startTime = monotonicallyIncreasingTime();
240
241 // Traverse the DOM tree and collect statistics.
esprehn 2015/11/05 01:21:58 you either need to call updateLayoutTreeIfNeeded()
wychen 2015/11/05 01:47:40 I'll skip the assertion above. It might be slightl
242 collectFeatures(*body, features);
243 features.openGraph = hasOpenGraphArticle(*head);
244
245 double elapsedTime = monotonicallyIncreasingTime() - startTime;
246 Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", static_cast<int>(1e6 * elapsedTime), 1, 1000000, 50);
247
248 return features;
249 }
250
251 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698