Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(65)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, remove innerText Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/css/CSSComputedStyleDeclaration.h"
11 #include "core/dom/ElementTraversal.h"
12 #include "core/editing/iterators/TextIterator.h"
13 #include "core/html/HTMLHeadElement.h"
14 #include "core/html/HTMLInputElement.h"
15 #include "core/html/HTMLMetaElement.h"
16
17 // TODO(wychen): The following lines will be gone before landing.
18 #include "core/inspector/ConsoleMessage.h"
19
20 #include "public/platform/WebDistillability.h"
21 #include "wtf/text/StringBuilder.h"
22 #include "wtf/text/StringImpl.h"
23
24 namespace blink {
25
26 using namespace HTMLNames;
27
28 namespace {
29
// Saturate the length of a paragraph to save time. Declared unsigned because
// the value is only ever compared against, and returned as, an unsigned text
// length; a signed constant would force mixed-sign comparisons.
const unsigned kTextContentLengthSaturation = 1000;
32
33 unsigned trimmedTextContentLength(Element& root)
34 {
35 int firstNonWhite = -1, lastNonWhite = -1;
36 unsigned position = 0;
37 unsigned length = 0;
38 // TODO(wychen): scan backwards for lastNonWhite should be much faster in pr actice.
39 // This skips shadow dom intentionally.
40 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
41 if (!node.isTextNode()) {
42 continue;
43 }
44 const String& text = toText(node).data();
45 for (unsigned i = 0; i < text.length(); i++) {
46 if (!isSpaceOrNewline(text[i])) {
47 if (firstNonWhite < 0) {
48 firstNonWhite = lastNonWhite = i + position;
49 } else {
50 lastNonWhite = i + position;
51 }
52 }
53 }
54 if (firstNonWhite >= 0) {
55 length = lastNonWhite - firstNonWhite + 1;
56 if (length > kTextContentLengthSaturation) {
wychen 2015/10/28 22:00:51 With saturations, the total cost of trimmedTextCon
57 return kTextContentLengthSaturation;
58 }
59 }
60 position += text.length();
61 }
62 return length;
63 }
64
65 bool isVisible(Element& element)
66 {
67 const ComputedStyle* style = element.computedStyle();
68 if (!style) {
69 return false;
70 }
71 ASSERT(style->display() != NONE);
esprehn 2015/11/03 07:45:10 this isn't true, you can still have a style and be
wychen 2015/11/03 08:59:48 Done.
72 return (
73 style->visibility() != HIDDEN
74 && style->opacity() != 0
75 );
76 }
77
78 bool matchAttributes(Element& element, const Vector<String>& words)
79 {
80 const String& classes = element.getClassAttribute();
81 const String& id = element.getIdAttribute();
82 for (const String& word : words) {
83 if (classes.findIgnoringCase(word) != WTF::kNotFound
84 || id.findIgnoringCase(word) != WTF::kNotFound) {
85 return true;
86 }
87 }
88 return false;
89 }
90
91 // underListItem denotes that at least one of the ancesters is <li> element.
92 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false)
93 {
94 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
95 if (unlikelyCandidates.isEmpty()) {
96 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) {
97 unlikelyCandidates.append(word);
98 }
99 }
100 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
101 if (highlyLikelyCandidates.isEmpty()) {
102 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
103 highlyLikelyCandidates.append(word);
104 }
105 }
106 // Filter out short P elements. The threshold is set to around 2 English sen tences.
107 const unsigned kParagraphLengthThreshold = 140;
108
109 // Saturate the scores to save time. The max is the score of 6 long paragrap hs.
110 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold);
111 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion);
112 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ;
113
114 for (Node& node : NodeTraversal::childrenOf(root)) {
115 if (node.isTextNode()) {
116 features.textContentLength += toText(node).length();
117 continue;
118 }
119 if (!node.isElementNode()) {
120 continue;
121 }
122
123 features.elementCount++;
124 Element& element = toElement(node);
125 if (element.hasTagName(aTag)) {
126 features.anchorCount++;
127 } else if (element.hasTagName(formTag)) {
128 features.formCount++;
129 } else if (element.hasTagName(inputTag)) {
130 const HTMLInputElement& input = toHTMLInputElement(element);
131 if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
132 features.textInputCount++;
133 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) {
134 features.passwordInputCount++;
135 }
136 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
137 if (element.hasTagName(pTag)) {
138 features.pCount++;
139 } else {
140 features.preCount++;
141 }
142 if (!underListItem
143 && (features.mozScore < kMozScoreSaturation
144 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
145 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation )
146 && isVisible(element)
147 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates))
148 ) {
149 unsigned length = trimmedTextContentLength(element);
150 if (length >= kParagraphLengthThreshold) {
151 features.mozScore += sqrt(length - kParagraphLengthThreshold );
152 features.mozScore = std::min(features.mozScore, kMozScoreSat uration);
153 }
154 features.mozScoreAllSqrt += sqrt(length);
155 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation);
156
157 features.mozScoreAllLinear += length;
158 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation);
159 }
160 } else if (element.hasTagName(liTag)) {
161 underListItem = true;
162 }
163 collectFeatures(element, features, underListItem);
164 }
165 }
166
167 bool hasOpenGraphArticle(const Element& head)
168 {
169 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
170 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
171 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
172 if (!isHTMLMetaElement(*child))
173 continue;
174 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
175
176 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
177 if (equalIgnoringCase(meta.content(), "article")) {
178 return true;
179 }
180 }
181 }
182 return false;
183 }
184
185 } // namespace
186
187 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
188 {
189 WebDistillabilityFeatures features = WebDistillabilityFeatures();
190
191 if (!document.frame() || !document.frame()->isMainFrame())
192 return features;
193
194 ASSERT(document.hasFinishedParsing());
195
196 if (!document.body() || !document.head())
197 return features;
198
199 // The following DISTILLER_NDEBUG sections would be gone when landing.
200 #ifndef DISTILLER_NDEBUG
201 double startTime = WTF::currentTime();
202 #endif
203
204 // First, traverse the DOM tree and collect statistics.
205 collectFeatures(*document.body(), features);
206
207 #ifndef DISTILLER_NDEBUG
208 double elapsedTime = WTF::currentTime() - startTime;
209 startTime = WTF::currentTime();
210 #endif
211
212 features.openGraph = hasOpenGraphArticle(*document.head());
213
214 #ifndef DISTILLER_NDEBUG
215 double ogElapsedTime = WTF::currentTime() - startTime;
216 #endif
217
218 #ifndef DISTILLER_NDEBUG
219 StringBuilder message;
220 message.append("openGraph: ");
221 message.appendNumber(features.openGraph);
222 message.append(", elementCount: ");
223 message.appendNumber(features.elementCount);
224 message.append(", anchorCount: ");
225 message.appendNumber(features.anchorCount);
226 message.append(", formCount: ");
227 message.appendNumber(features.formCount);
228 message.append(", textInputCount: ");
229 message.appendNumber(features.textInputCount);
230 message.append(", passwordInputCount: ");
231 message.appendNumber(features.passwordInputCount);
232 message.append(", pCount: ");
233 message.appendNumber(features.pCount);
234 message.append(", preCount: ");
235 message.appendNumber(features.preCount);
236 message.append(", textContentLength: ");
237 message.appendNumber(features.textContentLength);
238 message.append(", mozScore: ");
239 message.appendNumber(features.mozScore);
240 message.append(", mozScoreAllSqrt: ");
241 message.appendNumber(features.mozScoreAllSqrt);
242 message.append(", mozScoreAllLinear: ");
243 message.appendNumber(features.mozScoreAllLinear);
244 message.append("\nElapsed time (ms): ");
245 message.appendNumber(elapsedTime * 1000);
246 message.append(", openGraph time (ms): ");
247 message.appendNumber(ogElapsedTime * 1000);
248
249 RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(C onsoleAPIMessageSource, DebugMessageLevel, message.toString());
250 document.addConsoleMessage(consoleMessage);
251 #endif
252
253 return features;
254 }
255
256 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698