Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(103)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, add saturation Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/css/CSSComputedStyleDeclaration.h"
11 #include "core/dom/ElementTraversal.h"
12 #include "core/editing/iterators/TextIterator.h"
13 #include "core/html/HTMLHeadElement.h"
14 #include "core/html/HTMLInputElement.h"
15 #include "core/html/HTMLMetaElement.h"
16
17 // TODO(wychen): The following lines will be gone before landing.
18 #include "core/inspector/ConsoleMessage.h"
19
20 #include "public/platform/WebDistillability.h"
21 #include "wtf/text/StringBuilder.h"
22 #include "wtf/text/StringImpl.h"
23
24 namespace blink {
25
26 using namespace HTMLNames;
27
28 namespace {
29
30 // Cap on the trimmed text length measured for one element: trimmedTextContentLength() stops scanning and returns this value once the length exceeds it, to save time.
31 const int kTextContentLengthSaturation = 1000;
32
33 unsigned trimmedTextContentLength(Element& root)
34 {
35 int firstNonWhite = -1, lastNonWhite = -1;
36 unsigned position = 0;
37 unsigned length = 0;
38 // TODO(wychen): scan backwards for lastNonWhite should be much faster in pr actice.
39 // This skips shadow dom intentionally.
40 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
41 if (!node.isTextNode()) {
42 continue;
43 }
44 const String& text = toText(node).data();
45 for (unsigned i = 0; i < text.length(); i++) {
46 if (!isSpaceOrNewline(text[i])) {
47 if (firstNonWhite < 0) {
48 firstNonWhite = lastNonWhite = i + position;
49 } else {
50 lastNonWhite = i + position;
51 }
52 }
53 }
54 if (firstNonWhite >= 0) {
55 length = lastNonWhite - firstNonWhite + 1;
56 if (length > kTextContentLengthSaturation) {
57 return kTextContentLengthSaturation;
58 }
59 }
60 position += text.length();
61 }
62 return length;
63 }
64
65 unsigned innerTextLength(Element& root)
wychen 2015/10/28 22:00:51 I'll remove innerTextLength() for this version.
66 {
67 unsigned length = 0;
68 EphemeralRange range = EphemeralRange::rangeOfContents(root);
69 TextIterator it(range.startPosition(), range.endPosition(), TextIteratorForI nnerText);
70 for (; !it.atEnd(); it.advance()) {
71 length += it.length();
72 }
73 return length;
74 }
75
76 bool isVisible(Element& element)
77 {
78 const blink::ComputedStyle* style = element.ensureComputedStyle();
esprehn 2015/10/28 00:07:28 don't use ensureComputedStyle(), it forces us to c
wychen 2015/10/28 00:25:45 Quick question. I want to use getBoundingClientRec
esprehn 2015/10/28 00:37:17 That'll be more expensive than just checking the d
wychen 2015/10/28 00:49:46 Since display is not inherited, the current checki
esprehn 2015/10/28 00:57:19 var computedStyle = getComputedStyle(element); !co
wychen 2015/10/28 22:00:51 I just tried that. If the parent is display==none,
79 return (
80 style->display() != NONE
81 && style->visibility() != HIDDEN
82 && style->opacity() != 0
83 );
84 }
85
86 bool matchAttributes(Element& element, const Vector<String>& words)
87 {
88 const String& classes = element.getClassAttribute();
89 const String& id = element.getIdAttribute();
90 for (const String& word : words) {
91 if (classes.findIgnoringCase(word) != WTF::kNotFound
92 || id.findIgnoringCase(word) != WTF::kNotFound) {
93 return true;
94 }
95 }
96 return false;
97 }
98
99 // underListItem denotes that at least one of the ancesters is <li> element.
100 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false)
101 {
102 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
103 if (unlikelyCandidates.isEmpty()) {
104 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) {
105 unlikelyCandidates.append(word);
106 }
107 }
108 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
109 if (highlyLikelyCandidates.isEmpty()) {
110 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
111 highlyLikelyCandidates.append(word);
112 }
113 }
114 // Filter out short P elements. The threshold is set to around 2 English sen tences.
115 const unsigned kParagraphLengthThreshold = 140;
116
117 // Saturate the scores to save time. The max is the score of 6 long paragrap hs.
118 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold);
119 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion);
120 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ;
121
122 for (Node& node : NodeTraversal::childrenOf(root)) {
123 if (node.isTextNode()) {
124 features.textContentLength += toText(node).length();
125 continue;
126 }
127 if (!node.isElementNode()) {
128 continue;
129 }
130
131 features.elementCount++;
132 Element& element = toElement(node);
133 if (element.hasTagName(aTag)) {
134 features.anchorCount++;
135 } else if (element.hasTagName(formTag)) {
136 features.formCount++;
137 } else if (element.hasTagName(inputTag)) {
138 const HTMLInputElement& input = toHTMLInputElement(element);
139 if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
140 features.textInputCount++;
141 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) {
142 features.passwordInputCount++;
143 }
144 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
145 if (element.hasTagName(pTag)) {
146 features.pCount++;
147 } else {
148 features.preCount++;
149 }
150 if (!underListItem
151 && (features.mozScore < kMozScoreSaturation
152 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
153 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation )
154 && isVisible(element)
155 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates))
156 ) {
157 unsigned length = trimmedTextContentLength(element);
158 if (length >= kParagraphLengthThreshold) {
159 features.mozScore += sqrt(length - kParagraphLengthThreshold );
160 features.mozScore = std::min(features.mozScore, kMozScoreSat uration);
161 }
162 features.mozScoreAllSqrt += sqrt(length);
163 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation);
164
165 features.mozScoreAllLinear += length;
166 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation);
167 }
168 } else if (element.hasTagName(liTag)) {
169 underListItem = true;
170 }
171 collectFeatures(element, features, underListItem);
172 }
173 }
174
175 bool hasOpenGraphArticle(const Element& head)
176 {
177 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
178 if (!isHTMLMetaElement(*child))
179 continue;
180 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
181 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
182 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
esprehn 2015/10/28 00:07:28 move these to the top of the function, not inside
wychen 2015/10/28 22:00:51 Done.
183
184 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
185 if (equalIgnoringCase(meta.content(), "article")) {
186 return true;
187 }
188 }
189 }
190 return false;
191 }
192
193 } // namespace
194
195 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
196 {
197 WebDistillabilityFeatures features = WebDistillabilityFeatures();
198
199 if (!document.frame() || !document.frame()->isMainFrame())
200 return features;
201
202 ASSERT(document.hasFinishedParsing());
203
204 if (!document.body() || !document.head())
205 return features;
206
207 // The following DISTILLER_NDEBUG sections would be gone when landing.
208 #ifndef DISTILLER_NDEBUG
209 double startTime = WTF::currentTime();
esprehn 2015/10/28 00:07:28 add tracing macros, remove your time based stuff.
wychen 2015/10/28 22:00:51 Thanks for the tip. I gave it a try, but it involv
210 #endif
211
212 // First, traverse the DOM tree and collect statistics.
213 collectFeatures(*document.body(), features);
214
215 #ifndef DISTILLER_NDEBUG
216 double elapsedTime = WTF::currentTime() - startTime;
217 startTime = WTF::currentTime();
218 #endif
219
220 features.openGraph = hasOpenGraphArticle(*document.head());
221
222 #ifndef DISTILLER_NDEBUG
223 double ogElapsedTime = WTF::currentTime() - startTime;
224 startTime = WTF::currentTime();
225 #endif
226
227 // Next, traverse the Layout tree and collect statistics on innerText length .
228 document.updateLayout();
dglazkov 2015/10/27 23:59:19 Instead of updateLayout, just ASSERT to ensure the
wychen 2015/10/28 22:00:51 Done.
229 features.innerTextLength = innerTextLength(*document.body());
230
231 #ifndef DISTILLER_NDEBUG
232 double innerTextElapsedTime = WTF::currentTime() - startTime;
233 #endif
234
235 #ifndef DISTILLER_NDEBUG
236 StringBuilder message;
237 message.append("openGraph: ");
238 message.appendNumber(features.openGraph);
239 message.append(", elementCount: ");
240 message.appendNumber(features.elementCount);
241 message.append(", anchorCount: ");
242 message.appendNumber(features.anchorCount);
243 message.append(", formCount: ");
244 message.appendNumber(features.formCount);
245 message.append(", textInputCount: ");
246 message.appendNumber(features.textInputCount);
247 message.append(", passwordInputCount: ");
248 message.appendNumber(features.passwordInputCount);
249 message.append(", pCount: ");
250 message.appendNumber(features.pCount);
251 message.append(", preCount: ");
252 message.appendNumber(features.preCount);
253 message.append(", innerTextLength: ");
254 message.appendNumber(features.innerTextLength);
255 message.append(", textContentLength: ");
256 message.appendNumber(features.textContentLength);
257 message.append(", mozScore: ");
258 message.appendNumber(features.mozScore);
259 message.append(", mozScoreAllSqrt: ");
260 message.appendNumber(features.mozScoreAllSqrt);
261 message.append(", mozScoreAllLinear: ");
262 message.appendNumber(features.mozScoreAllLinear);
263 message.append("\nElapsed time (ms): ");
264 message.appendNumber(elapsedTime * 1000);
265 message.append(", openGraph time (ms): ");
266 message.appendNumber(ogElapsedTime * 1000);
267 message.append(", innerText time (ms): ");
268 message.appendNumber(innerTextElapsedTime * 1000);
269
270 RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(C onsoleAPIMessageSource, DebugMessageLevel, message.toString());
271 document.addConsoleMessage(consoleMessage);
272 #endif
273
274 return features;
275 }
276
277 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698