Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(69)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, add tests Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/css/CSSComputedStyleDeclaration.h"
10 #include "core/editing/iterators/TextIterator.h"
11 #include "core/html/HTMLHeadElement.h"
12 #include "core/html/HTMLInputElement.h"
13 #include "core/html/HTMLMetaElement.h"
14
15 // TODO(wychen): The following lines will be gone before landing.
16 #include "core/inspector/ConsoleMessage.h"
17
18 #include "public/platform/WebDistillability.h"
19 #include "wtf/text/StringBuilder.h"
20 #include "wtf/text/StringImpl.h"
21
22 namespace blink {
23
24 using namespace HTMLNames;
25
26 namespace {
27
// Returns the length of the text under |root| once leading and trailing
// whitespace is ignored: the distance, inclusive, from the first to the last
// character for which isSpaceOrNewline() is false, measured over the
// concatenated data of every text node yielded by
// NodeTraversal::inclusiveDescendantsOf(root). Whitespace lying between those
// two characters is counted. Returns 0 when no such character exists.
28 unsigned trimmedTextContentLength(Element& root)
esprehn 2015/10/26 21:43:09 Do pages usually have enough whitespace for this t
wychen 2015/10/27 23:52:12 Trimming the white spaces should make this less no
29 {
// -1 marks that no non-whitespace character has been seen yet.
// NOTE(review): the offsets assigned below are unsigned values stored in int;
// presumably the text is never long enough for that to matter - confirm.
30 int firstNonWhite = -1, lastNonWhite = -1;
// Offset of the current text node within the concatenated text seen so far.
31 unsigned position = 0;
32 // TODO(wychen): scan backwards for lastNonWhite should be much faster in practice.
33 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
esprehn 2015/10/26 21:43:08 This doesn't understand shadow dom. ex. <p><conte
wychen 2015/10/27 23:52:12 The visibility model used here is the same as Java
// Only text nodes contribute characters; everything else is skipped.
34 if (!node.isTextNode()) {
35 continue;
36 }
37 const String& text = toText(node).data();
38 for (unsigned i = 0; i < text.length(); i++) {
39 if (!isSpaceOrNewline(text[i])) {
// The first hit fixes both ends; later hits only push the end outward.
40 if (firstNonWhite < 0) {
41 firstNonWhite = lastNonWhite = i + position;
42 } else {
43 lastNonWhite = i + position;
44 }
45 }
46 }
// Advance the running offset past this text node.
47 position += text.length();
48 }
// No non-whitespace character anywhere under |root|.
49 if (firstNonWhite < 0) {
50 return 0;
51 }
52 return lastNonWhite - firstNonWhite + 1;
53 }
54
// Returns the sum of it.length() over every step of a TextIteratorAlgorithm
// constructed with TextIteratorForInnerText over the range of contents of
// |root|.
55 unsigned innerTextLength(Element& root)
56 {
57 unsigned length = 0;
58 EphemeralRange range = EphemeralRange::rangeOfContents(root);
// NOTE(review): the split token endPo sition on the next line coincides with
// the 80-column wrap of the diff viewer; presumably the real patch reads
// range.endPosition() - confirm against the patch.
59 TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPo sition(), TextIteratorForInnerText);
esprehn 2015/10/26 21:43:09 Why do you care about innerText length at all? Tha
wychen 2015/10/27 23:52:12 The length ratio between innerText and textContent
// Accumulate the length reported at each iterator position until exhausted.
60 for (; !it.atEnd(); it.advance()) {
61 length += it.length();
62 }
63 return length;
64 }
65
// Returns false when the computed style of |element| has display NONE,
// visibility HIDDEN, or an opacity equal to 0; returns true otherwise.
66 bool isVisible(Element& element)
67 {
// NOTE(review): |style| is dereferenced below without a null check; this
// assumes ensureComputedStyle() always yields a style here - TODO confirm.
68 const blink::ComputedStyle* style = element.ensureComputedStyle();
esprehn 2015/10/26 21:43:08 this forces a style computation on elements that w
wychen 2015/10/27 23:52:12 This statistics collection happens right after the
// Visible means none of the three hiding conditions holds.
69 return !(
esprehn 2015/10/26 21:43:09 run demorgans
wychen 2015/10/27 23:52:12 Done.
70 style->display() == NONE
71 || style->visibility() == HIDDEN
72 || style->opacity() == 0
73 );
74 }
75
// Returns true if findIgnoringCase() locates any entry of |words| inside the
// class attribute or the id attribute of |element|; returns false otherwise.
// The class attribute is only examined when hasClass() is true and the id
// attribute only when hasID() is true.
76 bool matchName(Element& element, const Vector<String>& words)
77 {
// First pass: search the class attribute string.
78 if (element.hasClass()) {
79 const String& hay = element.getClassAttribute();
80 for (const String& word: words) {
esprehn 2015/10/26 21:43:09 missing space
wychen 2015/10/27 23:52:12 Done.
81 if (hay.findIgnoringCase(word) != WTF::kNotFound) {
82 return true;
83 }
84 }
85 }
// Second pass: same search against the id attribute string.
86 if (UNLIKELY(element.hasID())) {
esprehn 2015/10/26 21:43:09 remove UNLIKELY
wychen 2015/10/27 23:52:12 Done.
87 const String& hay = element.getIdAttribute();
esprehn 2015/10/26 21:43:09 these already have a hasID() check in them, so I'd
wychen 2015/10/27 23:52:11 Done.
88 for (const String& word: words) {
esprehn 2015/10/26 21:43:08 missing space
wychen 2015/10/27 23:52:12 Done.
89 if (hay.findIgnoringCase(word) != WTF::kNotFound) {
90 return true;
91 }
92 }
93 }
// Neither attribute contained any of the words.
94 return false;
95 }
96
// Recursively visits the children of |root| and accumulates counters and
// scores into |features|. Text node children add their length to
// textContentLength; element children bump elementCount plus a per-tag
// counter, and then are walked in turn.
97 // underListItem denotes that at least one of the ancestors is an <li> element.
98 void walk(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
esprehn 2015/10/26 21:43:08 needs a better name. collectFeatures?
wychen 2015/10/27 23:52:12 Done.
99 {
// Both word lists are function-local statics that are filled on the first
// call, when they are still empty.
100 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
101 if (unlikelyCandidates.size() == 0) {
esprehn 2015/10/26 21:43:09 isEmpty()
wychen 2015/10/27 23:52:11 Done.
// NOTE(review): the literals for extra, shoutbox and pager on the next line
// carry a stray space exactly where the diff viewer wraps at 80 columns;
// presumably the real patch has no such spaces - confirm against the patch.
102 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) {
103 unlikelyCandidates.append(word);
104 }
105 }
106 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
107 if (highlyLikelyCandidates.size() == 0) {
esprehn 2015/10/26 21:43:08 isEmpty()
wychen 2015/10/27 23:52:12 Done.
108 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
109 highlyLikelyCandidates.append(word);
110 }
111 }
// Minimum trimmed text length before a paragraph contributes to mozScore.
// NOTE(review): the rationale for the value 140 is not stated in this patch
// set; the review thread below asks for it.
112 const unsigned kParagraphLengthThreshold = 140;
esprehn 2015/10/26 21:43:08 why did you pick 140? Add a comment.
wychen 2015/10/27 23:52:12 Done.
113
114 for (Node& node : NodeTraversal::childrenOf(root)) {
// Text children only contribute their length and are not descended into.
115 if (node.isTextNode()) {
116 features.textContentLength += toText(node).length();
117 continue;
118 }
// Anything that is neither text nor an element is ignored.
119 if (!node.isElementNode()) {
120 continue;
121 }
122
123 features.elementCount++;
124 Element& element = toElement(node);
// Per-tag counters; the branches are mutually exclusive.
125 if (element.hasTagName(aTag)) {
126 features.anchorCount++;
127 } else if (element.hasTagName(formTag)) {
128 features.formCount++;
129 } else if (element.hasTagName(inputTag)) {
// Only input types text and password are counted; other types are ignored.
130 const HTMLInputElement& input = toHTMLInputElement(element);
131 if (equalIgnoringCase(input.type(), "text")) {
esprehn 2015/10/26 21:43:09 input.type() == InputTypeNames::text
wychen 2015/10/27 23:52:12 Done.
132 features.textInputCount++;
133 } else if (equalIgnoringCase(input.type(), "password")) {
esprehn 2015/10/26 21:43:08 == InputTypeNames::password
wychen 2015/10/27 23:52:12 Done.
134 features.passwordInputCount++;
135 }
136 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
137 if (element.hasTagName(pTag)) {
138 features.pCount++;
139 } else {
140 features.preCount++;
141 }
// A p or pre element is scored only when it has no <li> ancestor, is visible,
// and either matches none of the unlikely words or matches a highly likely
// word.
142 if (!underListItem && isVisible(element)
143 && (!matchName(element, unlikelyCandidates) || matchName(element , highlyLikelyCandidates))) {
esprehn 2015/10/26 21:43:09 matchAttributes? It's not really related to name a
wychen 2015/10/27 23:52:12 Done.
144 unsigned length = trimmedTextContentLength(element);
// The >= guard keeps the unsigned subtraction below from wrapping around.
145 if (length >= kParagraphLengthThreshold) {
146 features.mozScore += sqrt(length - kParagraphLengthThreshold );
147 }
// These two variants are accumulated regardless of the threshold.
148 features.mozScoreAllSqrt += sqrt(length);
149 features.mozScoreAllLinear += length;
150 }
151 }
// Recurse into every element child; the flag stays set once an <li> has been
// entered.
152 walk(element, features, element.hasTagName(liTag) || underListItem);
esprehn 2015/10/26 21:43:09 this checks hasTagName(liTag) for every element, e
wychen 2015/10/27 23:52:12 Done.
153 }
154 }
155
// Returns true if a direct child of |head| is a meta element that is tagged
// as og:type (through meta.name() or through its property attribute) and
// whose content equals article, ignoring case. Returns false otherwise.
// Only direct children of |head| are inspected.
156 bool hasOpenGraphArticle(const Element& head)
157 {
158 for (const Node& node : NodeTraversal::childrenOf(head)) {
esprehn 2015/10/26 21:43:09 for (const Element* child = ElementTraversal::firs
wychen 2015/10/27 23:52:12 Done.
// Skip everything that is not an HTML meta element.
159 if (!node.isElementNode())
160 continue;
161 const Element& element = toElement(node);
162 if (!isHTMLMetaElement(element))
163 continue;
164 const HTMLMetaElement& meta = toHTMLMetaElement(element);
// NOTE(review): the second literal on the next line has a space after the
// colon exactly where the diff viewer wraps at 80 columns; presumably the
// real patch compares against og:type in both places - confirm.
165 if (meta.name() == "og:type" || element.getAttribute("property") == "og: type") {
esprehn 2015/10/26 21:43:09 You want to declare static local AtomicString vari
wychen 2015/10/27 23:52:12 Done.
166 if (equalIgnoringCase(meta.content(), "article")) {
167 return true;
168 }
169 }
170 }
// No matching meta element among the children of |head|.
171 return false;
172 }
173
174 } // namespace
175
// Collects distillability features for |document| and returns them by value.
// A value-initialized WebDistillabilityFeatures is returned unchanged when the
// document has no frame, its frame is not the main frame, or it lacks a body
// or a head. Otherwise the body is walked for DOM statistics, the head is
// checked for an Open Graph article tag, and the innerText length of the body
// is added.
// NOTE(review): the split token Documen t in the signature below coincides
// with the 80-column wrap of the diff viewer; presumably the real patch reads
// Document& - confirm against the patch.
176 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
177 {
178 WebDistillabilityFeatures features = WebDistillabilityFeatures();
179
// Only documents attached to the main frame are analysed.
180 if (!document.frame() || !document.frame()->isMainFrame())
181 return features;
182
183 ASSERT(document.hasFinishedParsing());
184
// Both body and head are dereferenced below, so bail out if either is missing.
185 if (!document.body() || !document.head())
186 return features;
187
188 // First, traverse the DOM tree and collect statistics.
189 walk(*document.body(), features);
190 features.openGraph = hasOpenGraphArticle(*document.head());
191
// NOTE(review): no layout update is visible before the innerText pass in this
// patch set; the review thread below asks for one.
192 // Next, traverse the Layout tree and collect statistics on innerText length.
esprehn 2015/10/26 21:43:08 this needs to do document->updateLayout() so it's
wychen 2015/10/27 23:52:12 Done. I'm curious when should updateLayoutIgnorePe
193 features.innerTextLength += innerTextLength(*document.body());
esprehn 2015/10/26 21:43:09 this really seems unnecessary, you can just collec
wychen 2015/10/27 23:52:12 There seems to be much more than visibility in Tex
194
195 // The following DISTILLER_NDEBUG section will be removed before landing.
// Debug aid: formats the collected features into one string and posts it to
// the console of the document. preCount is not part of the message.
196 #ifndef DISTILLER_NDEBUG
197 StringBuilder message;
198 message.append("openGraph: ");
199 message.appendNumber(features.openGraph);
200 message.append(", elementCount: ");
201 message.appendNumber(features.elementCount);
202 message.append(", anchorCount: ");
203 message.appendNumber(features.anchorCount);
204 message.append(", formCount: ");
205 message.appendNumber(features.formCount);
206 message.append(", textInputCount: ");
207 message.appendNumber(features.textInputCount);
208 message.append(", passwordInputCount: ");
209 message.appendNumber(features.passwordInputCount);
210 message.append(", pCount: ");
211 message.appendNumber(features.pCount);
212 message.append(", innerTextLength: ");
213 message.appendNumber(features.innerTextLength);
214 message.append(", textContentLength: ");
215 message.appendNumber(features.textContentLength);
216 message.append(", mozScore: ");
217 message.appendNumber(features.mozScore);
218 message.append(", mozScoreAllSqrt: ");
219 message.appendNumber(features.mozScoreAllSqrt);
220 message.append(", mozScoreAllLinear: ");
221 message.appendNumber(features.mozScoreAllLinear);
222
// NOTE(review): the split token C onsoleAPIMessageSource on the next line
// coincides with the 80-column wrap of the diff viewer; presumably the real
// patch reads ConsoleAPIMessageSource - confirm against the patch.
223 RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(C onsoleAPIMessageSource, DebugMessageLevel, message.toString());
224 document.addConsoleMessage(consoleMessage);
225 #endif
226
227 return features;
228 }
229
230 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698