Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(161)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: wrap long line Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/css/CSSComputedStyleDeclaration.h"
esprehn 2015/11/03 07:45:10 don't need this.
wychen 2015/11/03 08:59:49 Done.
11 #include "core/dom/ElementTraversal.h"
12 #include "core/dom/NodeComputedStyle.h"
13 #include "core/editing/iterators/TextIterator.h"
esprehn 2015/11/03 07:45:10 don't need this.
wychen 2015/11/03 08:59:49 Done.
14 #include "core/frame/FrameHost.h"
15 #include "core/html/HTMLHeadElement.h"
16 #include "core/html/HTMLInputElement.h"
17 #include "core/html/HTMLMetaElement.h"
18 #include "public/platform/Platform.h"
19 #include "public/platform/WebDistillability.h"
20 #include "wtf/text/StringBuilder.h"
21 #include "wtf/text/StringImpl.h"
esprehn 2015/11/03 07:45:10 you don't need StringImpl, StringBuilder
wychen 2015/11/03 08:59:48 Done. Just curious, did you use IDE to check unne
22
23 namespace blink {
24
25 using namespace HTMLNames;
26
27 namespace {
28
29 // Saturate the length of a paragraph to save time.
30 const int kTextContentLengthSaturation = 1000;
31
32 unsigned textContentLengthSaturated(Element& root)
33 {
34 unsigned length = 0;
35 // This skips shadow dom intentionally, to match the JavaScript implementati on.
esprehn 2015/11/03 07:45:10 Why?
wychen 2015/11/03 08:59:48 Done.
36 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
37 if (!node.isTextNode()) {
38 continue;
39 }
40 length += toText(node).length();
41 if (length > kTextContentLengthSaturation) {
42 return kTextContentLengthSaturation;
43 }
44 }
45 return length;
46 }
47
48 bool isVisible(Element& element)
49 {
50 const ComputedStyle* style = element.computedStyle();
51 if (!style)
52 return false;
53 ASSERT(style->display() != NONE);
esprehn 2015/11/03 07:45:10 this assert is wrong, you can have a style and be
wychen 2015/11/03 08:59:49 Done.
54 return (
55 style->visibility() != HIDDEN
56 && style->opacity() != 0
57 );
58 }
59
60 bool matchAttributes(Element& element, const Vector<String>& words)
61 {
62 const String& classes = element.getClassAttribute();
63 const String& id = element.getIdAttribute();
64 for (const String& word : words) {
65 if (classes.findIgnoringCase(word) != WTF::kNotFound
66 || id.findIgnoringCase(word) != WTF::kNotFound) {
67 return true;
68 }
69 }
70 return false;
71 }
72
73 // underListItem denotes that at least one of the ancesters is <li> element.
74 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false)
75 {
76 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
77 if (unlikelyCandidates.isEmpty()) {
78 auto words = {
79 "banner",
80 "combx",
81 "comment",
82 "community",
83 "disqus",
84 "extra",
85 "foot",
86 "header",
87 "menu",
88 "related",
89 "remark",
90 "rss",
91 "share",
92 "shoutbox",
93 "sidebar",
94 "skyscraper",
95 "sponsor",
96 "ad-break",
97 "agegate",
98 "pagination",
99 "pager",
100 "popup"
101 };
102 for (auto word : words) {
103 unlikelyCandidates.append(word);
104 }
105 }
106 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
107 if (highlyLikelyCandidates.isEmpty()) {
108 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
esprehn 2015/11/03 07:45:10 I'd wrap this one too.
wychen 2015/11/03 08:59:48 Done.
109 highlyLikelyCandidates.append(word);
110 }
111 }
112 // Filter out short P elements. The threshold is set to around 2 English sen tences.
113 const unsigned kParagraphLengthThreshold = 140;
114
115 // Saturate the scores to save time. The max is the score of 6 long paragrap hs.
116 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold);
117 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion);
118 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ;
119
120 for (Node& node : NodeTraversal::childrenOf(root)) {
121 if (node.isTextNode()) {
122 features.textContentLength += toText(node).length();
esprehn 2015/11/03 07:45:10 this is going to add the length of every inline <s
wychen 2015/11/03 08:59:48 It is possible that innerTextLength/textContentLen
123 continue;
124 }
125 if (!node.isElementNode()) {
126 continue;
127 }
128
129 features.elementCount++;
130 Element& element = toElement(node);
131 if (element.hasTagName(aTag)) {
132 features.anchorCount++;
133 } else if (element.hasTagName(formTag)) {
134 features.formCount++;
135 } else if (element.hasTagName(inputTag)) {
136 const HTMLInputElement& input = toHTMLInputElement(element);
137 if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
esprehn 2015/11/03 07:45:10 ditto == InputTypeNames::text
wychen 2015/11/03 08:59:48 Good catch! I guess AtomicString == is faster.
138 features.textInputCount++;
139 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) {
esprehn 2015/11/03 07:45:10 this is always lowercase, you can just do == ::pas
wychen 2015/11/03 08:59:48 Done.
140 features.passwordInputCount++;
141 }
142 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
143 if (element.hasTagName(pTag)) {
144 features.pCount++;
145 } else {
146 features.preCount++;
147 }
148 if (!underListItem
149 && (features.mozScore < kMozScoreSaturation
150 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
151 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation )
152 && isVisible(element)
esprehn 2015/11/03 07:45:10 this is a crazy set of conditions, in blink we try
wychen 2015/11/03 08:59:48 Done.
153 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates))
154 ) {
155 unsigned length = textContentLengthSaturated(element);
156 if (length >= kParagraphLengthThreshold) {
157 features.mozScore += sqrt(length - kParagraphLengthThreshold );
158 features.mozScore = std::min(features.mozScore, kMozScoreSat uration);
159 }
160 features.mozScoreAllSqrt += sqrt(length);
161 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation);
162
163 features.mozScoreAllLinear += length;
164 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation);
165 }
166 } else if (element.hasTagName(liTag)) {
167 underListItem = true;
168 }
169 collectFeatures(element, features, underListItem);
170 }
171 }
172
173 bool hasOpenGraphArticle(const Element& head)
174 {
175 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
176 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
177 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
178 if (!isHTMLMetaElement(*child))
179 continue;
180 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
181
182 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
183 if (equalIgnoringCase(meta.content(), "article")) {
184 return true;
185 }
186 }
187 }
188 return false;
189 }
190
191 bool isMobileFriendly(Document& document)
192 {
193 FrameHost* frameHost = document.frameHost();
194 if (!frameHost)
195 return false;
196 return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
197 }
198
199 } // namespace
200
201 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
202 {
203 WebDistillabilityFeatures features = WebDistillabilityFeatures();
204
205 if (!document.frame() || !document.frame()->isMainFrame())
206 return features;
207
208 ASSERT(document.hasFinishedParsing());
209
esprehn 2015/11/03 07:45:10 TRACE_EVENT0("DocumentStatisticsCollector::collect
wychen 2015/11/03 08:59:49 Done.
210 HTMLElement* body = document.body();
esprehn 2015/11/03 07:45:10 needs a trace macro
wychen 2015/11/03 08:59:48 I don't understand this comment. Did you mean DEFI
211 HTMLElement* head = document.head();
212
213 if (!body || !head)
214 return features;
215
216 if (isMobileFriendly(document)) {
217 features.isMobileFriendly = true;
218 return features;
219 }
220
221 double startTime = monotonicallyIncreasingTime();
222
223 // Traverse the DOM tree and collect statistics.
224 collectFeatures(*body, features);
225 features.openGraph = hasOpenGraphArticle(*head);
226
227 double elapsedTime = monotonicallyIncreasingTime() - startTime;
228 Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", stati c_cast<int>(1e6 * elapsedTime), 1, 1000000, 50);
229
230 return features;
231 }
232
233 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698