Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(256)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: add mobile friendly detection Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/InputTypeNames.h"
10 #include "core/css/CSSComputedStyleDeclaration.h"
11 #include "core/dom/ElementTraversal.h"
12 #include "core/dom/NodeComputedStyle.h"
13 #include "core/editing/iterators/TextIterator.h"
14 #include "core/frame/FrameHost.h"
15 #include "core/html/HTMLHeadElement.h"
16 #include "core/html/HTMLInputElement.h"
17 #include "core/html/HTMLMetaElement.h"
18 #include "public/platform/WebDistillability.h"
19 #include "wtf/text/StringBuilder.h"
20 #include "wtf/text/StringImpl.h"
21
22 namespace blink {
23
24 using namespace HTMLNames;
25
26 namespace {
27
28 // Cap on the text length counted for a single paragraph; counting stops once the cap is exceeded, to save time.
29 const int kTextContentLengthSaturation = 1000;
30
31 unsigned textContentLengthSaturated(Element& root)
32 {
33 unsigned length = 0;
34 // This skips shadow dom intentionally.
dglazkov 2015/11/03 04:45:37 Please explain why in the comment.
wychen 2015/11/03 07:13:41 Done.
35 for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
36 if (!node.isTextNode()) {
37 continue;
38 }
39 length += toText(node).length();
40 if (length > kTextContentLengthSaturation) {
41 return kTextContentLengthSaturation;
42 }
43 }
44 return length;
45 }
46
47 bool isVisible(Element& element)
48 {
49 const ComputedStyle* style = element.computedStyle();
50 if (!style) {
dglazkov 2015/11/03 04:45:37 Don't need braces here.
wychen 2015/11/03 07:13:41 Done.
51 return false;
52 }
53 ASSERT(style->display() != NONE);
54 return (
55 style->visibility() != HIDDEN
56 && style->opacity() != 0
57 );
58 }
59
60 bool matchAttributes(Element& element, const Vector<String>& words)
61 {
62 const String& classes = element.getClassAttribute();
63 const String& id = element.getIdAttribute();
64 for (const String& word : words) {
65 if (classes.findIgnoringCase(word) != WTF::kNotFound
66 || id.findIgnoringCase(word) != WTF::kNotFound) {
67 return true;
68 }
69 }
70 return false;
71 }
72
73 // underListItem denotes that at least one of the ancesters is <li> element.
74 void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool un derListItem = false)
75 {
76 DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
77 if (unlikelyCandidates.isEmpty()) {
78 for (auto word : {"banner", "combx", "comment", "community", "disqus", " extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox ", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pag er", "popup"}) {
79 unlikelyCandidates.append(word);
80 }
81 }
82 DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
83 if (highlyLikelyCandidates.isEmpty()) {
84 for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
85 highlyLikelyCandidates.append(word);
86 }
87 }
88 // Filter out short P elements. The threshold is set to around 2 English sen tences.
89 const unsigned kParagraphLengthThreshold = 140;
90
91 // Saturate the scores to save time. The max is the score of 6 long paragrap hs.
92 const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - k ParagraphLengthThreshold);
93 const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturat ion);
94 const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation ;
95
96 for (Node& node : NodeTraversal::childrenOf(root)) {
97 if (node.isTextNode()) {
98 features.textContentLength += toText(node).length();
99 continue;
100 }
101 if (!node.isElementNode()) {
102 continue;
103 }
104
105 features.elementCount++;
106 Element& element = toElement(node);
107 if (element.hasTagName(aTag)) {
108 features.anchorCount++;
109 } else if (element.hasTagName(formTag)) {
110 features.formCount++;
111 } else if (element.hasTagName(inputTag)) {
112 const HTMLInputElement& input = toHTMLInputElement(element);
113 if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
114 features.textInputCount++;
115 } else if (equalIgnoringCase(input.type(), InputTypeNames::password) ) {
116 features.passwordInputCount++;
117 }
118 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
119 if (element.hasTagName(pTag)) {
120 features.pCount++;
121 } else {
122 features.preCount++;
123 }
124 if (!underListItem
125 && (features.mozScore < kMozScoreSaturation
126 || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
127 || features.mozScoreAllLinear < kMozScoreAllLinearSaturation )
128 && isVisible(element)
129 && (!matchAttributes(element, unlikelyCandidates) || matchAttrib utes(element, highlyLikelyCandidates))
130 ) {
131 unsigned length = textContentLengthSaturated(element);
dglazkov 2015/11/03 04:45:37 Is this an O(NxM) built in here?
wychen 2015/11/03 07:13:41 I'm not quite sure about what you meant here. tex
132 if (length >= kParagraphLengthThreshold) {
133 features.mozScore += sqrt(length - kParagraphLengthThreshold );
134 features.mozScore = std::min(features.mozScore, kMozScoreSat uration);
135 }
136 features.mozScoreAllSqrt += sqrt(length);
137 features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kM ozScoreAllSqrtSaturation);
138
139 features.mozScoreAllLinear += length;
140 features.mozScoreAllLinear = std::min(features.mozScoreAllLinear , kMozScoreAllLinearSaturation);
141 }
142 } else if (element.hasTagName(liTag)) {
143 underListItem = true;
144 }
145 collectFeatures(element, features, underListItem);
146 }
147 }
148
149 bool hasOpenGraphArticle(const Element& head)
150 {
151 DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
152 DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
153 for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
154 if (!isHTMLMetaElement(*child))
155 continue;
156 const HTMLMetaElement& meta = toHTMLMetaElement(*child);
157
158 if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
159 if (equalIgnoringCase(meta.content(), "article")) {
160 return true;
161 }
162 }
163 }
164 return false;
165 }
166
167 bool isMobileFriendly(Document& document)
168 {
169 FrameHost* frameHost = document.frameHost();
170 if (!frameHost)
171 return false;
172 return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
173 }
174
175 } // namespace
176
177 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
178 {
179 WebDistillabilityFeatures features = WebDistillabilityFeatures();
180
181 if (!document.frame() || !document.frame()->isMainFrame())
182 return features;
183
184 ASSERT(document.hasFinishedParsing());
185
186 if (!document.body() || !document.head())
dglazkov 2015/11/03 04:45:37 Both of these are traversals, so might be good to
wychen 2015/11/03 07:13:41 Done.
187 return features;
188
189 if (isMobileFriendly(document)) {
190 features.isMobileFriendly = true;
191 return features;
192 }
193
194 // Traverse the DOM tree and collect statistics.
195 collectFeatures(*document.body(), features);
196 features.openGraph = hasOpenGraphArticle(*document.head());
197
198 return features;
199 }
200
201 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698