Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(44)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: fix oopsies Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/css/CSSComputedStyleDeclaration.h"
10 #include "core/editing/EphemeralRange.h"
11 #include "core/editing/iterators/TextIterator.h"
12 #include "core/editing/iterators/WordAwareIterator.h"
13 #include "core/html/HTMLHeadElement.h"
14 #include "core/inspector/ConsoleMessage.h"
15 #include "platform/text/TextBreakIterator.h"
16 #include "public/platform/WebDistillability.h"
17 #include "wtf/text/StringBuilder.h"
18
19 using namespace WTF;
20 using namespace Unicode;
21
22 namespace blink {
23
24 using namespace HTMLNames;
25
26 namespace {
27
28 unsigned trimmedTextContentLength(Element& root)
29 {
30 // TODO(wychen): count the length without allocating the string.
31 return root.textContent().stripWhiteSpace().length();
dglazkov 2015/10/22 16:30:31 The TODO needs to be addressed before landing.
wychen 2015/10/23 02:51:30 Will do.
32 }
33
34 unsigned innerTextLength(Element& root)
35 {
36 unsigned length = 0;
37 EphemeralRange range = EphemeralRange::rangeOfContents(root);
38 TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPo sition(), TextIteratorForInnerText);
39 for (; !it.atEnd(); it.advance()) {
40 length += it.length();
41 }
42 return length;
43 }
44
45 class ExtractFeatureWalker {
dglazkov 2015/10/22 16:30:31 This can just be a separate class, no need to hide
wychen 2015/10/23 02:51:30 Do you mean creating new cpp/h files for this clas
46 public:
47 ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features ) :
48 m_document(document),
49 m_features(features)
50 {
51 unlikelyCandidates = {"banner", "combx", "comment", "community", "disqus ", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shou tbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"};
dglazkov 2015/10/22 16:30:31 Should these be static? Why initialize them in con
wychen 2015/10/23 02:51:30 Made static.
52 okMaybeItsACandidate = {"and", "article", "body", "column", "main", "sha dow"};
53 }
54
55 bool isVisible(Element& elem)
56 {
57 RefPtr<CSSStyleDeclaration> style =
58 m_document.domWindow()->getComputedStyle(&elem, String());
59 return !(
60 style->getPropertyValue("display") == "none"
61 || style->getPropertyValue("visibility") == "hidden"
62 || style->getPropertyValue("opacity") == "0"
63 );
64 }
65
66 bool matchName(Element& elem, const std::vector<String>& words)
67 {
68 String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribut e().lower();
dglazkov 2015/10/22 16:30:31 We have lots of style machinery to do this correct
wychen 2015/10/23 02:51:30 Could you elaborate how to use StyleResolver to im
69 for (const String& word: words) {
70 if (hay.find(word)) {
71 return true;
72 }
73 }
74 return false;
75 }
76
77 void walk()
78 {
79 walk(*m_document.body(), false);
80 }
81
82 private:
83 void walk(Element& root, bool underLi = false)
84 {
85 for (Node& node : NodeTraversal::childrenOf(root)) {
86 if (node.isTextNode()) {
87 String text = toText(node).data();
88 m_features.textContentLength += text.length();
89 continue;
90 }
91 if (!node.isElementNode()) {
92 continue;
93 }
94
95 m_features.numElements++;
96 Element& element = toElement(node);
97 if (element.hasTagName(aTag)) {
98 m_features.numAnchors++;
99 } else if (element.hasTagName(formTag)) {
100 m_features.numForms++;
101 } else if (element.hasTagName(inputTag)) {
102 if (element.getAttribute("type").lower() == "text") {
103 m_features.numTextInput++;
104 } else if (element.getAttribute("type").lower() == "pasword") {
105 m_features.numPasswordInput++;
106 }
107 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
108 m_features.numPPRE++;
109 if (!underLi && isVisible(element)
110 && (!matchName(element, unlikelyCandidates) || matchName(ele ment, okMaybeItsACandidate))) {
111 unsigned len = trimmedTextContentLength(element);
112 if (len >= 140) {
113 m_features.mozScore += sqrt(len - 140);
114 }
115 m_features.mozScoreAllSqrt += sqrt(len);
116 m_features.mozScoreAllLinear += len;
117 }
118 }
119 walk(element, element.hasTagName(liTag) || underLi);
120 }
121 }
122
123 std::vector<String> unlikelyCandidates, okMaybeItsACandidate;
124 Document& m_document;
125 WebDistillabilityFeatures& m_features;
126 };
127
128 bool hasOGArticle(const Element& head)
129 {
130 for (const Node& node : NodeTraversal::childrenOf(head)) {
dglazkov 2015/10/22 16:30:31 If you're traversing things in the method above, m
wychen 2015/10/23 02:51:30 hasOGArticle traverses head, while walk() traverse
131 if (!node.isElementNode())
132 continue;
133 const Element& element = toElement(node);
134 if (!element.hasTagName(metaTag))
135 continue;
136 if ((element.getAttribute("name") == ("og:type")) || (element.getAttribu te("property") == ("og:type"))) {
137 WTF::CString content = element.getAttribute("content").upper().utf8( );
138 if ((content) == "ARTICLE") {
139 return true;
140 }
141 }
142 }
143 return false;
144 };
145
146 } // namespace
147
148 DocumentStatisticsCollector::DocumentStatisticsCollector()
149 : m_readyToCollect(false)
150 {
151 }
152
153 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
154 {
155 WebDistillabilityFeatures features({0});
156 if (!m_readyToCollect)
157 return features;
158
159 if (!document.frame() || !document.frame()->isMainFrame())
160 return features;
161
162 if (!document.hasFinishedParsing())
163 return features;
164
165 ASSERT(document.body());
166
167 // First, traverse the DOM tree and collect statistics.
168 ExtractFeatureWalker walker(document, features);
169 walker.walk();
170
171 // Next, traverse the Layout tree and collect statistics on innerText length .
172 features.innerTextLength += innerTextLength(*document.body());
173
174 features.openGraph = hasOGArticle(*document.head());
175
176 // The following DISTILLER_NDEBUG section would be gone when landing.
177 #ifndef DISTILLER_NDEBUG
178 StringBuilder message;
179 message.append("openGraph: ");
180 message.appendNumber(features.openGraph);
181 message.append(", numElements: ");
182 message.appendNumber(features.numElements);
183 message.append(", numAnchors: ");
184 message.appendNumber(features.numAnchors);
185 message.append(", numForms: ");
186 message.appendNumber(features.numForms);
187 message.append(", numTextInput: ");
188 message.appendNumber(features.numTextInput);
189 message.append(", numPasswordInput: ");
190 message.appendNumber(features.numPasswordInput);
191 message.append(", numPPRE: ");
192 message.appendNumber(features.numPPRE);
193 message.append(", innerTextLength: ");
194 message.appendNumber(features.innerTextLength);
195 message.append(", textContentLength: ");
196 message.appendNumber(features.textContentLength);
197 message.append(", mozScore: ");
198 message.appendNumber(features.mozScore);
199 message.append(", mozScoreAllSqrt: ");
200 message.appendNumber(features.mozScoreAllSqrt);
201 message.append(", mozScoreAllLinear: ");
202 message.appendNumber(features.mozScoreAllLinear);
203
204 RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(C onsoleAPIMessageSource, DebugMessageLevel, message.toString());
205 document.addConsoleMessage(consoleMessage);
206 #endif
207
208 return features;
209 }
210
211 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698