Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(94)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "config.h"
6 #include "DocumentStatisticsCollector.h"
7
8 #include "core/HTMLNames.h"
9 #include "core/css/CSSComputedStyleDeclaration.h"
10 #include "core/editing/EphemeralRange.h"
11 #include "core/editing/iterators/TextIterator.h"
12 #include "core/editing/iterators/WordAwareIterator.h"
13 #include "core/html/HTMLHeadElement.h"
14 #include "core/inspector/ConsoleMessage.h"
15 #include "platform/text/TextBreakIterator.h"
16 #include "public/platform/WebDistillability.h"
17 #include "wtf/text/StringBuilder.h"
18
19 using namespace WTF;
esprehn 2015/10/23 04:59:14 don't using namespace WTF
wychen 2015/10/26 20:40:51 Done.
20 using namespace Unicode;
esprehn 2015/10/23 04:59:14 why do you need this?
wychen 2015/10/26 20:40:51 Done.
21
22 namespace blink {
23
24 using namespace HTMLNames;
25
26 namespace {
27
28 unsigned trimmedTextContentLength(Element& root)
29 {
30 // TODO(wychen): count the length without allocating the string.
31 return root.textContent().stripWhiteSpace().length();
esprehn 2015/10/23 04:59:14 this is very expensive, you're creating a huge str
wychen 2015/10/23 18:56:35 Would this be acceptable if it's rewritten so that
wychen 2015/10/26 20:40:52 Done.
32 }
33
34 unsigned innerTextLength(Element& root)
35 {
36 unsigned length = 0;
37 EphemeralRange range = EphemeralRange::rangeOfContents(root);
esprehn 2015/10/23 04:59:15 Why do you need innerText length? This is going to
wychen 2015/10/23 18:56:35 This is one of the features we fed to the machine
38 TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPo sition(), TextIteratorForInnerText);
39 for (; !it.atEnd(); it.advance()) {
40 length += it.length();
41 }
42 return length;
43 }
44
45 class ExtractFeatureWalker {
wychen 2015/10/23 02:55:47 Do you mean creating new cpp/h files for this clas
46 public:
47 ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features ) :
48 m_document(document),
49 m_features(features)
50 {
51 }
52
53 bool isVisible(Element& elem)
esprehn 2015/10/23 04:59:14 element, don't abbreviate in blink
wychen 2015/10/26 20:40:51 Done.
54 {
55 RefPtr<CSSStyleDeclaration> style =
56 m_document.domWindow()->getComputedStyle(&elem, String());
esprehn 2015/10/23 04:59:15 don't call through the domWindow(), you should alm
wychen 2015/10/26 20:40:52 Done.
57 return !(
58 style->getPropertyValue("display") == "none"
esprehn 2015/10/23 04:59:14 You want to use ComputedStyle and the API surface
wychen 2015/10/26 20:40:51 Done.
59 || style->getPropertyValue("visibility") == "hidden"
60 || style->getPropertyValue("opacity") == "0"
esprehn 2015/10/23 04:59:14 these properties are not inherited, this code does
wychen 2015/10/23 18:56:35 Opacity is indeed not inherited, but visibility is
61 );
62 }
63
64 bool matchName(Element& elem, const std::vector<String>& words)
esprehn 2015/10/23 04:59:15 don't use std::vector in blink, don't abbreviate.
wychen 2015/10/23 18:56:34 I'll use Vector instead.
65 {
66 String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribut e().lower();
wychen 2015/10/23 02:55:47 treat
esprehn 2015/10/23 04:59:14 don't combine concat the strings like this, also d
wychen 2015/10/23 18:56:35 Since partial matching is expected, I'll be using
67 for (const String& word: words) {
68 if (hay.find(word)) {
69 return true;
70 }
71 }
72 return false;
73 }
74
75 void walk()
76 {
77 walk(*m_document.body(), false);
esprehn 2015/10/23 04:59:15 what is false?
wychen 2015/10/23 18:56:35 The second argument should've been removed since t
78 m_features.openGraph = hasOGArticle(*m_document.head());
esprehn 2015/10/23 04:59:14 don't abbreviate, what is OG?
wychen 2015/10/23 18:56:35 Is hasOpenGraphArticle() a self-explanatory name?
79 }
80
81 private:
82 void walk(Element& root, bool underLi = false)
esprehn 2015/10/23 04:59:15 what is underLi?
wychen 2015/10/26 20:40:51 Added comment.
83 {
84 DEFINE_STATIC_LOCAL(std::vector<String>, unlikelyCandidates, ({"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", " related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponso r", "ad-break", "agegate", "pagination", "pager", "popup"}));
esprehn 2015/10/23 04:59:14 don't use std::vector, use WTF types, also I'm pre
wychen 2015/10/23 18:56:35 If we only want exact matching, this would be a br
85 DEFINE_STATIC_LOCAL(std::vector<String>, okMaybeItsACandidate, ({"and", "article", "body", "column", "main", "shadow"}));
esprehn 2015/10/23 04:59:14 ditto
86
87 for (Node& node : NodeTraversal::childrenOf(root)) {
esprehn 2015/10/23 04:59:15 this doesn't handle shadow dom
wychen 2015/10/23 18:56:35 Could you elaborate how shadow dom should be handl
88 if (node.isTextNode()) {
89 String text = toText(node).data();
90 m_features.textContentLength += text.length();
esprehn 2015/10/23 04:59:14 toText(node).length()
wychen 2015/10/26 20:40:52 Done.
91 continue;
92 }
93 if (!node.isElementNode()) {
94 continue;
95 }
96
97 m_features.numElements++;
98 Element& element = toElement(node);
99 if (element.hasTagName(aTag)) {
100 m_features.numAnchors++;
esprehn 2015/10/23 04:59:15 anchorCount formCount etc. don't abbreviate wit
wychen 2015/10/26 20:40:51 Done.
101 } else if (element.hasTagName(formTag)) {
102 m_features.numForms++;
103 } else if (element.hasTagName(inputTag)) {
104 if (element.getAttribute("type").lower() == "text") {
esprehn 2015/10/23 04:59:14 typeAttr, you also want to use equalIgnoringCase.
wychen 2015/10/26 20:40:51 Done.
105 m_features.numTextInput++;
106 } else if (element.getAttribute("type").lower() == "pasword") {
esprehn 2015/10/23 04:59:14 ditto
wychen 2015/10/26 20:40:51 Done.
107 m_features.numPasswordInput++;
108 }
109 } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
110 m_features.numPPRE++;
esprehn 2015/10/23 04:59:14 Lets count these separately, then you can sum on t
wychen 2015/10/26 20:40:51 Done.
111 if (!underLi && isVisible(element)
esprehn 2015/10/23 04:59:14 underListItem, also this isVisible thing doesn't w
wychen 2015/10/26 20:40:52 Done.
112 && (!matchName(element, unlikelyCandidates) || matchName(ele ment, okMaybeItsACandidate))) {
esprehn 2015/10/23 04:59:14 okMaybeItsACandidate needs a better name, both set
wychen 2015/10/23 18:56:35 Does highlyLikelyCandidates sound better?
113 unsigned len = trimmedTextContentLength(element);
114 if (len >= 140) {
esprehn 2015/10/23 05:17:30 if you only care about this we can write something
wychen 2015/10/26 20:40:52 I've saturated the length so that we could early o
115 m_features.mozScore += sqrt(len - 140);
esprehn 2015/10/23 04:59:14 why 140? this needs a comment in the code or a con
wychen 2015/10/26 20:40:51 Done.
116 }
117 m_features.mozScoreAllSqrt += sqrt(len);
118 m_features.mozScoreAllLinear += len;
119 }
120 }
121 walk(element, element.hasTagName(liTag) || underLi);
esprehn 2015/10/23 04:59:14 why do you care about ancestor li tags, what's the
wychen 2015/10/23 18:56:35 This is part of the scoring heuristics. <p> or <pr
122 }
123 }
124
125 bool hasOGArticle(const Element& head)
126 {
127 for (const Node& node : NodeTraversal::childrenOf(head)) {
esprehn 2015/10/23 04:59:15 again this doesn't handle shadow dom, but this is
wychen 2015/10/26 20:40:52 Acknowledged.
128 if (!node.isElementNode())
129 continue;
130 const Element& element = toElement(node);
131 if (!element.hasTagName(metaTag))
132 continue;
133 if ((element.getAttribute("name") == ("og:type")) || (element.getAtt ribute("property") == ("og:type"))) {
esprehn 2015/10/23 04:59:14 toHTMLMetaElement and then use ->name(), also remo
wychen 2015/10/26 20:40:52 Done, but getAttribute("property") is still there.
134 WTF::CString content = element.getAttribute("content").upper().u tf8();
esprehn 2015/10/23 04:59:14 don't convert to CString, you almost never want th
wychen 2015/10/26 20:40:51 Done.
135 if ((content) == "ARTICLE") {
esprehn 2015/10/23 04:59:14 You want to use ->content() and equalIgnoringCase
wychen 2015/10/26 20:40:51 Done.
136 return true;
137 }
138 }
139 }
140 return false;
141 };
142
143 private:
144 Document& m_document;
145 WebDistillabilityFeatures& m_features;
146 };
147
148 } // namespace
149
150 WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Documen t& document)
151 {
152 WebDistillabilityFeatures features({0});
esprehn 2015/10/23 04:59:14 Can we add a constructor for the struct instead an
wychen 2015/10/26 20:40:52 I'll use an initializer instead of adding a constr
153
154 if (!document.frame() || !document.frame()->isMainFrame())
155 return features;
156
157 if (!document.hasFinishedParsing())
esprehn 2015/10/23 04:59:15 how does this ever get called when we're still par
wychen 2015/10/26 20:40:51 Done.
158 return features;
159
160 ASSERT(document.body());
esprehn 2015/10/23 04:59:15 this doesn't have to be true, you can remove the b
wychen 2015/10/26 20:40:52 Done.
161
162 // First, traverse the DOM tree and collect statistics.
163 ExtractFeatureWalker walker(document, features);
esprehn 2015/10/23 04:59:15 why do you need a walker at all instead of just st
wychen 2015/10/26 20:40:51 Done.
164 walker.walk();
esprehn 2015/10/23 04:59:14 just use recursive functions
wychen 2015/10/26 20:40:51 Done.
165
166 // Next, traverse the Layout tree and collect statistics on innerText length .
167 features.innerTextLength += innerTextLength(*document.body());
esprehn 2015/10/23 04:59:14 you can remove the body, also this doesn't have to
wychen 2015/10/23 18:56:35 The ratio between innerText.length() and textConte
wychen 2015/10/26 20:40:51 Done.
168
169 // The following DISTILLER_NDEBUG section would be gone when landing.
170 #ifndef DISTILLER_NDEBUG
171 StringBuilder message;
172 message.append("openGraph: ");
173 message.appendNumber(features.openGraph);
174 message.append(", numElements: ");
175 message.appendNumber(features.numElements);
176 message.append(", numAnchors: ");
177 message.appendNumber(features.numAnchors);
178 message.append(", numForms: ");
179 message.appendNumber(features.numForms);
180 message.append(", numTextInput: ");
181 message.appendNumber(features.numTextInput);
182 message.append(", numPasswordInput: ");
183 message.appendNumber(features.numPasswordInput);
184 message.append(", numPPRE: ");
185 message.appendNumber(features.numPPRE);
186 message.append(", innerTextLength: ");
187 message.appendNumber(features.innerTextLength);
188 message.append(", textContentLength: ");
189 message.appendNumber(features.textContentLength);
190 message.append(", mozScore: ");
191 message.appendNumber(features.mozScore);
192 message.append(", mozScoreAllSqrt: ");
193 message.appendNumber(features.mozScoreAllSqrt);
194 message.append(", mozScoreAllLinear: ");
195 message.appendNumber(features.mozScoreAllLinear);
196
197 RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(C onsoleAPIMessageSource, DebugMessageLevel, message.toString());
198 document.addConsoleMessage(consoleMessage);
esprehn 2015/10/23 04:59:14 just fprintf, no reason to console log your debug
wychen 2015/10/26 20:40:52 I just wanted to quickly get message to adb log.
199 #endif
200
201 return features;
202 }
203
204 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698