Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..2bc6f51ecf34caa996cac44f3962360c111b6390 |
--- /dev/null |
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
@@ -0,0 +1,204 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "config.h" |
+#include "DocumentStatisticsCollector.h" |
+ |
+#include "core/HTMLNames.h" |
+#include "core/css/CSSComputedStyleDeclaration.h" |
+#include "core/editing/EphemeralRange.h" |
+#include "core/editing/iterators/TextIterator.h" |
+#include "core/editing/iterators/WordAwareIterator.h" |
+#include "core/html/HTMLHeadElement.h" |
+#include "core/inspector/ConsoleMessage.h" |
+#include "platform/text/TextBreakIterator.h" |
+#include "public/platform/WebDistillability.h" |
+#include "wtf/text/StringBuilder.h" |
+ |
+using namespace WTF; |
esprehn
2015/10/23 04:59:14
don't using namespace WTF
wychen
2015/10/26 20:40:51
Done.
|
+using namespace Unicode; |
esprehn
2015/10/23 04:59:14
why do you need this?
wychen
2015/10/26 20:40:51
Done.
|
+ |
+namespace blink { |
+ |
+using namespace HTMLNames; |
+ |
+namespace { |
+ |
+unsigned trimmedTextContentLength(Element& root) |
+{ |
+ // TODO(wychen): count the length without allocating the string. |
+ return root.textContent().stripWhiteSpace().length(); |
esprehn
2015/10/23 04:59:14
this is very expensive, you're creating a huge str
wychen
2015/10/23 18:56:35
Would this be acceptable if it's rewritten so that
wychen
2015/10/26 20:40:52
Done.
|
+} |
+ |
+unsigned innerTextLength(Element& root) |
+{ |
+ unsigned length = 0; |
+ EphemeralRange range = EphemeralRange::rangeOfContents(root); |
esprehn
2015/10/23 04:59:15
Why do you need innerText length? This is going to
wychen
2015/10/23 18:56:35
This is one of the features we fed to the machine
|
+ TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPosition(), TextIteratorForInnerText); |
+ for (; !it.atEnd(); it.advance()) { |
+ length += it.length(); |
+ } |
+ return length; |
+} |
+ |
+class ExtractFeatureWalker { |
wychen
2015/10/23 02:55:47
Do you mean creating new cpp/h files for this clas
|
+public: |
+ ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) : |
+ m_document(document), |
+ m_features(features) |
+ { |
+ } |
+ |
+ bool isVisible(Element& elem) |
esprehn
2015/10/23 04:59:14
element, don't abbreviate in blink
wychen
2015/10/26 20:40:51
Done.
|
+ { |
+ RefPtr<CSSStyleDeclaration> style = |
+ m_document.domWindow()->getComputedStyle(&elem, String()); |
esprehn
2015/10/23 04:59:15
don't call through the domWindow(), you should alm
wychen
2015/10/26 20:40:52
Done.
|
+ return !( |
+ style->getPropertyValue("display") == "none" |
esprehn
2015/10/23 04:59:14
You want to use ComputedStyle and the API surface
wychen
2015/10/26 20:40:51
Done.
|
+ || style->getPropertyValue("visibility") == "hidden" |
+ || style->getPropertyValue("opacity") == "0" |
esprehn
2015/10/23 04:59:14
these properties are not inherited, this code does
wychen
2015/10/23 18:56:35
Opacity is indeed not inherited, but visibility is
|
+ ); |
+ } |
+ |
+ bool matchName(Element& elem, const std::vector<String>& words) |
esprehn
2015/10/23 04:59:15
don't use std::vector in blink, don't abbreviate.
wychen
2015/10/23 18:56:34
I'll use Vector instead.
|
+ { |
+ String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower(); |
wychen
2015/10/23 02:55:47
treat
esprehn
2015/10/23 04:59:14
don't combine concat the strings like this, also d
wychen
2015/10/23 18:56:35
Since partial matching is expected, I'll be using
|
+ for (const String& word: words) { |
+ if (hay.find(word)) { |
+ return true; |
+ } |
+ } |
+ return false; |
+ } |
+ |
+ void walk() |
+ { |
+ walk(*m_document.body(), false); |
esprehn
2015/10/23 04:59:15
what is false?
wychen
2015/10/23 18:56:35
The second argument should've been removed since t
|
+ m_features.openGraph = hasOGArticle(*m_document.head()); |
esprehn
2015/10/23 04:59:14
don't abbreviate, what is OG?
wychen
2015/10/23 18:56:35
Is hasOpenGraphArticle() a self-explanatory name?
|
+ } |
+ |
+private: |
+ void walk(Element& root, bool underLi = false) |
esprehn
2015/10/23 04:59:15
what is underLi?
wychen
2015/10/26 20:40:51
Added comment.
|
+ { |
+ DEFINE_STATIC_LOCAL(std::vector<String>, unlikelyCandidates, ({"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"})); |
esprehn
2015/10/23 04:59:14
don't use std::vector, use WTF types, also I'm pre
wychen
2015/10/23 18:56:35
If we only want exact matching, this would be a br
|
+ DEFINE_STATIC_LOCAL(std::vector<String>, okMaybeItsACandidate, ({"and", "article", "body", "column", "main", "shadow"})); |
esprehn
2015/10/23 04:59:14
ditto
|
+ |
+ for (Node& node : NodeTraversal::childrenOf(root)) { |
esprehn
2015/10/23 04:59:15
this doesn't handle shadow dom
wychen
2015/10/23 18:56:35
Could you elaborate how shadow dom should be handl
|
+ if (node.isTextNode()) { |
+ String text = toText(node).data(); |
+ m_features.textContentLength += text.length(); |
esprehn
2015/10/23 04:59:14
toText(node).length()
wychen
2015/10/26 20:40:52
Done.
|
+ continue; |
+ } |
+ if (!node.isElementNode()) { |
+ continue; |
+ } |
+ |
+ m_features.numElements++; |
+ Element& element = toElement(node); |
+ if (element.hasTagName(aTag)) { |
+ m_features.numAnchors++; |
esprehn
2015/10/23 04:59:15
anchorCount
formCount
etc.
don't abbreviate wit
wychen
2015/10/26 20:40:51
Done.
|
+ } else if (element.hasTagName(formTag)) { |
+ m_features.numForms++; |
+ } else if (element.hasTagName(inputTag)) { |
+ if (element.getAttribute("type").lower() == "text") { |
esprehn
2015/10/23 04:59:14
typeAttr, you also want to use equalIgnoringCase.
wychen
2015/10/26 20:40:51
Done.
|
+ m_features.numTextInput++; |
+ } else if (element.getAttribute("type").lower() == "pasword") { |
esprehn
2015/10/23 04:59:14
ditto
wychen
2015/10/26 20:40:51
Done.
|
+ m_features.numPasswordInput++; |
+ } |
+ } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
+ m_features.numPPRE++; |
esprehn
2015/10/23 04:59:14
Lets count these separately, then you can sum on t
wychen
2015/10/26 20:40:51
Done.
|
+ if (!underLi && isVisible(element) |
esprehn
2015/10/23 04:59:14
underListItem, also this isVisible thing doesn't w
wychen
2015/10/26 20:40:52
Done.
|
+ && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) { |
esprehn
2015/10/23 04:59:14
okMaybeItsACandidate needs a better name, both set
wychen
2015/10/23 18:56:35
Does highlyLikelyCandidates sound better?
|
+ unsigned len = trimmedTextContentLength(element); |
+ if (len >= 140) { |
esprehn
2015/10/23 05:17:30
if you only care about this we can write something
wychen
2015/10/26 20:40:52
I've saturated the length so that we could early o
|
+ m_features.mozScore += sqrt(len - 140); |
esprehn
2015/10/23 04:59:14
why 140? this needs a comment in the code or a con
wychen
2015/10/26 20:40:51
Done.
|
+ } |
+ m_features.mozScoreAllSqrt += sqrt(len); |
+ m_features.mozScoreAllLinear += len; |
+ } |
+ } |
+ walk(element, element.hasTagName(liTag) || underLi); |
esprehn
2015/10/23 04:59:14
why do you care about ancestor li tags, what's the
wychen
2015/10/23 18:56:35
This is part of the scoring heuristics. <p> or <pr
|
+ } |
+ } |
+ |
+ bool hasOGArticle(const Element& head) |
+ { |
+ for (const Node& node : NodeTraversal::childrenOf(head)) { |
esprehn
2015/10/23 04:59:15
again this doesn't handle shadow dom, but this is
wychen
2015/10/26 20:40:52
Acknowledged.
|
+ if (!node.isElementNode()) |
+ continue; |
+ const Element& element = toElement(node); |
+ if (!element.hasTagName(metaTag)) |
+ continue; |
+ if ((element.getAttribute("name") == ("og:type")) || (element.getAttribute("property") == ("og:type"))) { |
esprehn
2015/10/23 04:59:14
toHTMLMetaElement and then use ->name(), also remo
wychen
2015/10/26 20:40:52
Done, but getAttribute("property") is still there.
|
+ WTF::CString content = element.getAttribute("content").upper().utf8(); |
esprehn
2015/10/23 04:59:14
don't convert to CString, you almost never want th
wychen
2015/10/26 20:40:51
Done.
|
+ if ((content) == "ARTICLE") { |
esprehn
2015/10/23 04:59:14
You want to use ->content() and equalIgnoringCase
wychen
2015/10/26 20:40:51
Done.
|
+ return true; |
+ } |
+ } |
+ } |
+ return false; |
+ }; |
+ |
+private: |
+ Document& m_document; |
+ WebDistillabilityFeatures& m_features; |
+}; |
+ |
+} // namespace |
+ |
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
+{ |
+ WebDistillabilityFeatures features({0}); |
esprehn
2015/10/23 04:59:14
Can we add a constructor for the struct instead an
wychen
2015/10/26 20:40:52
I'll use an initializer instead of adding a constr
|
+ |
+ if (!document.frame() || !document.frame()->isMainFrame()) |
+ return features; |
+ |
+ if (!document.hasFinishedParsing()) |
esprehn
2015/10/23 04:59:15
how does this ever get called when we're still par
wychen
2015/10/26 20:40:51
Done.
|
+ return features; |
+ |
+ ASSERT(document.body()); |
esprehn
2015/10/23 04:59:15
this doesn't have to be true, you can remove the b
wychen
2015/10/26 20:40:52
Done.
|
+ |
+ // First, traverse the DOM tree and collect statistics. |
+ ExtractFeatureWalker walker(document, features); |
esprehn
2015/10/23 04:59:15
why do you need a walker at all instead of just st
wychen
2015/10/26 20:40:51
Done.
|
+ walker.walk(); |
esprehn
2015/10/23 04:59:14
just use recursive functions
wychen
2015/10/26 20:40:51
Done.
|
+ |
+ // Next, traverse the Layout tree and collect statistics on innerText length. |
+ features.innerTextLength += innerTextLength(*document.body()); |
esprehn
2015/10/23 04:59:14
you can remove the body, also this doesn't have to
wychen
2015/10/23 18:56:35
The ratio between innerText.length() and textConte
wychen
2015/10/26 20:40:51
Done.
|
+ |
+ // The following DISTILLER_NDEBUG section would be gone when landing. |
+#ifndef DISTILLER_NDEBUG |
+ StringBuilder message; |
+ message.append("openGraph: "); |
+ message.appendNumber(features.openGraph); |
+ message.append(", numElements: "); |
+ message.appendNumber(features.numElements); |
+ message.append(", numAnchors: "); |
+ message.appendNumber(features.numAnchors); |
+ message.append(", numForms: "); |
+ message.appendNumber(features.numForms); |
+ message.append(", numTextInput: "); |
+ message.appendNumber(features.numTextInput); |
+ message.append(", numPasswordInput: "); |
+ message.appendNumber(features.numPasswordInput); |
+ message.append(", numPPRE: "); |
+ message.appendNumber(features.numPPRE); |
+ message.append(", innerTextLength: "); |
+ message.appendNumber(features.innerTextLength); |
+ message.append(", textContentLength: "); |
+ message.appendNumber(features.textContentLength); |
+ message.append(", mozScore: "); |
+ message.appendNumber(features.mozScore); |
+ message.append(", mozScoreAllSqrt: "); |
+ message.appendNumber(features.mozScoreAllSqrt); |
+ message.append(", mozScoreAllLinear: "); |
+ message.appendNumber(features.mozScoreAllLinear); |
+ |
+ RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString()); |
+ document.addConsoleMessage(consoleMessage); |
esprehn
2015/10/23 04:59:14
just fprintf, no reason to console log your debug
wychen
2015/10/26 20:40:52
I just wanted to quickly get message to adb log.
|
+#endif |
+ |
+ return features; |
+} |
+ |
+} |