Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..4328e8ddd4732ee5fd26a11bea22485f8b02e9d3 |
--- /dev/null |
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
@@ -0,0 +1,211 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "config.h" |
+#include "DocumentStatisticsCollector.h" |
+ |
+#include "core/HTMLNames.h" |
+#include "core/css/CSSComputedStyleDeclaration.h" |
+#include "core/editing/EphemeralRange.h" |
+#include "core/editing/iterators/TextIterator.h" |
+#include "core/editing/iterators/WordAwareIterator.h" |
+#include "core/html/HTMLHeadElement.h" |
+#include "core/inspector/ConsoleMessage.h" |
+#include "platform/text/TextBreakIterator.h" |
+#include "public/platform/WebDistillability.h" |
+#include "wtf/text/StringBuilder.h" |
+ |
+using namespace WTF; |
+using namespace Unicode; |
+ |
+namespace blink { |
+ |
+using namespace HTMLNames; |
+ |
+namespace { |
+ |
+// Returns the length of |root|'s textContent() once stripWhiteSpace() has |
+// been applied to it. |
+unsigned trimmedTextContentLength(Element& root) |
+{ |
+    // TODO(wychen): count the length without allocating the string. |
+    String trimmed = root.textContent().stripWhiteSpace(); |
+    return trimmed.length(); |
dglazkov
2015/10/22 16:30:31
The TODO needs to be addressed before landing.
wychen
2015/10/23 02:51:30
Will do.
|
+} |
+ |
+// Sums the length() of every chunk a TextIteratorForInnerText iterator |
+// produces over the contents of |root| and returns the total. |
+unsigned innerTextLength(Element& root) |
+{ |
+    EphemeralRange contents = EphemeralRange::rangeOfContents(root); |
+    TextIteratorAlgorithm<EditingStrategy> iterator(contents.startPosition(), contents.endPosition(), TextIteratorForInnerText); |
+    unsigned total = 0; |
+    while (!iterator.atEnd()) { |
+        total += iterator.length(); |
+        iterator.advance(); |
+    } |
+    return total; |
+} |
+ |
+// Walks the DOM under document.body() and accumulates distillability |
+// features (element, anchor, form and input counts, text length, and the |
+// paragraph-based scores) into the WebDistillabilityFeatures passed in. |
+class ExtractFeatureWalker { |
dglazkov
2015/10/22 16:30:31
This can just be a separate class, no need to hide
wychen
2015/10/23 02:51:30
Do you mean creating new cpp/h files for this clas
|
+public: |
+    // Both arguments are stored as references, so |document| and |features| |
+    // must outlive the walker. |
+    ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) : |
+        m_document(document), |
+        m_features(features) |
+    { |
+        unlikelyCandidates = {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}; |
dglazkov
2015/10/22 16:30:31
Should these be static? Why initialize them in con
wychen
2015/10/23 02:51:30
Made static.
|
+        okMaybeItsACandidate = {"and", "article", "body", "column", "main", "shadow"}; |
+    } |
+ |
+    // Returns false when the computed style of |elem| reports display "none", |
+    // visibility "hidden" or opacity "0"; returns true otherwise. |
+    bool isVisible(Element& elem) |
+    { |
+        RefPtr<CSSStyleDeclaration> style = |
+            m_document.domWindow()->getComputedStyle(&elem, String()); |
+        return !( |
+            style->getPropertyValue("display") == "none" |
+            || style->getPropertyValue("visibility") == "hidden" |
+            || style->getPropertyValue("opacity") == "0" |
+            ); |
+    } |
+ |
+    // Returns true when any entry of |words| occurs as a substring of the |
+    // lower-cased "<class> <id>" string built from |elem|'s attributes. |
+    bool matchName(Element& elem, const std::vector<String>& words) |
+    { |
+        String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower(); |
dglazkov
2015/10/22 16:30:31
We have lots of style machinery to do this correct
wychen
2015/10/23 02:51:30
Could you elaborate how to use StyleResolver to im
|
+        for (const String& word : words) { |
+            // find() returns the match index or kNotFound, so it must not be |
+            // used as a boolean: a hit at index 0 would read as false and a |
+            // miss (kNotFound is non-zero) would read as true. |
+            if (hay.find(word) != kNotFound) |
+                return true; |
+        } |
+        return false; |
+    } |
+ |
+    // Walks everything under the document body. Does nothing when the |
+    // document has no body element. |
+    void walk() |
+    { |
+        if (!m_document.body()) |
+            return; |
+        walk(*m_document.body(), false); |
+    } |
+ |
+private: |
+    // Recursively visits the children of |root| and updates m_features. |
+    // |underLi| is true when |root| is an <li> or was itself reached through |
+    // an <li>; <p>/<pre> elements found in that state are counted in numPPRE |
+    // but do not contribute to the moz* scores. |
+    void walk(Element& root, bool underLi = false) |
+    { |
+        for (Node& node : NodeTraversal::childrenOf(root)) { |
+            if (node.isTextNode()) { |
+                m_features.textContentLength += toText(node).data().length(); |
+                continue; |
+            } |
+            if (!node.isElementNode()) |
+                continue; |
+ |
+            m_features.numElements++; |
+            Element& element = toElement(node); |
+            if (element.hasTagName(aTag)) { |
+                m_features.numAnchors++; |
+            } else if (element.hasTagName(formTag)) { |
+                m_features.numForms++; |
+            } else if (element.hasTagName(inputTag)) { |
+                // Lower-case the attribute once and compare it to both types. |
+                String inputType = element.getAttribute("type").lower(); |
+                if (inputType == "text") |
+                    m_features.numTextInput++; |
+                else if (inputType == "password") |
+                    m_features.numPasswordInput++; |
+            } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
+                m_features.numPPRE++; |
+                // Only visible paragraphs outside of list items are scored, and |
+                // only when their class/id does not look like boilerplate (or |
+                // is explicitly whitelisted by okMaybeItsACandidate). |
+                if (!underLi && isVisible(element) |
+                    && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) { |
+                    unsigned len = trimmedTextContentLength(element); |
+                    // The guard keeps the unsigned subtraction from wrapping. |
+                    if (len >= 140) |
+                        m_features.mozScore += sqrt(len - 140); |
+                    m_features.mozScoreAllSqrt += sqrt(len); |
+                    m_features.mozScoreAllLinear += len; |
+                } |
+            } |
+            walk(element, element.hasTagName(liTag) || underLi); |
+        } |
+    } |
+ |
+    // Substrings that, when found in an element's class/id, exclude it from scoring. |
+    std::vector<String> unlikelyCandidates; |
+    // Substrings that override a match against unlikelyCandidates. |
+    std::vector<String> okMaybeItsACandidate; |
+    Document& m_document; |
+    WebDistillabilityFeatures& m_features; |
+}; |
+ |
+// Returns true when a direct child of |head| is a <meta> element whose |
+// "name" or "property" attribute equals "og:type" and whose upper-cased |
+// "content" attribute equals "ARTICLE". |
+bool hasOGArticle(const Element& head) |
+{ |
+    for (const Node& child : NodeTraversal::childrenOf(head)) { |
dglazkov
2015/10/22 16:30:31
If you're traversing things in the method above, m
wychen
2015/10/23 02:51:30
hasOGArticle traverses head, while walk() traverse
|
+        if (!child.isElementNode() || !toElement(child).hasTagName(metaTag)) |
+            continue; |
+        const Element& meta = toElement(child); |
+        const bool declaresOGType = meta.getAttribute("name") == "og:type" |
+            || meta.getAttribute("property") == "og:type"; |
+        if (!declaresOGType) |
+            continue; |
+        WTF::CString content = meta.getAttribute("content").upper().utf8(); |
+        if (content == "ARTICLE") |
+            return true; |
+    } |
+    return false; |
+} |
+ |
+} // namespace |
+ |
+// Constructs a collector with m_readyToCollect cleared. While the flag is |
+// false, collectStatistics() returns its freshly initialized features without |
+// inspecting the document. |
+DocumentStatisticsCollector::DocumentStatisticsCollector() |
+ : m_readyToCollect(false) |
+{ |
+} |
+ |
+// Collects the distillability features of |document|. |
+// |
+// Returns the freshly initialized (all-zero) features unchanged when the |
+// collector is not ready, when |document| is not in the main frame, when it |
+// has not finished parsing, or when it has no body element. Otherwise walks |
+// the body, measures the inner text length, and records whether the head |
+// declares an Open Graph "article" type. |
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
+{ |
+    WebDistillabilityFeatures features({0}); |
+    if (!m_readyToCollect) |
+        return features; |
+ |
+    if (!document.frame() || !document.frame()->isMainFrame()) |
+        return features; |
+ |
+    if (!document.hasFinishedParsing()) |
+        return features; |
+ |
+    // A runtime check is used instead of an ASSERT: ASSERT is compiled out in |
+    // release builds, and the body is dereferenced below. |
+    if (!document.body()) |
+        return features; |
+ |
+    // First, traverse the DOM tree and collect statistics. |
+    ExtractFeatureWalker walker(document, features); |
+    walker.walk(); |
+ |
+    // Next, traverse the Layout tree and collect statistics on innerText length. |
+    features.innerTextLength += innerTextLength(*document.body()); |
+ |
+    // A document without a head element simply has no Open Graph metadata. |
+    features.openGraph = document.head() && hasOGArticle(*document.head()); |
+ |
+    // The following DISTILLER_NDEBUG section will be removed before landing. |
+#ifndef DISTILLER_NDEBUG |
+    StringBuilder message; |
+    message.append("openGraph: "); |
+    message.appendNumber(features.openGraph); |
+    message.append(", numElements: "); |
+    message.appendNumber(features.numElements); |
+    message.append(", numAnchors: "); |
+    message.appendNumber(features.numAnchors); |
+    message.append(", numForms: "); |
+    message.appendNumber(features.numForms); |
+    message.append(", numTextInput: "); |
+    message.appendNumber(features.numTextInput); |
+    message.append(", numPasswordInput: "); |
+    message.appendNumber(features.numPasswordInput); |
+    message.append(", numPPRE: "); |
+    message.appendNumber(features.numPPRE); |
+    message.append(", innerTextLength: "); |
+    message.appendNumber(features.innerTextLength); |
+    message.append(", textContentLength: "); |
+    message.appendNumber(features.textContentLength); |
+    message.append(", mozScore: "); |
+    message.appendNumber(features.mozScore); |
+    message.append(", mozScoreAllSqrt: "); |
+    message.appendNumber(features.mozScoreAllSqrt); |
+    message.append(", mozScoreAllLinear: "); |
+    message.appendNumber(features.mozScoreAllLinear); |
+ |
+    RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString()); |
+    document.addConsoleMessage(consoleMessage); |
+#endif |
+ |
+    return features; |
+} |
+ |
+} |