Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(17)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1248643004: Test distillability without JavaScript (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@early
Patch Set: fix oopsies Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4328e8ddd4732ee5fd26a11bea22485f8b02e9d3
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -0,0 +1,211 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "DocumentStatisticsCollector.h"
+
+#include "core/HTMLNames.h"
+#include "core/css/CSSComputedStyleDeclaration.h"
+#include "core/editing/EphemeralRange.h"
+#include "core/editing/iterators/TextIterator.h"
+#include "core/editing/iterators/WordAwareIterator.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/inspector/ConsoleMessage.h"
+#include "platform/text/TextBreakIterator.h"
+#include "public/platform/WebDistillability.h"
+#include "wtf/text/StringBuilder.h"
+
+using namespace WTF;
+using namespace Unicode;
+
+namespace blink {
+
+using namespace HTMLNames;
+
+namespace {
+
+// Returns the length of |root|'s textContent() after stripWhiteSpace() has
+// been applied to it.
+unsigned trimmedTextContentLength(Element& root)
+{
+    // TODO(wychen): count the length without allocating the string.
+    const String trimmed = root.textContent().stripWhiteSpace();
+    return trimmed.length();
dglazkov 2015/10/22 16:30:31 The TODO needs to be addressed before landing.
wychen 2015/10/23 02:51:30 Will do.
+}
+
+// Sums the length of every chunk that a TextIteratorForInnerText iteration
+// yields over the contents of |root|.
+unsigned innerTextLength(Element& root)
+{
+    EphemeralRange contents = EphemeralRange::rangeOfContents(root);
+    TextIteratorAlgorithm<EditingStrategy> iterator(contents.startPosition(), contents.endPosition(), TextIteratorForInnerText);
+
+    unsigned total = 0;
+    while (!iterator.atEnd()) {
+        total += iterator.length();
+        iterator.advance();
+    }
+    return total;
+}
+
+// Walks the children of the document's <body> recursively and accumulates
+// element/text counters and paragraph-based scores into the
+// WebDistillabilityFeatures instance supplied at construction.
+class ExtractFeatureWalker {
dglazkov 2015/10/22 16:30:31 This can just be a separate class, no need to hide
wychen 2015/10/23 02:51:30 Do you mean creating new cpp/h files for this clas
+public:
+    // |document| is the document to inspect; |features| receives the results.
+    // Both are held by reference and must outlive the walker.
+    ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) :
+        m_document(document),
+        m_features(features)
+    {
+        unlikelyCandidates = {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"};
dglazkov 2015/10/22 16:30:31 Should these be static? Why initialize them in con
wychen 2015/10/23 02:51:30 Made static.
+        okMaybeItsACandidate = {"and", "article", "body", "column", "main", "shadow"};
+    }
+
+    // Returns false when the computed style of |elem| has display "none",
+    // visibility "hidden" or opacity "0"; true otherwise.
+    bool isVisible(Element& elem)
+    {
+        RefPtr<CSSStyleDeclaration> style =
+            m_document.domWindow()->getComputedStyle(&elem, String());
+        return !(
+            style->getPropertyValue("display") == "none"
+            || style->getPropertyValue("visibility") == "hidden"
+            || style->getPropertyValue("opacity") == "0"
+        );
+    }
+
+    // Returns true if any entry of |words| occurs as a substring of the
+    // lower-cased class attribute and id attribute of |elem| (joined by a
+    // space).
+    bool matchName(Element& elem, const std::vector<String>& words)
+    {
+        String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower();
dglazkov 2015/10/22 16:30:31 We have lots of style machinery to do this correct
wychen 2015/10/23 02:51:30 Could you elaborate how to use StyleResolver to im
+        for (const String& word : words) {
+            // find() returns an index (kNotFound on a miss), so it has to be
+            // compared explicitly: used as a boolean, a miss would count as a
+            // match and a hit at index 0 would count as a miss.
+            if (hay.find(word) != kNotFound)
+                return true;
+        }
+        return false;
+    }
+
+    // Entry point: walks everything below the document's <body>.
+    void walk()
+    {
+        walk(*m_document.body(), false);
+    }
+
+private:
+    // Recursively visits the children of |root|. |underLi| is true when
+    // |root| is an <li> or has an <li> ancestor within the walk; <p>/<pre>
+    // elements in that situation do not contribute to the moz scores.
+    void walk(Element& root, bool underLi = false)
+    {
+        for (Node& node : NodeTraversal::childrenOf(root)) {
+            if (node.isTextNode()) {
+                String text = toText(node).data();
+                m_features.textContentLength += text.length();
+                continue;
+            }
+            if (!node.isElementNode())
+                continue;
+
+            m_features.numElements++;
+            Element& element = toElement(node);
+            if (element.hasTagName(aTag)) {
+                m_features.numAnchors++;
+            } else if (element.hasTagName(formTag)) {
+                m_features.numForms++;
+            } else if (element.hasTagName(inputTag)) {
+                // Lower-case the type attribute once and reuse it.
+                const String inputType = element.getAttribute("type").lower();
+                if (inputType == "text")
+                    m_features.numTextInput++;
+                else if (inputType == "password")
+                    m_features.numPasswordInput++;
+            } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+                m_features.numPPRE++;
+                if (!underLi && isVisible(element)
+                    && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) {
+                    unsigned len = trimmedTextContentLength(element);
+                    // Guard keeps the unsigned subtraction from wrapping.
+                    if (len >= 140)
+                        m_features.mozScore += sqrt(len - 140);
+                    m_features.mozScoreAllSqrt += sqrt(len);
+                    m_features.mozScoreAllLinear += len;
+                }
+            }
+            walk(element, element.hasTagName(liTag) || underLi);
+        }
+    }
+
+    // Substrings of class/id that exclude a paragraph from scoring
+    // (|unlikelyCandidates|) unless one of |okMaybeItsACandidate| also matches.
+    std::vector<String> unlikelyCandidates, okMaybeItsACandidate;
+    Document& m_document;
+    WebDistillabilityFeatures& m_features;
+};
+
+// Returns true if a direct <meta> child of |head| whose "name" or "property"
+// attribute equals "og:type" has a "content" attribute that equals "ARTICLE"
+// once upper-cased.
+bool hasOGArticle(const Element& head)
+{
+    for (const Node& node : NodeTraversal::childrenOf(head)) {
dglazkov 2015/10/22 16:30:31 If you're traversing things in the method above, m
wychen 2015/10/23 02:51:30 hasOGArticle traverses head, while walk() traverse
+        if (!node.isElementNode())
+            continue;
+        const Element& element = toElement(node);
+        if (!element.hasTagName(metaTag))
+            continue;
+        if ((element.getAttribute("name") == ("og:type")) || (element.getAttribute("property") == ("og:type"))) {
+            // Upper-case first so the comparison ignores the case of the value.
+            WTF::CString content = element.getAttribute("content").upper().utf8();
+            if ((content) == "ARTICLE")
+                return true;
+        }
+    }
+    return false;
+}
+
+} // namespace
+
+// A new collector starts out not ready: collectStatistics() returns early
+// for as long as m_readyToCollect is false.
+DocumentStatisticsCollector::DocumentStatisticsCollector()
+    : m_readyToCollect(false)
+{
+}
+
+// Collects distillability statistics for |document|.
+//
+// Returns the initial ({0}-initialized) features untouched when the collector
+// is not ready, when |document| has no frame or is not in the main frame,
+// when parsing has not finished, or when the document has no body or head.
+// Otherwise fills in the DOM counters, the innerText length and the
+// OpenGraph flag.
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+    WebDistillabilityFeatures features({0});
+    if (!m_readyToCollect)
+        return features;
+
+    if (!document.frame() || !document.frame()->isMainFrame())
+        return features;
+
+    if (!document.hasFinishedParsing())
+        return features;
+
+    // body() and head() are dereferenced below. An ASSERT() does not protect
+    // release builds, so bail out explicitly if either is missing.
+    if (!document.body() || !document.head())
+        return features;
+
+    // First, traverse the DOM tree and collect statistics.
+    ExtractFeatureWalker walker(document, features);
+    walker.walk();
+
+    // Next, traverse the Layout tree and collect statistics on innerText length.
+    features.innerTextLength += innerTextLength(*document.body());
+
+    features.openGraph = hasOGArticle(*document.head());
+
+    // Debug-only dump of the collected features to the console. This
+    // DISTILLER_NDEBUG section is to be removed before landing.
+#ifndef DISTILLER_NDEBUG
+    StringBuilder message;
+    message.append("openGraph: ");
+    message.appendNumber(features.openGraph);
+    message.append(", numElements: ");
+    message.appendNumber(features.numElements);
+    message.append(", numAnchors: ");
+    message.appendNumber(features.numAnchors);
+    message.append(", numForms: ");
+    message.appendNumber(features.numForms);
+    message.append(", numTextInput: ");
+    message.appendNumber(features.numTextInput);
+    message.append(", numPasswordInput: ");
+    message.appendNumber(features.numPasswordInput);
+    message.append(", numPPRE: ");
+    message.appendNumber(features.numPPRE);
+    message.append(", innerTextLength: ");
+    message.appendNumber(features.innerTextLength);
+    message.append(", textContentLength: ");
+    message.appendNumber(features.textContentLength);
+    message.append(", mozScore: ");
+    message.appendNumber(features.mozScore);
+    message.append(", mozScoreAllSqrt: ");
+    message.appendNumber(features.mozScoreAllSqrt);
+    message.append(", mozScoreAllLinear: ");
+    message.appendNumber(features.mozScoreAllLinear);
+
+    RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString());
+    document.addConsoleMessage(consoleMessage);
+#endif
+
+    return features;
+}
+
+}

Powered by Google App Engine
This is Rietveld 408576698