Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
new file mode 100644 |
index 0000000000000000000000000000000000000000..4ddfe137383430834d0bd97e4a47fbe3d16931a5 |
--- /dev/null |
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
@@ -0,0 +1,251 @@ |
+// Copyright 2015 The Chromium Authors. All rights reserved. |
+// Use of this source code is governed by a BSD-style license that can be |
+// found in the LICENSE file. |
+ |
+#include "config.h" |
+#include "DocumentStatisticsCollector.h" |
+ |
+#include "core/HTMLNames.h" |
+#include "core/InputTypeNames.h" |
+#include "core/dom/ElementTraversal.h" |
+#include "core/dom/NodeComputedStyle.h" |
+#include "core/dom/Text.h" |
+#include "core/frame/FrameHost.h" |
+#include "core/html/HTMLHeadElement.h" |
+#include "core/html/HTMLInputElement.h" |
+#include "core/html/HTMLMetaElement.h" |
+#include "public/platform/Platform.h" |
+#include "public/platform/WebDistillability.h" |
+ |
+namespace blink { |
+ |
+using namespace HTMLNames; |
+ |
+namespace { |
+ |
+// Cap on the counted text length of one paragraph; unsigned, like the counter it is compared against. |
+const unsigned kTextContentLengthSaturation = 1000; |
+ |
+// Filter out short P elements. The threshold is set to around 2 English sentences. |
+const unsigned kParagraphLengthThreshold = 140; |
+ |
+// Saturate the scores to save time. Each cap is the score of 6 long paragraphs. |
+const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold) |
+const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation) |
+const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation; |
+ |
+// Returns the summed length of all text nodes in the subtree of |root|, capped |
+// at kTextContentLengthSaturation. |
+unsigned textContentLengthSaturated(Element& root) |
+{ |
+    // Shadow DOM is deliberately not traversed, so that the result matches the |
+    // JavaScript implementation used on iOS: JavaScript can only look deep into |
+    // shadow DOM on modern Chrome versions, and we want the same statistics |
+    // from both implementations. Shadow DOM is rare inside <P> elements of |
+    // long-form articles, so overall accuracy should not suffer much. |
+    unsigned total = 0; |
+    for (Node& descendant : NodeTraversal::inclusiveDescendantsOf(root)) { |
+        if (descendant.isTextNode()) { |
+            total += toText(descendant).length(); |
+            // Stop walking as soon as the cap is exceeded. |
+            if (total > kTextContentLengthSaturation) |
+                return kTextContentLengthSaturation; |
+        } |
+    } |
+    return total; |
+} |
+ |
+// An element counts as visible when it has a computed style that is neither |
+// display:none nor visibility:hidden and whose opacity is non-zero. |
+bool isVisible(const Element& element) |
+{ |
+    const ComputedStyle* computed = element.computedStyle(); |
+    if (!computed || computed->display() == NONE) |
+        return false; |
+    if (computed->visibility() == HIDDEN) |
+        return false; |
+    return computed->opacity() != 0; |
+} |
+ |
+// True if any of |words| is a case-insensitive substring of the class or the id attribute. |
+bool matchAttributes(const Element& element, const Vector<String>& words) |
+{ |
+    const String& classNames = element.getClassAttribute(); |
+    const String& elementId = element.getIdAttribute(); |
+    for (const String& candidate : words) { |
+        const bool inClasses = classNames.findIgnoringCase(candidate) != WTF::kNotFound; |
+        if (inClasses || elementId.findIgnoringCase(candidate) != WTF::kNotFound) |
+            return true; |
+    } |
+    return false; |
+} |
+ |
+// Returns false for elements below an <li>, invisible elements, any element once all three |
+// scores are saturated, and elements whose class/id looks like boilerplate; true otherwise. |
+bool isGoodForScoring(bool underListItem, const WebDistillabilityFeatures& features, const Element& element) |
+{ |
+    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); |
+    if (unlikelyCandidates.isEmpty()) { |
+        const char* const boilerplateWords[] = { |
+            "banner", |
+            "combx", |
+            "comment", |
+            "community", |
+            "disqus", |
+            "extra", |
+            "foot", |
+            "header", |
+            "menu", |
+            "related", |
+            "remark", |
+            "rss", |
+            "share", |
+            "shoutbox", |
+            "sidebar", |
+            "skyscraper", |
+            "sponsor", |
+            "ad-break", |
+            "agegate", |
+            "pagination", |
+            "pager", |
+            "popup" |
+        }; |
+        for (const char* boilerplateWord : boilerplateWords) |
+            unlikelyCandidates.append(boilerplateWord); |
+    } |
+    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); |
+    if (highlyLikelyCandidates.isEmpty()) { |
+        const char* const contentWords[] = { |
+            "and", |
+            "article", |
+            "body", |
+            "column", |
+            "main", |
+            "shadow" |
+        }; |
+        for (const char* contentWord : contentWords) |
+            highlyLikelyCandidates.append(contentWord); |
+    } |
+ |
+    if (underListItem || !isVisible(element)) |
+        return false; |
+    const bool allScoresSaturated = features.mozScore >= kMozScoreSaturation |
+        && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation |
+        && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation; |
+    if (allScoresSaturated) |
+        return false; |
+    // A boilerplate-looking class/id is forgiven if it also matches a content word. |
+    const bool looksLikeBoilerplate = matchAttributes(element, unlikelyCandidates) |
+        && !matchAttributes(element, highlyLikelyCandidates); |
+    return !looksLikeBoilerplate; |
+} |
+ |
+// Recursively walks the element children of |root|, accumulating counters and scores into |
+// |features|. |underListItem| denotes that at least one of the ancestors is an <li> element. |
+void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false) |
+{ |
+    for (Node& node : NodeTraversal::childrenOf(root)) { |
+        if (!node.isElementNode()) |
+            continue; |
+        // Tracked per child: writing to |underListItem| here would make every |
+        // later sibling of an <li> look as if it were inside a list item. |
+        bool isListItem = false; |
+        features.elementCount++; |
+        Element& element = toElement(node); |
+        if (element.hasTagName(aTag)) { |
+            features.anchorCount++; |
+        } else if (element.hasTagName(formTag)) { |
+            features.formCount++; |
+        } else if (element.hasTagName(inputTag)) { |
+            const HTMLInputElement& input = toHTMLInputElement(element); |
+            if (input.type() == InputTypeNames::text) |
+                features.textInputCount++; |
+            else if (input.type() == InputTypeNames::password) |
+                features.passwordInputCount++; |
+        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
+            if (element.hasTagName(pTag)) |
+                features.pCount++; |
+            else |
+                features.preCount++; |
+            if (isGoodForScoring(underListItem, features, element)) { |
+                unsigned length = textContentLengthSaturated(element); |
+                if (length >= kParagraphLengthThreshold) { |
+                    features.mozScore += sqrt(length - kParagraphLengthThreshold); |
+                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation); |
+                } |
+                features.mozScoreAllSqrt += sqrt(length); |
+                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation); |
+ |
+                features.mozScoreAllLinear += length; |
+                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation); |
+            } |
+        } else if (element.hasTagName(liTag)) { |
+            isListItem = true; |
+        } |
+        collectFeatures(element, features, underListItem || isListItem); |
+    } |
+} |
+ |
+// Returns true if a direct child <meta> of |head| has og:type as its name or as its |
+// "property" attribute, and its content equals "article" case-insensitively. |
+bool hasOpenGraphArticle(const Element& head) |
+{ |
+    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type")); |
+    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property")); |
+    const Element* child = ElementTraversal::firstChild(head); |
+    for (; child; child = ElementTraversal::nextSibling(*child)) { |
+        if (!isHTMLMetaElement(*child)) |
+            continue; |
+        const HTMLMetaElement& meta = toHTMLMetaElement(*child); |
+        const bool declaresOgType = meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType; |
+        if (declaresOgType && equalIgnoringCase(meta.content(), "article")) |
+            return true; |
+    } |
+    return false; |
+} |
+ |
+// True when the document has a FrameHost whose visual viewport reports that |
+// desktop workarounds should be disabled; false when there is no FrameHost. |
+bool isMobileFriendly(Document& document) |
+{ |
+    FrameHost* host = document.frameHost(); |
+    return host && host->visualViewport().shouldDisableDesktopWorkarounds(); |
+} |
+ |
+} // namespace |
+ |
+// Gathers distillability features for |document|. Value-initialized features are |
+// returned for documents outside the main frame or lacking <head> or <body>; for |
+// mobile-friendly documents only isMobileFriendly is set and the DOM is not walked. |
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
+{ |
+    TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics"); |
+ |
+    WebDistillabilityFeatures result = WebDistillabilityFeatures(); |
+ |
+    const bool inMainFrame = document.frame() && document.frame()->isMainFrame(); |
+    if (!inMainFrame) |
+        return result; |
+ |
+    ASSERT(document.hasFinishedParsing()); |
+ |
+    HTMLElement* bodyElement = document.body(); |
+    HTMLElement* headElement = document.head(); |
+    if (!bodyElement || !headElement) |
+        return result; |
+ |
+    if (isMobileFriendly(document)) { |
+        result.isMobileFriendly = true; |
+        return result; |
+    } |
+ |
+    // Time the DOM walk plus the <head> scan; 1e6 * elapsed goes to the histogram below. |
+    const double walkStart = monotonicallyIncreasingTime(); |
+    collectFeatures(*bodyElement, result); |
+    result.openGraph = hasOpenGraphArticle(*headElement); |
+    const double walkElapsed = monotonicallyIncreasingTime() - walkStart; |
+    Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", static_cast<int>(1e6 * walkElapsed), 1, 1000000, 50); |
+    return result; |
+} |
+ |
+} |