Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(290)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, remove innerText Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b5ca742a108a8204064746e7babbb9506a0580ac
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -0,0 +1,256 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "DocumentStatisticsCollector.h"
+
+#include "core/HTMLNames.h"
+#include "core/InputTypeNames.h"
+#include "core/css/CSSComputedStyleDeclaration.h"
+#include "core/dom/ElementTraversal.h"
+#include "core/editing/iterators/TextIterator.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/html/HTMLInputElement.h"
+#include "core/html/HTMLMetaElement.h"
+
+// TODO(wychen): Remove the following include before landing; it is only needed by the temporary debug logging.
+#include "core/inspector/ConsoleMessage.h"
+
+#include "public/platform/WebDistillability.h"
+#include "wtf/text/StringBuilder.h"
+#include "wtf/text/StringImpl.h"
+
+namespace blink {
+
+using namespace HTMLNames;
+
+namespace {
+
+// Upper bound on the trimmed text length reported for one paragraph; scanning stops once it is exceeded, to save time.
+const int kTextContentLengthSaturation = 1000;
+
+// Returns the length of the text under |root| (the concatenation of the data of
+// all text nodes among its inclusive descendants) after dropping leading and
+// trailing whitespace, capped at kTextContentLengthSaturation.
+unsigned trimmedTextContentLength(Element& root)
+{
+    // Offsets, within the concatenated text, of the first and the last
+    // non-whitespace character seen so far; -1 until one has been found.
+    int firstSolid = -1;
+    int lastSolid = -1;
+    // Offset of the current text node within the concatenated text.
+    unsigned offset = 0;
+    unsigned trimmed = 0;
+    // TODO(wychen): scanning backwards for the last non-whitespace character should be much faster in practice.
+    // This skips shadow dom intentionally.
+    for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
+        if (node.isTextNode()) {
+            const String& chunk = toText(node).data();
+            const unsigned chunkLength = chunk.length();
+            for (unsigned k = 0; k < chunkLength; ++k) {
+                if (isSpaceOrNewline(chunk[k])) {
+                    continue;
+                }
+                lastSolid = k + offset;
+                if (firstSolid < 0) {
+                    firstSolid = lastSolid;
+                }
+            }
+            if (firstSolid >= 0) {
+                trimmed = lastSolid - firstSolid + 1;
+                // Stop early once the cap is exceeded; the remaining nodes cannot change the result.
+                if (trimmed > kTextContentLengthSaturation) {
wychen 2015/10/28 22:00:51 With saturations, the total cost of trimmedTextCon
+                    return kTextContentLengthSaturation;
+                }
+            }
+            offset += chunkLength;
+        }
+    }
+    return trimmed;
+}
+
+// Returns true when |element| has a computed style under which it is shown:
+// not display:none, not visibility:hidden, and not fully transparent.
+// Returns false when the element has no computed style at all.
+bool isVisible(Element& element)
+{
+    const ComputedStyle* style = element.computedStyle();
+    if (!style) {
+        return false;
+    }
+    // An element can have a computed style and still be display:none, so report
+    // it as not visible instead of asserting that this cannot happen.
+    if (style->display() == NONE) {
+        return false;
+    }
esprehn 2015/11/03 07:45:10 this isn't true, you can still have a style and be
wychen 2015/11/03 08:59:48 Done.
+    return (
+        style->visibility() != HIDDEN
+        && style->opacity() != 0
+    );
+}
+
+// Returns true if any entry of |words| occurs, ignoring case, as a substring of
+// the class attribute or of the id attribute of |element|.
+bool matchAttributes(Element& element, const Vector<String>& words)
+{
+    const String& classAttribute = element.getClassAttribute();
+    const String& idAttribute = element.getIdAttribute();
+    for (const String& needle : words) {
+        // The class attribute is checked first; the id is only consulted on a miss.
+        if (classAttribute.findIgnoringCase(needle) != WTF::kNotFound) {
+            return true;
+        }
+        if (idAttribute.findIgnoringCase(needle) != WTF::kNotFound) {
+            return true;
+        }
+    }
+    return false;
+}
+
+// Recursively walks the child nodes of |root| and accumulates distillability
+// statistics into |features|: total text-node length, element, anchor, form,
+// text/password input, <p> and <pre> counts, and the three saturating
+// paragraph scores (mozScore, mozScoreAllSqrt, mozScoreAllLinear).
+//
+// underListItem denotes that at least one of the ancestors is an <li> element;
+// <p>/<pre> elements in that situation are counted but not scored.
+void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
+{
+    // Class/id substrings that exclude a paragraph from scoring.
+    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
+    if (unlikelyCandidates.isEmpty()) {
+        for (auto word : {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}) {
+            unlikelyCandidates.append(word);
+        }
+    }
+    // Class/id substrings that override a match against unlikelyCandidates.
+    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
+    if (highlyLikelyCandidates.isEmpty()) {
+        for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
+            highlyLikelyCandidates.append(word);
+        }
+    }
+    // Filter out short P elements. The threshold is set to around 2 English sentences.
+    const unsigned kParagraphLengthThreshold = 140;
+
+    // Saturate the scores to save time. The max is the score of 6 long paragraphs.
+    const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold);
+    const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturation);
+    const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
+
+    for (Node& node : NodeTraversal::childrenOf(root)) {
+        if (node.isTextNode()) {
+            features.textContentLength += toText(node).length();
+            continue;
+        }
+        if (!node.isElementNode()) {
+            continue;
+        }
+
+        features.elementCount++;
+        Element& element = toElement(node);
+        // Tracked per child so that an <li> only marks its own subtree. Writing
+        // to |underListItem| here would also flag every sibling that follows the
+        // <li>, and their paragraphs would wrongly be left out of the scores.
+        bool isListItem = false;
+        if (element.hasTagName(aTag)) {
+            features.anchorCount++;
+        } else if (element.hasTagName(formTag)) {
+            features.formCount++;
+        } else if (element.hasTagName(inputTag)) {
+            const HTMLInputElement& input = toHTMLInputElement(element);
+            if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
+                features.textInputCount++;
+            } else if (equalIgnoringCase(input.type(), InputTypeNames::password)) {
+                features.passwordInputCount++;
+            }
+        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+            if (element.hasTagName(pTag)) {
+                features.pCount++;
+            } else {
+                features.preCount++;
+            }
+            // Score the paragraph only if it is outside any list item, at least
+            // one score can still grow, it is visible, and its class/id do not
+            // mark it as an unlikely candidate (unless they also mark it as a
+            // highly likely one).
+            if (!underListItem
+                && (features.mozScore < kMozScoreSaturation
+                    || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
+                    || features.mozScoreAllLinear < kMozScoreAllLinearSaturation)
+                && isVisible(element)
+                && (!matchAttributes(element, unlikelyCandidates) || matchAttributes(element, highlyLikelyCandidates))
+            ) {
+                unsigned length = trimmedTextContentLength(element);
+                if (length >= kParagraphLengthThreshold) {
+                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
+                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
+                }
+                features.mozScoreAllSqrt += sqrt(length);
+                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
+
+                features.mozScoreAllLinear += length;
+                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
+            }
+        } else if (element.hasTagName(liTag)) {
+            isListItem = true;
+        }
+        collectFeatures(element, features, underListItem || isListItem);
+    }
+}
+
+// Returns true if a direct child of |head| is a <meta> element whose name or
+// "property" attribute equals "og:type" and whose content equals "article"
+// (ignoring case).
+bool hasOpenGraphArticle(const Element& head)
+{
+    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
+    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
+    const Element* candidate = ElementTraversal::firstChild(head);
+    while (candidate) {
+        if (isHTMLMetaElement(*candidate)) {
+            const HTMLMetaElement& meta = toHTMLMetaElement(*candidate);
+            // The type may be declared through either the name or the property attribute.
+            const bool declaresOgType = meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType;
+            if (declaresOgType && equalIgnoringCase(meta.content(), "article")) {
+                return true;
+            }
+        }
+        candidate = ElementTraversal::nextSibling(*candidate);
+    }
+    return false;
+}
+
+} // namespace
+
+// Collects distillability features for |document|.
+// Returns a value-initialized WebDistillabilityFeatures when the document has
+// no frame, its frame is not the main frame, or body() or head() is null.
+// Otherwise the counters and scores are gathered from the body subtree and
+// openGraph is set from the children of head.
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+ WebDistillabilityFeatures features = WebDistillabilityFeatures();
+
+ // Only documents attached to the main frame are analyzed.
+ if (!document.frame() || !document.frame()->isMainFrame())
+ return features;
+
+ ASSERT(document.hasFinishedParsing());
+
+ if (!document.body() || !document.head())
+ return features;
+
+ // The DISTILLER_NDEBUG sections below are temporary timing and logging instrumentation and will be removed before landing.
+#ifndef DISTILLER_NDEBUG
+ double startTime = WTF::currentTime();
+#endif
+
+ // First, traverse the DOM tree and collect statistics.
+ collectFeatures(*document.body(), features);
+
+#ifndef DISTILLER_NDEBUG
+ // Time spent in collectFeatures(); the timer is then restarted for the Open Graph check.
+ double elapsedTime = WTF::currentTime() - startTime;
+ startTime = WTF::currentTime();
+#endif
+
+ // Second, look for an Open Graph "article" declaration among the children of head.
+ features.openGraph = hasOpenGraphArticle(*document.head());
+
+#ifndef DISTILLER_NDEBUG
+ double ogElapsedTime = WTF::currentTime() - startTime;
+#endif
+
+#ifndef DISTILLER_NDEBUG
+ // Dump every collected feature and both timings (converted to milliseconds) to the console.
+ StringBuilder message;
+ message.append("openGraph: ");
+ message.appendNumber(features.openGraph);
+ message.append(", elementCount: ");
+ message.appendNumber(features.elementCount);
+ message.append(", anchorCount: ");
+ message.appendNumber(features.anchorCount);
+ message.append(", formCount: ");
+ message.appendNumber(features.formCount);
+ message.append(", textInputCount: ");
+ message.appendNumber(features.textInputCount);
+ message.append(", passwordInputCount: ");
+ message.appendNumber(features.passwordInputCount);
+ message.append(", pCount: ");
+ message.appendNumber(features.pCount);
+ message.append(", preCount: ");
+ message.appendNumber(features.preCount);
+ message.append(", textContentLength: ");
+ message.appendNumber(features.textContentLength);
+ message.append(", mozScore: ");
+ message.appendNumber(features.mozScore);
+ message.append(", mozScoreAllSqrt: ");
+ message.appendNumber(features.mozScoreAllSqrt);
+ message.append(", mozScoreAllLinear: ");
+ message.appendNumber(features.mozScoreAllLinear);
+ message.append("\nElapsed time (ms): ");
+ message.appendNumber(elapsedTime * 1000);
+ message.append(", openGraph time (ms): ");
+ message.appendNumber(ogElapsedTime * 1000);
+
+ RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString());
+ document.addConsoleMessage(consoleMessage);
+#endif
+
+ return features;
+}
+
+}

Powered by Google App Engine
This is Rietveld 408576698