Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(287)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: merge master Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ddfe137383430834d0bd97e4a47fbe3d16931a5
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -0,0 +1,251 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "DocumentStatisticsCollector.h"
+
+#include "core/HTMLNames.h"
+#include "core/InputTypeNames.h"
+#include "core/dom/ElementTraversal.h"
+#include "core/dom/NodeComputedStyle.h"
+#include "core/dom/Text.h"
+#include "core/frame/FrameHost.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/html/HTMLInputElement.h"
+#include "core/html/HTMLMetaElement.h"
+#include "public/platform/Platform.h"
+#include "public/platform/WebDistillability.h"
+
+namespace blink {
+
+using namespace HTMLNames;
+
+namespace {
+
+// Upper bound on the text length counted for one element; counting stops once it is exceeded, to save time.
+const int kTextContentLengthSaturation = 1000;
+
+// Minimum text length for an element to add to mozScore. The threshold is set to around 2 English sentences.
+const unsigned kParagraphLengthThreshold = 140;
+
+// Caps on the three scores, to save time. Each cap is the score of 6 paragraphs at the length saturation.
+const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold)
+const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation)
+const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
+
+// Returns the combined length of all text nodes found under |root|, clamped
+// to kTextContentLengthSaturation.
+unsigned textContentLengthSaturated(Element& root)
+{
+    // Shadow DOM is deliberately not traversed so that the result matches the
+    // JavaScript implementation used on iOS, which cannot peek deeply into
+    // shadow trees except on modern Chrome versions. Shadow DOM rarely shows
+    // up inside <P> elements of long-form articles, so the overall accuracy
+    // should not be largely affected.
+    unsigned total = 0;
+    for (Node& descendant : NodeTraversal::inclusiveDescendantsOf(root)) {
+        if (descendant.isTextNode()) {
+            total += toText(descendant).length();
+            // Stop walking as soon as the cap has been exceeded.
+            if (total > kTextContentLengthSaturation)
+                return kTextContentLengthSaturation;
+        }
+    }
+    return total;
+}
+
+// Returns true when |element| has a computed style whose display is not NONE,
+// whose visibility is not HIDDEN, and whose opacity is non-zero.
+bool isVisible(const Element& element)
+{
+    const ComputedStyle* computed = element.computedStyle();
esprehn 2015/11/05 01:21:59 you need ASSERT(!element.document().needsLayoutTr
wychen 2015/11/05 01:47:40 Skipped.
+    // Without a computed style the element is treated as not visible.
+    if (!computed)
+        return false;
+    if (computed->display() == NONE)
+        return false;
+    if (computed->visibility() == HIDDEN)
+        return false;
+    return computed->opacity() != 0;
+}
+
+// Returns true if the class attribute or the id attribute of |element|
+// contains any entry of |words|, compared ignoring case.
+bool matchAttributes(const Element& element, const Vector<String>& words)
+{
+    const String& classAttribute = element.getClassAttribute();
+    const String& idAttribute = element.getIdAttribute();
+    for (const String& candidate : words) {
+        if (classAttribute.findIgnoringCase(candidate) != WTF::kNotFound)
+            return true;
+        if (idAttribute.findIgnoringCase(candidate) != WTF::kNotFound)
+            return true;
+    }
+    return false;
+}
+
+// Decides whether |element| should contribute to the scores in |features|.
+// Returns false when the element is under a list item, is not visible, when
+// all three scores have already reached their saturation values, or when its
+// class/id matches an "unlikely" word without also matching a "highly likely"
+// word. Returns true otherwise.
+bool isGoodForScoring(bool underListItem, const WebDistillabilityFeatures& features, const Element& element)
+{
+    // Both word lists are filled lazily the first time this function runs.
+    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
+    if (unlikelyCandidates.isEmpty()) {
+        static const char* const kUnlikelyWords[] = {
+            "banner",
+            "combx",
+            "comment",
+            "community",
+            "disqus",
+            "extra",
+            "foot",
+            "header",
+            "menu",
+            "related",
+            "remark",
+            "rss",
+            "share",
+            "shoutbox",
+            "sidebar",
+            "skyscraper",
+            "sponsor",
+            "ad-break",
+            "agegate",
+            "pagination",
+            "pager",
+            "popup"
+        };
+        for (const char* word : kUnlikelyWords)
+            unlikelyCandidates.append(word);
+    }
+    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
+    if (highlyLikelyCandidates.isEmpty()) {
+        static const char* const kHighlyLikelyWords[] = {
+            "and",
+            "article",
+            "body",
+            "column",
+            "main",
+            "shadow"
+        };
+        for (const char* word : kHighlyLikelyWords)
+            highlyLikelyCandidates.append(word);
+    }
+
+    if (underListItem || !isVisible(element))
+        return false;
+
+    // Once every score sits at its cap, further elements cannot change the
+    // result, so they are not worth scoring.
+    const bool allScoresSaturated = features.mozScore >= kMozScoreSaturation
+        && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation
+        && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation;
+    if (allScoresSaturated)
+        return false;
+
+    if (matchAttributes(element, unlikelyCandidates)
+        && !matchAttributes(element, highlyLikelyCandidates))
esprehn 2015/11/05 01:21:58 I'd wrap at the && like you did above
wychen 2015/11/05 01:47:40 Done.
+        return false;
+    return true;
+}
+
+// Recursively walks the element children of |root| and accumulates element
+// counts and paragraph scores into |features|.
+// |underListItem| denotes that at least one of the ancestors is an <li> element.
+void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
+{
+    for (Node& node : NodeTraversal::childrenOf(root)) {
+        if (!node.isElementNode())
+            continue;
+
+        // Tracked per child: an <li> only affects its own subtree. Assigning
+        // to |underListItem| here would leak the flag into every later sibling
+        // of the <li> and into their descendants.
+        bool isListItem = false;
+
+        features.elementCount++;
+        Element& element = toElement(node);
+        if (element.hasTagName(aTag)) {
+            features.anchorCount++;
+        } else if (element.hasTagName(formTag)) {
+            features.formCount++;
+        } else if (element.hasTagName(inputTag)) {
+            const HTMLInputElement& input = toHTMLInputElement(element);
+            if (input.type() == InputTypeNames::text) {
+                features.textInputCount++;
+            } else if (input.type() == InputTypeNames::password) {
+                features.passwordInputCount++;
+            }
+        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+            if (element.hasTagName(pTag)) {
+                features.pCount++;
+            } else {
+                features.preCount++;
+            }
+            if (isGoodForScoring(underListItem, features, element)) {
esprehn 2015/11/05 01:21:58 I'd probably move the underListItem check out so t
wychen 2015/11/05 01:47:40 Done.
+                unsigned length = textContentLengthSaturated(element);
+                // Only text beyond the threshold counts towards mozScore; the
+                // guard also keeps the unsigned subtraction from wrapping.
+                if (length >= kParagraphLengthThreshold) {
+                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
+                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
+                }
+                features.mozScoreAllSqrt += sqrt(length);
+                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
+
+                features.mozScoreAllLinear += length;
+                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
+            }
+        } else if (element.hasTagName(liTag)) {
+            isListItem = true;
+        }
+        collectFeatures(element, features, underListItem || isListItem);
+    }
+}
+
+// Returns true if a <meta> element that is a direct child of |head| has a name
+// or "property" attribute equal to "og:type" and a content value equal to
+// "article", ignoring case.
+bool hasOpenGraphArticle(const Element& head)
+{
+    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
+    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
+    const Element* child = ElementTraversal::firstChild(head);
+    while (child) {
+        if (isHTMLMetaElement(*child)) {
+            const HTMLMetaElement& meta = toHTMLMetaElement(*child);
+            const bool declaresOgType = meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType;
+            if (declaresOgType && equalIgnoringCase(meta.content(), "article"))
+                return true;
+        }
+        child = ElementTraversal::nextSibling(*child);
+    }
+    return false;
+}
+
+// Reports whether the visual viewport of |document|'s FrameHost says desktop
+// workarounds should be disabled. Returns false if there is no FrameHost.
+bool isMobileFriendly(Document& document)
+{
+    if (FrameHost* frameHost = document.frameHost())
+        return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
esprehn 2015/11/05 01:21:59 in blink we'd usually write: if (FrameHost* frame
wychen 2015/11/05 01:47:40 Done. Is this style for performance? Like what LIK
esprehn 2015/11/05 01:54:17 Not for performance, it just makes the scope of yo
wychen 2015/11/05 02:00:00 I see. Thanks
+    return false;
+}
+
+} // namespace
+
+// Collects the distillability features of |document|.
+// Returns default-constructed features when the document has no frame, is not
+// in the main frame, or has no body or head element. For a mobile-friendly
+// document only isMobileFriendly is set and the DOM is not traversed.
+// Otherwise the body is traversed for statistics, the head is checked for an
+// Open Graph "article" type, and the elapsed time is reported to the
+// "WebCore.DistillabilityUs" histogram.
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+    TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");
+
+    WebDistillabilityFeatures features = WebDistillabilityFeatures();
+
+    if (!document.frame() || !document.frame()->isMainFrame())
+        return features;
+
+    ASSERT(document.hasFinishedParsing());
+
+    HTMLElement* body = document.body();
+    HTMLElement* head = document.head();
+
+    if (!body || !head)
+        return features;
+
+    if (isMobileFriendly(document)) {
+        features.isMobileFriendly = true;
esprehn 2015/11/05 01:21:58 so if it's mobile friendly we don't need to collec
wychen 2015/11/05 01:47:40 Yes. We currently only trigger Reader Mode on non-
+        return features;
+    }
+
+    double startTime = monotonicallyIncreasingTime();
+
+    // The traversal reads computed style (via isVisible()), so the layout tree
+    // has to be brought up to date before walking the DOM.
+    // NOTE(review): presumably cheap when this runs right after layout -
+    // confirm against the callers.
+    document.updateLayoutTreeIfNeeded();
+
+    // Traverse the DOM tree and collect statistics.
esprehn 2015/11/05 01:21:58 you either need to call updateLayoutTreeIfNeeded()
wychen 2015/11/05 01:47:40 I'll skip the assertion above. It might be slightly
+    collectFeatures(*body, features);
+    features.openGraph = hasOpenGraphArticle(*head);
+
+    // The elapsed time is scaled by 1e6 before being recorded (microseconds,
+    // going by the histogram name).
+    double elapsedTime = monotonicallyIncreasingTime() - startTime;
+    Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", static_cast<int>(1e6 * elapsedTime), 1, 1000000, 50);
+
+    return features;
+}
+
+}

Powered by Google App Engine
This is Rietveld 408576698