third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp - Issue 1419033004: Add feature extraction for distillability to Blink

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: merge master Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.h ('k') | third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..4ddfe137383430834d0bd97e4a47fbe3d16931a5

--- /dev/null

+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

@@ -0,0 +1,251 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "config.h"

+#include "DocumentStatisticsCollector.h"

+#include "core/HTMLNames.h"

+#include "core/InputTypeNames.h"

+#include "core/dom/ElementTraversal.h"

+#include "core/dom/NodeComputedStyle.h"

+#include "core/dom/Text.h"

+#include "core/frame/FrameHost.h"

+#include "core/html/HTMLHeadElement.h"

+#include "core/html/HTMLInputElement.h"

+#include "core/html/HTMLMetaElement.h"

+#include "public/platform/Platform.h"

+#include "public/platform/WebDistillability.h"

+namespace blink {

+using namespace HTMLNames;

+namespace {

+// Saturate the length of a paragraph to save time: a paragraph longer than
+// this is counted as exactly this long. Declared unsigned because it is only
+// ever compared against, and returned as, an unsigned text length.
+const unsigned kTextContentLengthSaturation = 1000;
+// Filter out short P elements. The threshold is set to around 2 English sentences.
+const unsigned kParagraphLengthThreshold = 140;
+// Saturate the scores to save time. The max is the score of 6 long paragraphs.
+const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold)
+const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation)
+const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;

+// Sums the lengths of the text nodes found in the subtree rooted at |root|
+// (|root| included) and returns that sum, capped at
+// kTextContentLengthSaturation. The walk stops as soon as the cap is exceeded.
+unsigned textContentLengthSaturated(Element& root)
+{
+    // This skips shadow DOM intentionally, to match the JavaScript implementation.
+    // We would like to use the same statistics extracted by the JavaScript implementation
+    // on iOS, and JavaScript cannot peek deeply into shadow DOM except on modern Chrome
+    // versions.
+    // Given shadow DOM rarely appears in <P> elements in long-form articles, the overall
+    // accuracy should not be largely affected.
+    unsigned total = 0;
+    for (Node& current : NodeTraversal::inclusiveDescendantsOf(root)) {
+        if (current.isTextNode()) {
+            total += toText(current).length();
+            // Past the cap there is nothing more to learn; bail out early.
+            if (total > kTextContentLengthSaturation)
+                return kTextContentLengthSaturation;
+        }
+    }
+    return total;
+}

+// Returns true when |element| has a computed style whose display is not NONE,
+// whose visibility is not HIDDEN and whose opacity is not 0. An element
+// without a computed style is reported as not visible.
+bool isVisible(const Element& element)
+{
+    const ComputedStyle* computed = element.computedStyle();

esprehn 2015/11/05 01:21:59 you need ASSERT(!element.document().needsLayoutTr

wychen 2015/11/05 01:47:40 Skipped.

+    if (!computed)
+        return false;
+    if (computed->display() == NONE)
+        return false;
+    if (computed->visibility() == HIDDEN)
+        return false;
+    return computed->opacity() != 0;
+}

+// Returns true if any entry of |words| occurs, ignoring case, as a substring
+// of |element|'s class attribute or of its id attribute.
+bool matchAttributes(const Element& element, const Vector<String>& words)
+{
+    const String& classAttribute = element.getClassAttribute();
+    const String& idAttribute = element.getIdAttribute();
+    for (const String& candidate : words) {
+        if (classAttribute.findIgnoringCase(candidate) != WTF::kNotFound)
+            return true;
+        if (idAttribute.findIgnoringCase(candidate) != WTF::kNotFound)
+            return true;
+    }
+    return false;
+}

+// Decides whether |element| should contribute to the scores kept in |features|.
+// It is rejected when it sits under a list item, is not visible, when all three
+// scores have already reached their saturation values, or when its class/id
+// matches an "unlikely" word without also matching a "highly likely" one.
+bool isGoodForScoring(bool underListItem, const WebDistillabilityFeatures& features, const Element& element)
+{
+    // Both word lists are filled once, on the first call, and reused afterwards.
+    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
+    if (unlikelyCandidates.isEmpty()) {
+        static const char* const kUnlikelyWords[] = {
+            "banner",
+            "combx",
+            "comment",
+            "community",
+            "disqus",
+            "extra",
+            "foot",
+            "header",
+            "menu",
+            "related",
+            "remark",
+            "rss",
+            "share",
+            "shoutbox",
+            "sidebar",
+            "skyscraper",
+            "sponsor",
+            "ad-break",
+            "agegate",
+            "pagination",
+            "pager",
+            "popup"
+        };
+        for (const char* word : kUnlikelyWords)
+            unlikelyCandidates.append(word);
+    }
+    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
+    if (highlyLikelyCandidates.isEmpty()) {
+        static const char* const kHighlyLikelyWords[] = {
+            "and",
+            "article",
+            "body",
+            "column",
+            "main",
+            "shadow"
+        };
+        for (const char* word : kHighlyLikelyWords)
+            highlyLikelyCandidates.append(word);
+    }
+    if (underListItem || !isVisible(element))
+        return false;
+    // Once every score is saturated, further paragraphs cannot change the result.
+    const bool allScoresSaturated = features.mozScore >= kMozScoreSaturation
+        && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation
+        && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation;
+    if (allScoresSaturated)
+        return false;
+    if (matchAttributes(element, unlikelyCandidates) && !matchAttributes(element, highlyLikelyCandidates))

esprehn 2015/11/05 01:21:58 I'd wrap at the && like you did above

wychen 2015/11/05 01:47:40 Done.

+        return false;
+    return true;
+}

+// Recursively visits every element below |root| and updates |features|:
+// element/anchor/form/input/<p>/<pre> counters, plus the three moz scores for
+// <p> and <pre> elements that pass isGoodForScoring().
+// underListItem denotes that at least one of the ancestors is an <li> element.
+void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
+{
+    for (Node& node : NodeTraversal::childrenOf(root)) {
+        if (!node.isElementNode()) {
+            continue;
+        }
+        features.elementCount++;
+        Element& element = toElement(node);
+        // Tracked per child rather than by overwriting |underListItem|: an <li>
+        // must only affect its own subtree, not the siblings that follow it in
+        // this loop.
+        bool isListItem = false;
+        if (element.hasTagName(aTag)) {
+            features.anchorCount++;
+        } else if (element.hasTagName(formTag)) {
+            features.formCount++;
+        } else if (element.hasTagName(inputTag)) {
+            const HTMLInputElement& input = toHTMLInputElement(element);
+            if (input.type() == InputTypeNames::text) {
+                features.textInputCount++;
+            } else if (input.type() == InputTypeNames::password) {
+                features.passwordInputCount++;
+            }
+        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+            if (element.hasTagName(pTag)) {
+                features.pCount++;
+            } else {
+                features.preCount++;
+            }
+            if (isGoodForScoring(underListItem, features, element)) {

esprehn 2015/11/05 01:21:58 I'd probably move the underListItem check out so t

wychen 2015/11/05 01:47:40 Done.

+                unsigned length = textContentLengthSaturated(element);
+                // Only paragraphs at or above the threshold add to mozScore;
+                // the two "All" scores take every scored paragraph.
+                if (length >= kParagraphLengthThreshold) {
+                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
+                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
+                }
+                features.mozScoreAllSqrt += sqrt(length);
+                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
+                features.mozScoreAllLinear += length;
+                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
+            }
+        } else if (element.hasTagName(liTag)) {
+            isListItem = true;
+        }
+        collectFeatures(element, features, underListItem || isListItem);
+    }
+}

+// Returns true if a direct child of |head| is a <meta> element whose name or
+// "property" attribute equals "og:type" and whose content equals "article",
+// ignoring case.
+bool hasOpenGraphArticle(const Element& head)
+{
+    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
+    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
+    const Element* current = ElementTraversal::firstChild(head);
+    while (current) {
+        if (isHTMLMetaElement(*current)) {
+            const HTMLMetaElement& metaElement = toHTMLMetaElement(*current);
+            const bool declaresOgType = metaElement.name() == ogType || metaElement.getAttribute(propertyAttr) == ogType;
+            if (declaresOgType && equalIgnoringCase(metaElement.content(), "article"))
+                return true;
+        }
+        current = ElementTraversal::nextSibling(*current);
+    }
+    return false;
+}

+// Returns the visual viewport's shouldDisableDesktopWorkarounds() value for
+// |document|, or false when the document has no FrameHost.
+bool isMobileFriendly(Document& document)
+{
+    if (FrameHost* frameHost = document.frameHost())
+        return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
+    return false;
+}

esprehn 2015/11/05 01:21:59 in blink we'd usually write: if (FrameHost* frame

wychen 2015/11/05 01:47:40 Done. Is this style for performance? Like what LIK

esprehn 2015/11/05 01:54:17 Not for performance, it just makes the scope of yo

wychen 2015/11/05 02:00:00 I see. Thanks

+} // namespace

+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)

+ TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");

+ WebDistillabilityFeatures features = WebDistillabilityFeatures();

+ if (!document.frame() || !document.frame()->isMainFrame())

+ return features;

+ ASSERT(document.hasFinishedParsing());

+ HTMLElement* body = document.body();

+ HTMLElement* head = document.head();

+ if (!body || !head)

+ return features;

+ if (isMobileFriendly(document)) {

+ features.isMobileFriendly = true;

esprehn 2015/11/05 01:21:58 so if it's mobile friendly we don't need to collec

wychen 2015/11/05 01:47:40 Yes. We currently only trigger Reader Mode on non-

+ return features;

+ }

+ double startTime = monotonicallyIncreasingTime();

+ // Traverse the DOM tree and collect statistics.

esprehn 2015/11/05 01:21:58 you either need to call updateLayoutTreeIfNeeded()

wychen 2015/11/05 01:47:40 I'll skip the assertion above. It might be slightl

+ collectFeatures(*body, features);

+ features.openGraph = hasOpenGraphArticle(*head);

+ double elapsedTime = monotonicallyIncreasingTime() - startTime;

+ Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", static_cast<int>(1e6 * elapsedTime), 1, 1000000, 50);

+ return features;