third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp - Issue 1419033004: Add feature extraction for distillability to Blink

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: add mobile friendly detection Created 5 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

« no previous file with comments | « third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.h ('k') | third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

new file mode 100644

index 0000000000000000000000000000000000000000..eb7e80b049858e6593243df4b021f1bc29588b84

--- /dev/null

+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

@@ -0,0 +1,201 @@

+// Use of this source code is governed by a BSD-style license that can be

+// found in the LICENSE file.

+#include "config.h"

+#include "DocumentStatisticsCollector.h"

+#include "core/HTMLNames.h"

+#include "core/InputTypeNames.h"

+#include "core/css/CSSComputedStyleDeclaration.h"

+#include "core/dom/ElementTraversal.h"

+#include "core/dom/NodeComputedStyle.h"

+#include "core/editing/iterators/TextIterator.h"

+#include "core/frame/FrameHost.h"

+#include "core/html/HTMLHeadElement.h"

+#include "core/html/HTMLInputElement.h"

+#include "core/html/HTMLMetaElement.h"

+#include "public/platform/WebDistillability.h"

+#include "wtf/text/StringBuilder.h"

+#include "wtf/text/StringImpl.h"

+namespace blink {

+using namespace HTMLNames;

+namespace {

+// Saturate the length of a paragraph to save time.
+const int kTextContentLengthSaturation = 1000;
+
+// Returns the combined length of every text node in the subtree rooted at
+// |root| (the root itself included), capped at kTextContentLengthSaturation.
+// The walk returns as soon as the running total exceeds the cap, so very long
+// paragraphs are not traversed to the end.
+unsigned textContentLengthSaturated(Element& root)
+{
+    unsigned length = 0;
+    // This skips shadow DOM intentionally.

dglazkov 2015/11/03 04:45:37 Please explain why in the comment.

wychen 2015/11/03 07:13:41 Done.

+    for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
+        // Only text nodes contribute to the length.
+        if (!node.isTextNode()) {
+            continue;
+        }
+        length += toText(node).length();
+        // Stop early once saturated; the exact length past the cap is not needed.
+        if (length > kTextContentLengthSaturation) {
+            return kTextContentLengthSaturation;
+        }
+    }
+    return length;
+}

+// Returns true when |element| has a computed style whose visibility is not
+// HIDDEN and whose opacity is not 0. An element with no computed style is
+// reported as not visible.
+bool isVisible(Element& element)
+{
+    const ComputedStyle* style = element.computedStyle();
+    if (!style)

dglazkov 2015/11/03 04:45:37 Don't need braces here.

wychen 2015/11/03 07:13:41 Done.

+        return false;
+    // Sanity check: a style obtained here is not expected to be display:none.
+    ASSERT(style->display() != NONE);
+    return (
+        style->visibility() != HIDDEN
+        && style->opacity() != 0
+    );
+}

+// Returns true if the class attribute or the id attribute of |element|
+// contains any entry of |words| as a case-insensitive substring; returns false
+// when no word matches (including when |words| is empty).
+bool matchAttributes(Element& element, const Vector<String>& words)
+{
+    const String& classes = element.getClassAttribute();
+    const String& id = element.getIdAttribute();
+    for (const String& word : words) {
+        if (classes.findIgnoringCase(word) != WTF::kNotFound
+            || id.findIgnoringCase(word) != WTF::kNotFound) {
+            return true;
+        }
+    }
+    return false;
+}

+// Recursively walks the children of |root| and accumulates statistics into
+// |features|: text length, element/anchor/form/input/<p>/<pre> counts, and the
+// three saturating paragraph scores (mozScore, mozScoreAllSqrt,
+// mozScoreAllLinear).
+// underListItem denotes that at least one of the ancestors is <li> element.
+void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
+{
+    // Substrings of class/id that mark an element as unlikely to be article content.
+    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
+    if (unlikelyCandidates.isEmpty()) {
+        for (auto word : {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}) {
+            unlikelyCandidates.append(word);
+        }
+    }
+    // Substrings of class/id that override a match in unlikelyCandidates.
+    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
+    if (highlyLikelyCandidates.isEmpty()) {
+        for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
+            highlyLikelyCandidates.append(word);
+        }
+    }
+    // Filter out short P elements. The threshold is set to around 2 English sentences.
+    const unsigned kParagraphLengthThreshold = 140;
+    // Saturate the scores to save time. The max is the score of 6 long paragraphs.
+    const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold);
+    const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturation);
+    const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
+    for (Node& node : NodeTraversal::childrenOf(root)) {
+        // Tracked per child. Writing to |underListItem| here would leak the
+        // flag to every later sibling of an <li> and to their subtrees.
+        bool isListItem = false;
+        if (node.isTextNode()) {
+            features.textContentLength += toText(node).length();
+            continue;
+        }
+        if (!node.isElementNode()) {
+            continue;
+        }
+        features.elementCount++;
+        Element& element = toElement(node);
+        if (element.hasTagName(aTag)) {
+            features.anchorCount++;
+        } else if (element.hasTagName(formTag)) {
+            features.formCount++;
+        } else if (element.hasTagName(inputTag)) {
+            const HTMLInputElement& input = toHTMLInputElement(element);
+            if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
+                features.textInputCount++;
+            } else if (equalIgnoringCase(input.type(), InputTypeNames::password)) {
+                features.passwordInputCount++;
+            }
+        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+            if (element.hasTagName(pTag)) {
+                features.pCount++;
+            } else {
+                features.preCount++;
+            }
+            // Score the paragraph only if it is outside any <li>, at least one
+            // score can still grow, it is visible, and its class/id does not
+            // look like boilerplate (unless it also looks like content).
+            if (!underListItem
+                && (features.mozScore < kMozScoreSaturation
+                    || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
+                    || features.mozScoreAllLinear < kMozScoreAllLinearSaturation)
+                && isVisible(element)
+                && (!matchAttributes(element, unlikelyCandidates) || matchAttributes(element, highlyLikelyCandidates))
+            ) {
+                unsigned length = textContentLengthSaturated(element);

dglazkov 2015/11/03 04:45:37 Is this an O(NxM) built in here?

wychen 2015/11/03 07:13:41 I'm not quite sure about what you meant here. tex

+                if (length >= kParagraphLengthThreshold) {
+                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
+                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
+                }
+                features.mozScoreAllSqrt += sqrt(length);
+                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
+                features.mozScoreAllLinear += length;
+                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
+            }
+        } else if (element.hasTagName(liTag)) {
+            isListItem = true;
+        }
+        // Descendants are under a list item if this element is an <li> or if
+        // any ancestor already was.
+        collectFeatures(element, features, underListItem || isListItem);
+    }
+}

+// Returns true if a <meta> element that is a direct child of |head| has its
+// name or its "property" attribute equal to "og:type" and its content equal to
+// "article" (content compared case-insensitively). Returns false otherwise.
+bool hasOpenGraphArticle(const Element& head)
+{
+    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
+    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
+    for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) {
+        if (!isHTMLMetaElement(*child))
+            continue;
+        const HTMLMetaElement& meta = toHTMLMetaElement(*child);
+        // Accept the tag spelled either as name="og:type" or property="og:type".
+        if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) {
+            if (equalIgnoringCase(meta.content(), "article")) {
+                return true;
+            }
+        }
+    }
+    return false;
+}

+// Returns the visual viewport's shouldDisableDesktopWorkarounds() value for
+// the FrameHost of |document|, or false when the document has no FrameHost.
+bool isMobileFriendly(Document& document)
+{
+    FrameHost* frameHost = document.frameHost();
+    if (!frameHost)
+        return false;
+    return frameHost->visualViewport().shouldDisableDesktopWorkarounds();
+}

+} // namespace

+// Gathers distillability features for |document|.
+// Returns default-constructed features when the document has no frame, is not
+// in the main frame, or lacks a <body> or <head>. When the page is detected as
+// mobile friendly, only isMobileFriendly is set and the DOM is not traversed.
+// Otherwise the body is traversed for statistics and openGraph is filled in
+// from the head.
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+    WebDistillabilityFeatures features = WebDistillabilityFeatures();
+
+    if (!document.frame() || !document.frame()->isMainFrame())
+        return features;
+
+    ASSERT(document.hasFinishedParsing());
+
+    // body() and head() are traversals, so look each of them up only once.
+    Element* body = document.body();
+    Element* head = document.head();
+    if (!body || !head)

dglazkov 2015/11/03 04:45:37 Both of these are traversals, so might be good to

wychen 2015/11/03 07:13:41 Done.

+        return features;
+
+    if (isMobileFriendly(document)) {
+        features.isMobileFriendly = true;
+        // Skip the DOM walk entirely for mobile-friendly pages.
+        return features;
+    }
+
+    // Traverse the DOM tree and collect statistics.
+    collectFeatures(*body, features);
+    features.openGraph = hasOpenGraphArticle(*head);
+
+    return features;
+}