| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..dcfbf98e092e431ca7698079608e901f34d39a09
|
| --- /dev/null
|
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
|
| @@ -0,0 +1,187 @@
|
| +// Copyright 2015 The Chromium Authors. All rights reserved.
|
| +// Use of this source code is governed by a BSD-style license that can be
|
| +// found in the LICENSE file.
|
| +
|
| +#include "config.h"
|
| +#include "DocumentStatisticsCollector.h"
|
| +
|
| +#include "core/HTMLNames.h"
|
| +#include "core/InputTypeNames.h"
|
| +#include "core/css/CSSComputedStyleDeclaration.h"
|
| +#include "core/dom/ElementTraversal.h"
|
| +#include "core/dom/NodeComputedStyle.h"
|
| +#include "core/editing/iterators/TextIterator.h"
|
| +#include "core/html/HTMLHeadElement.h"
|
| +#include "core/html/HTMLInputElement.h"
|
| +#include "core/html/HTMLMetaElement.h"
|
| +#include "public/platform/WebDistillability.h"
|
| +#include "wtf/text/StringBuilder.h"
|
| +#include "wtf/text/StringImpl.h"
|
| +
|
| +namespace blink {
|
| +
|
| +using namespace HTMLNames;
|
| +
|
| +namespace {
|
| +
|
| +// Upper bound on the text length counted for a single paragraph. Counting
|
| +// stops as soon as the running total exceeds it, which saves traversal time.
|
| +const int kTextContentLengthSaturation = 1000;
|
| +
|
| +// Sums the lengths of the text nodes found among |root| and its descendants,
|
| +// capping the result at kTextContentLengthSaturation.
|
| +unsigned textContentLengthSaturated(Element& root)
|
| +{
|
| +    unsigned total = 0;
|
| +    // Shadow DOM is deliberately left out of this traversal.
|
| +    for (Node& descendant : NodeTraversal::inclusiveDescendantsOf(root)) {
|
| +        if (descendant.isTextNode()) {
|
| +            total += toText(descendant).length();
|
| +            if (total > kTextContentLengthSaturation)
|
| +                return kTextContentLengthSaturation;
|
| +        }
|
| +    }
|
| +    return total;
|
| +}
|
| +
|
| +// Reports whether |element| has a computed style whose visibility is not
|
| +// HIDDEN and whose opacity is non-zero. An element without a computed style
|
| +// is reported as not visible.
|
| +bool isVisible(Element& element)
|
| +{
|
| +    const ComputedStyle* computed = element.computedStyle();
|
| +    if (!computed)
|
| +        return false;
|
| +
|
| +    ASSERT(computed->display() != NONE);
|
| +    if (computed->visibility() == HIDDEN)
|
| +        return false;
|
| +    return computed->opacity() != 0;
|
| +}
|
| +
|
| +// Returns true when any entry of |words| occurs, ignoring case, as a
|
| +// substring of |element|'s class attribute or of its id attribute.
|
| +bool matchAttributes(Element& element, const Vector<String>& words)
|
| +{
|
| +    const String& classNames = element.getClassAttribute();
|
| +    const String& elementId = element.getIdAttribute();
|
| +    for (const String& candidate : words) {
|
| +        if (classNames.findIgnoringCase(candidate) != WTF::kNotFound)
|
| +            return true;
|
| +        if (elementId.findIgnoringCase(candidate) != WTF::kNotFound)
|
| +            return true;
|
| +    }
|
| +    return false;
|
| +}
|
| +
|
| +// Recursively walks the children of |root| and accumulates statistics in
|
| +// |features|: total text length, element/anchor/form/input/<p>/<pre> counts,
|
| +// and three saturated scores derived from the text length of <p> and <pre>
|
| +// elements.
|
| +// underListItem denotes that at least one of the ancestors is an <li> element.
|
| +void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
|
| +{
|
| +    // Class/id substrings that mark an element as unlikely to hold article content.
|
| +    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
|
| +    if (unlikelyCandidates.isEmpty()) {
|
| +        for (auto word : {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}) {
|
| +            unlikelyCandidates.append(word);
|
| +        }
|
| +    }
|
| +    // Class/id substrings that override a match against unlikelyCandidates.
|
| +    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
|
| +    if (highlyLikelyCandidates.isEmpty()) {
|
| +        for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
|
| +            highlyLikelyCandidates.append(word);
|
| +        }
|
| +    }
|
| +    // Filter out short P elements. The threshold is set to around 2 English sentences.
|
| +    const unsigned kParagraphLengthThreshold = 140;
|
| +
|
| +    // Saturate the scores to save time. The max is the score of 6 long paragraphs.
|
| +    const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold);
|
| +    const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturation);
|
| +    const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
|
| +
|
| +    for (Node& node : NodeTraversal::childrenOf(root)) {
|
| +        if (node.isTextNode()) {
|
| +            features.textContentLength += toText(node).length();
|
| +            continue;
|
| +        }
|
| +        if (!node.isElementNode()) {
|
| +            continue;
|
| +        }
|
| +
|
| +        features.elementCount++;
|
| +        Element& element = toElement(node);
|
| +        // Tracked per child: writing to |underListItem| here would leak the
|
| +        // flag into the later siblings of an <li>, which are not inside it.
|
| +        bool isListItem = false;
|
| +        if (element.hasTagName(aTag)) {
|
| +            features.anchorCount++;
|
| +        } else if (element.hasTagName(formTag)) {
|
| +            features.formCount++;
|
| +        } else if (element.hasTagName(inputTag)) {
|
| +            const HTMLInputElement& input = toHTMLInputElement(element);
|
| +            if (equalIgnoringCase(input.type(), InputTypeNames::text)) {
|
| +                features.textInputCount++;
|
| +            } else if (equalIgnoringCase(input.type(), InputTypeNames::password)) {
|
| +                features.passwordInputCount++;
|
| +            }
|
| +        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
|
| +            if (element.hasTagName(pTag)) {
|
| +                features.pCount++;
|
| +            } else {
|
| +                features.preCount++;
|
| +            }
|
| +            // Score the paragraph only if it is outside any list item, at
|
| +            // least one score is still below its cap, it is visible, and it
|
| +            // is not an unlikely candidate (unless it is also a highly
|
| +            // likely one).
|
| +            if (!underListItem
|
| +                && (features.mozScore < kMozScoreSaturation
|
| +                    || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation
|
| +                    || features.mozScoreAllLinear < kMozScoreAllLinearSaturation)
|
| +                && isVisible(element)
|
| +                && (!matchAttributes(element, unlikelyCandidates) || matchAttributes(element, highlyLikelyCandidates))
|
| +            ) {
|
| +                unsigned length = textContentLengthSaturated(element);
|
| +                if (length >= kParagraphLengthThreshold) {
|
| +                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
|
| +                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
|
| +                }
|
| +                features.mozScoreAllSqrt += sqrt(length);
|
| +                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
|
| +
|
| +                features.mozScoreAllLinear += length;
|
| +                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
|
| +            }
|
| +        } else if (element.hasTagName(liTag)) {
|
| +            isListItem = true;
|
| +        }
|
| +        collectFeatures(element, features, underListItem || isListItem);
|
| +    }
|
| +}
|
| +
|
| +// Returns true if |head| has a direct <meta> child whose name or "property"
|
| +// attribute is "og:type" and whose content equals "article", ignoring case.
|
| +bool hasOpenGraphArticle(const Element& head)
|
| +{
|
| +    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
|
| +    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
|
| +
|
| +    const Element* current = ElementTraversal::firstChild(head);
|
| +    while (current) {
|
| +        if (isHTMLMetaElement(*current)) {
|
| +            const HTMLMetaElement& metaElement = toHTMLMetaElement(*current);
|
| +            // Both the name attribute and the "property" attribute are
|
| +            // accepted as the carrier of "og:type".
|
| +            if ((metaElement.name() == ogType || metaElement.getAttribute(propertyAttr) == ogType)
|
| +                && equalIgnoringCase(metaElement.content(), "article"))
|
| +                return true;
|
| +        }
|
| +        current = ElementTraversal::nextSibling(*current);
|
| +    }
|
| +    return false;
|
| +}
|
| +
|
| +} // namespace
|
| +
|
| +// Computes the distillability features of |document|. The features are
|
| +// returned value-initialized when the document has no frame, when its frame
|
| +// is not the main frame, or when it lacks a body or a head element.
|
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
|
| +{
|
| +    WebDistillabilityFeatures features = WebDistillabilityFeatures();
|
| +
|
| +    const bool inMainFrame = document.frame() && document.frame()->isMainFrame();
|
| +    if (!inMainFrame)
|
| +        return features;
|
| +
|
| +    ASSERT(document.hasFinishedParsing());
|
| +
|
| +    if (!(document.body() && document.head()))
|
| +        return features;
|
| +
|
| +    // Body statistics come from a DOM walk; the Open Graph flag comes from
|
| +    // the <meta> children of the head.
|
| +    collectFeatures(*document.body(), features);
|
| +    features.openGraph = hasOpenGraphArticle(*document.head());
|
| +
|
| +    return features;
|
| +}
|
| +
|
| +}
|
|
|