| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
 | 
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
 | 
| new file mode 100644
 | 
| index 0000000000000000000000000000000000000000..4ddfe137383430834d0bd97e4a47fbe3d16931a5
 | 
| --- /dev/null
 | 
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
 | 
| @@ -0,0 +1,251 @@
 | 
| +// Copyright 2015 The Chromium Authors. All rights reserved.
 | 
| +// Use of this source code is governed by a BSD-style license that can be
 | 
| +// found in the LICENSE file.
 | 
| +
 | 
| +#include "config.h"
 | 
| +#include "DocumentStatisticsCollector.h"
 | 
| +
 | 
| +#include "core/HTMLNames.h"
 | 
| +#include "core/InputTypeNames.h"
 | 
| +#include "core/dom/ElementTraversal.h"
 | 
| +#include "core/dom/NodeComputedStyle.h"
 | 
| +#include "core/dom/Text.h"
 | 
| +#include "core/frame/FrameHost.h"
 | 
| +#include "core/html/HTMLHeadElement.h"
 | 
| +#include "core/html/HTMLInputElement.h"
 | 
| +#include "core/html/HTMLMetaElement.h"
 | 
| +#include "public/platform/Platform.h"
 | 
| +#include "public/platform/WebDistillability.h"
 | 
| +
 | 
| +namespace blink {
 | 
| +
 | 
| +using namespace HTMLNames;
 | 
| +
 | 
| +namespace {
 | 
| +
 | 
| +// Cap on the text length counted for one paragraph; counting stops once it is exceeded, to save time.
 | 
| +const int kTextContentLengthSaturation = 1000;
 | 
| +
 | 
| +// Filter out short P/PRE elements from mozScore. The threshold is set to around 2 English sentences.
 | 
| +const unsigned kParagraphLengthThreshold = 140;
 | 
| +
 | 
| +// Saturate the scores to save time. Each max is the score of 6 paragraphs of saturated length.
 | 
| +const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold)
 | 
| +const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation)
 | 
| +const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation;
 | 
| +
 | 
| +// Returns the summed length of every text node found under |root| (inclusive),
 | 
| +// clamped to kTextContentLengthSaturation.
 | 
| +unsigned textContentLengthSaturated(Element& root)
 | 
| +{
 | 
| +    // Shadow DOM is skipped on purpose so the numbers match the JavaScript
 | 
| +    // implementation, whose statistics we would like to reuse on iOS; JavaScript
 | 
| +    // cannot peek deeply into shadow DOM except on modern Chrome versions.
 | 
| +    // Shadow DOM rarely appears in <P> elements of long-form articles, so the
 | 
| +    // overall accuracy should not be largely affected.
 | 
| +    unsigned total = 0;
 | 
| +    for (Node& current : NodeTraversal::inclusiveDescendantsOf(root)) {
 | 
| +        if (current.isTextNode()) {
 | 
| +            total += toText(current).length();
 | 
| +            // Bail out as soon as the cap is exceeded; the rest cannot matter.
 | 
| +            if (total > kTextContentLengthSaturation)
 | 
| +                return kTextContentLengthSaturation;
 | 
| +        }
 | 
| +    }
 | 
| +    return total;
 | 
| +}
 | 
| +
 | 
| +// Returns true when |element| has a computed style that neither sets
 | 
| +// display to NONE, nor visibility to HIDDEN, nor opacity to 0.
 | 
| +bool isVisible(const Element& element)
 | 
| +{
 | 
| +    const ComputedStyle* computed = element.computedStyle();
 | 
| +    // No computed style available: treat the element as not visible.
 | 
| +    if (!computed)
 | 
| +        return false;
 | 
| +    if (computed->display() == NONE)
 | 
| +        return false;
 | 
| +    if (computed->visibility() == HIDDEN)
 | 
| +        return false;
 | 
| +    return computed->opacity() != 0;
 | 
| +}
 | 
| +
 | 
| +// Returns true if any entry of |words| occurs, ignoring case, as a substring of
 | 
| +// the value returned by getClassAttribute() or getIdAttribute() for |element|.
 | 
| +bool matchAttributes(const Element& element, const Vector<String>& words)
 | 
| +{
 | 
| +    const String& classValue = element.getClassAttribute();
 | 
| +    const String& idValue = element.getIdAttribute();
 | 
| +    for (const String& candidate : words) {
 | 
| +        if (classValue.findIgnoringCase(candidate) != WTF::kNotFound)
 | 
| +            return true;
 | 
| +        if (idValue.findIgnoringCase(candidate) != WTF::kNotFound)
 | 
| +            return true;
 | 
| +    }
 | 
| +    return false;
 | 
| +}
 | 
| +
 | 
| +// Decides whether the text of |element| should contribute to the scores in
 | 
| +// |features|. It is rejected when it sits under a list item, is not visible,
 | 
| +// all three scores are already saturated, or its class/id matches an
 | 
| +// "unlikely" word without also matching a "highly likely" word.
 | 
| +bool isGoodForScoring(bool underListItem, const WebDistillabilityFeatures& features, const Element& element)
 | 
| +{
 | 
| +    // Both word lists are filled lazily on the first call.
 | 
| +    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
 | 
| +    if (unlikelyCandidates.isEmpty()) {
 | 
| +        const char* const unlikelyWords[] = {
 | 
| +            "banner",
 | 
| +            "combx",
 | 
| +            "comment",
 | 
| +            "community",
 | 
| +            "disqus",
 | 
| +            "extra",
 | 
| +            "foot",
 | 
| +            "header",
 | 
| +            "menu",
 | 
| +            "related",
 | 
| +            "remark",
 | 
| +            "rss",
 | 
| +            "share",
 | 
| +            "shoutbox",
 | 
| +            "sidebar",
 | 
| +            "skyscraper",
 | 
| +            "sponsor",
 | 
| +            "ad-break",
 | 
| +            "agegate",
 | 
| +            "pagination",
 | 
| +            "pager",
 | 
| +            "popup"
 | 
| +        };
 | 
| +        for (const char* entry : unlikelyWords)
 | 
| +            unlikelyCandidates.append(entry);
 | 
| +    }
 | 
| +    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
 | 
| +    if (highlyLikelyCandidates.isEmpty()) {
 | 
| +        const char* const likelyWords[] = {
 | 
| +            "and",
 | 
| +            "article",
 | 
| +            "body",
 | 
| +            "column",
 | 
| +            "main",
 | 
| +            "shadow"
 | 
| +        };
 | 
| +        for (const char* entry : likelyWords)
 | 
| +            highlyLikelyCandidates.append(entry);
 | 
| +    }
 | 
| +
 | 
| +    if (underListItem || !isVisible(element))
 | 
| +        return false;
 | 
| +
 | 
| +    // Once every score has reached its cap, further scoring cannot change anything.
 | 
| +    const bool allScoresSaturated = features.mozScore >= kMozScoreSaturation
 | 
| +        && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation
 | 
| +        && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation;
 | 
| +    if (allScoresSaturated)
 | 
| +        return false;
 | 
| +
 | 
| +    // An "unlikely" class/id disqualifies the element unless a "highly likely" word is present too.
 | 
| +    return !matchAttributes(element, unlikelyCandidates) || matchAttributes(element, highlyLikelyCandidates);
 | 
| +}
 | 
| +
 | 
| +// Recursively walks the element children of |root| and accumulates element
 | 
| +// counts and text scores into |features|.
 | 
| +//
 | 
| +// underListItem denotes that at least one of the ancestors is an <li> element.
 | 
| +// <p>/<pre> elements in that situation are still counted but never scored.
 | 
| +void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
 | 
| +{
 | 
| +    for (Node& node : NodeTraversal::childrenOf(root)) {
 | 
| +        if (!node.isElementNode()) {
 | 
| +            continue;
 | 
| +        }
 | 
| +
 | 
| +        // Tracked per child. Writing to |underListItem| here instead would leak
 | 
| +        // the flag to the following siblings of an <li>, which are not inside it.
 | 
| +        bool isListItem = false;
 | 
| +
 | 
| +        features.elementCount++;
 | 
| +        Element& element = toElement(node);
 | 
| +        if (element.hasTagName(aTag)) {
 | 
| +            features.anchorCount++;
 | 
| +        } else if (element.hasTagName(formTag)) {
 | 
| +            features.formCount++;
 | 
| +        } else if (element.hasTagName(inputTag)) {
 | 
| +            // Only text and password inputs are counted; other input types are ignored.
 | 
| +            const HTMLInputElement& input = toHTMLInputElement(element);
 | 
| +            if (input.type() == InputTypeNames::text) {
 | 
| +                features.textInputCount++;
 | 
| +            } else if (input.type() == InputTypeNames::password) {
 | 
| +                features.passwordInputCount++;
 | 
| +            }
 | 
| +        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
 | 
| +            if (element.hasTagName(pTag)) {
 | 
| +                features.pCount++;
 | 
| +            } else {
 | 
| +                features.preCount++;
 | 
| +            }
 | 
| +            if (isGoodForScoring(underListItem, features, element)) {
 | 
| +                unsigned length = textContentLengthSaturated(element);
 | 
| +                // mozScore only rewards the part of a paragraph above the threshold;
 | 
| +                // the guard also keeps the unsigned subtraction from wrapping.
 | 
| +                if (length >= kParagraphLengthThreshold) {
 | 
| +                    features.mozScore += sqrt(length - kParagraphLengthThreshold);
 | 
| +                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation);
 | 
| +                }
 | 
| +                // The two "All" scores take every scored paragraph, short ones included.
 | 
| +                features.mozScoreAllSqrt += sqrt(length);
 | 
| +                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation);
 | 
| +
 | 
| +                features.mozScoreAllLinear += length;
 | 
| +                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation);
 | 
| +            }
 | 
| +        } else if (element.hasTagName(liTag)) {
 | 
| +            isListItem = true;
 | 
| +        }
 | 
| +        // Descendants are under a list item if this element is one or an ancestor was.
 | 
| +        collectFeatures(element, features, underListItem || isListItem);
 | 
| +    }
 | 
| +}
 | 
| +
 | 
| +// Returns true if a direct <meta> child of |head| whose name or "property"
 | 
| +// attribute equals "og:type" has content equal to "article", ignoring case.
 | 
| +bool hasOpenGraphArticle(const Element& head)
 | 
| +{
 | 
| +    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type"));
 | 
| +    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property"));
 | 
| +    const Element* current = ElementTraversal::firstChild(head);
 | 
| +    while (current) {
 | 
| +        if (isHTMLMetaElement(*current)) {
 | 
| +            const HTMLMetaElement& metaElement = toHTMLMetaElement(*current);
 | 
| +            const bool declaresOgType = metaElement.name() == ogType || metaElement.getAttribute(propertyAttr) == ogType;
 | 
| +            if (declaresOgType && equalIgnoringCase(metaElement.content(), "article"))
 | 
| +                return true;
 | 
| +        }
 | 
| +        current = ElementTraversal::nextSibling(*current);
 | 
| +    }
 | 
| +    return false;
 | 
| +}
 | 
| +
 | 
| +// Reports the visual viewport's shouldDisableDesktopWorkarounds() value for
 | 
| +// |document|, or false when the document has no FrameHost.
 | 
| +bool isMobileFriendly(Document& document)
 | 
| +{
 | 
| +    if (FrameHost* host = document.frameHost())
 | 
| +        return host->visualViewport().shouldDisableDesktopWorkarounds();
 | 
| +    return false;
 | 
| +}
 | 
| +
 | 
| +} // namespace
 | 
| +
 | 
| +// Collects distillability statistics for |document|. Value-initialized features
 | 
| +// are returned untouched when the document is not in the main frame or lacks a
 | 
| +// body or head. For a mobile-friendly page only isMobileFriendly is set and
 | 
| +// the DOM traversal is skipped.
 | 
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
 | 
| +{
 | 
| +    TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics");
 | 
| +
 | 
| +    WebDistillabilityFeatures features = WebDistillabilityFeatures();
 | 
| +
 | 
| +    // Only a document attached to the main frame is analyzed.
 | 
| +    if (!document.frame())
 | 
| +        return features;
 | 
| +    if (!document.frame()->isMainFrame())
 | 
| +        return features;
 | 
| +
 | 
| +    ASSERT(document.hasFinishedParsing());
 | 
| +
 | 
| +    HTMLElement* bodyElement = document.body();
 | 
| +    HTMLElement* headElement = document.head();
 | 
| +    if (!bodyElement || !headElement)
 | 
| +        return features;
 | 
| +
 | 
| +    if (isMobileFriendly(document)) {
 | 
| +        features.isMobileFriendly = true;
 | 
| +        return features;
 | 
| +    }
 | 
| +
 | 
| +    const double traversalStart = monotonicallyIncreasingTime();
 | 
| +
 | 
| +    // Walk the body for element counts and scores, then look for og:type in the head.
 | 
| +    collectFeatures(*bodyElement, features);
 | 
| +    features.openGraph = hasOpenGraphArticle(*headElement);
 | 
| +
 | 
| +    // The measured duration is scaled by 1e6 before being reported to the histogram.
 | 
| +    const double duration = monotonicallyIncreasingTime() - traversalStart;
 | 
| +    const int scaledDuration = static_cast<int>(1e6 * duration);
 | 
| +    Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", scaledDuration, 1, 1000000, 50);
 | 
| +
 | 
| +    return features;
 | 
| +}
 | 
| +
 | 
| +}
 | 
| 
 |