| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp | 
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp | 
| new file mode 100644 | 
| index 0000000000000000000000000000000000000000..695cecacf91063c9cf9df59f066948ce7c75d934 | 
| --- /dev/null | 
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp | 
| @@ -0,0 +1,253 @@ | 
| +// Copyright 2015 The Chromium Authors. All rights reserved. | 
| +// Use of this source code is governed by a BSD-style license that can be | 
| +// found in the LICENSE file. | 
| + | 
| +#include "config.h" | 
| +#include "DocumentStatisticsCollector.h" | 
| + | 
| +#include "core/HTMLNames.h" | 
| +#include "core/InputTypeNames.h" | 
| +#include "core/dom/ElementTraversal.h" | 
| +#include "core/dom/NodeComputedStyle.h" | 
| +#include "core/dom/Text.h" | 
| +#include "core/frame/FrameHost.h" | 
| +#include "core/html/HTMLHeadElement.h" | 
| +#include "core/html/HTMLInputElement.h" | 
| +#include "core/html/HTMLMetaElement.h" | 
| +#include "public/platform/Platform.h" | 
| +#include "public/platform/WebDistillability.h" | 
| + | 
| +namespace blink { | 
| + | 
| +using namespace HTMLNames; | 
| + | 
| +namespace { | 
| + | 
| +// Saturate the length of a paragraph to save time. | 
| +const int kTextContentLengthSaturation = 1000; | 
| + | 
| +// Filter out short P elements. The threshold is set to around 2 English sentences. | 
| +const unsigned kParagraphLengthThreshold = 140; | 
| + | 
| +// Saturate the scores to save time. The max is the score of 6 long paragraphs. | 
| +const double kMozScoreSaturation = 175.954539583; // 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold) | 
| +const double kMozScoreAllSqrtSaturation = 189.73665961; // 6 * sqrt(kTextContentLengthSaturation); | 
| +const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation; | 
| + | 
| +unsigned textContentLengthSaturated(Element& root) | 
| +{ | 
| +    unsigned length = 0; | 
| +    // This skips shadow DOM intentionally, to match the JavaScript implementation. | 
| +    // We would like to use the same statistics extracted by the JavaScript implementation | 
| +    // on iOS, and JavaScript cannot peek deeply into shadow DOM except on modern Chrome | 
| +    // versions. | 
| +    // Given shadow DOM rarely appears in <P> elements in long-form articles, the overall | 
| +    // accuracy should not be largely affected. | 
| +    for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) { | 
| +        if (!node.isTextNode()) { | 
| +            continue; | 
| +        } | 
| +        length += toText(node).length(); | 
| +        if (length > kTextContentLengthSaturation) { | 
| +            return kTextContentLengthSaturation; | 
| +        } | 
| +    } | 
| +    return length; | 
| +} | 
| + | 
| +bool isVisible(const Element& element) | 
| +{ | 
| +    const ComputedStyle* style = element.computedStyle(); | 
| +    if (!style) | 
| +        return false; | 
| +    return ( | 
| +        style->display() != NONE | 
| +        && style->visibility() != HIDDEN | 
| +        && style->opacity() != 0 | 
| +    ); | 
| +} | 
| + | 
| +bool matchAttributes(const Element& element, const Vector<String>& words) | 
| +{ | 
| +    const String& classes = element.getClassAttribute(); | 
| +    const String& id = element.getIdAttribute(); | 
| +    for (const String& word : words) { | 
| +        if (classes.findIgnoringCase(word) != WTF::kNotFound | 
| +            || id.findIgnoringCase(word) != WTF::kNotFound) { | 
| +            return true; | 
| +        } | 
| +    } | 
| +    return false; | 
| +} | 
| + | 
| +bool isGoodForScoring(const WebDistillabilityFeatures& features, const Element& element) | 
| +{ | 
| +    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); | 
| +    if (unlikelyCandidates.isEmpty()) { | 
| +        auto words = { | 
| +            "banner", | 
| +            "combx", | 
| +            "comment", | 
| +            "community", | 
| +            "disqus", | 
| +            "extra", | 
| +            "foot", | 
| +            "header", | 
| +            "menu", | 
| +            "related", | 
| +            "remark", | 
| +            "rss", | 
| +            "share", | 
| +            "shoutbox", | 
| +            "sidebar", | 
| +            "skyscraper", | 
| +            "sponsor", | 
| +            "ad-break", | 
| +            "agegate", | 
| +            "pagination", | 
| +            "pager", | 
| +            "popup" | 
| +        }; | 
| +        for (auto word : words) { | 
| +            unlikelyCandidates.append(word); | 
| +        } | 
| +    } | 
| +    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); | 
| +    if (highlyLikelyCandidates.isEmpty()) { | 
| +        auto words = { | 
| +            "and", | 
| +            "article", | 
| +            "body", | 
| +            "column", | 
| +            "main", | 
| +            "shadow" | 
| +        }; | 
| +        for (auto word : words) { | 
| +            highlyLikelyCandidates.append(word); | 
| +        } | 
| +    } | 
| + | 
| +    if (!isVisible(element)) | 
| +        return false; | 
| +    if (features.mozScore >= kMozScoreSaturation | 
| +        && features.mozScoreAllSqrt >= kMozScoreAllSqrtSaturation | 
| +        && features.mozScoreAllLinear >= kMozScoreAllLinearSaturation) | 
| +        return false; | 
| +    if (matchAttributes(element, unlikelyCandidates) | 
| +        && !matchAttributes(element, highlyLikelyCandidates)) | 
| +        return false; | 
| +    return true; | 
| +} | 
| + | 
| +// underListItem denotes that at least one of the ancesters is <li> element. | 
| +void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false) | 
| +{ | 
| +    for (Node& node : NodeTraversal::childrenOf(root)) { | 
| +        if (!node.isElementNode()) { | 
| +            continue; | 
| +        } | 
| + | 
| +        features.elementCount++; | 
| +        Element& element = toElement(node); | 
| +        if (element.hasTagName(aTag)) { | 
| +            features.anchorCount++; | 
| +        } else if (element.hasTagName(formTag)) { | 
| +            features.formCount++; | 
| +        } else if (element.hasTagName(inputTag)) { | 
| +            const HTMLInputElement& input = toHTMLInputElement(element); | 
| +            if (input.type() == InputTypeNames::text) { | 
| +                features.textInputCount++; | 
| +            } else if (input.type() == InputTypeNames::password) { | 
| +                features.passwordInputCount++; | 
| +            } | 
| +        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { | 
| +            if (element.hasTagName(pTag)) { | 
| +                features.pCount++; | 
| +            } else { | 
| +                features.preCount++; | 
| +            } | 
| +            if (!underListItem && isGoodForScoring(features, element)) { | 
| +                unsigned length = textContentLengthSaturated(element); | 
| +                if (length >= kParagraphLengthThreshold) { | 
| +                    features.mozScore += sqrt(length - kParagraphLengthThreshold); | 
| +                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation); | 
| +                } | 
| +                features.mozScoreAllSqrt += sqrt(length); | 
| +                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation); | 
| + | 
| +                features.mozScoreAllLinear += length; | 
| +                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation); | 
| +            } | 
| +        } else if (element.hasTagName(liTag)) { | 
| +            underListItem = true; | 
| +        } | 
| +        collectFeatures(element, features, underListItem); | 
| +    } | 
| +} | 
| + | 
| +bool hasOpenGraphArticle(const Element& head) | 
| +{ | 
| +    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type")); | 
| +    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property")); | 
| +    for (const Element* child = ElementTraversal::firstChild(head); child; child = ElementTraversal::nextSibling(*child)) { | 
| +        if (!isHTMLMetaElement(*child)) | 
| +            continue; | 
| +        const HTMLMetaElement& meta = toHTMLMetaElement(*child); | 
| + | 
| +        if (meta.name() == ogType || meta.getAttribute(propertyAttr) == ogType) { | 
| +            if (equalIgnoringCase(meta.content(), "article")) { | 
| +                return true; | 
| +            } | 
| +        } | 
| +    } | 
| +    return false; | 
| +} | 
| + | 
| +bool isMobileFriendly(Document& document) | 
| +{ | 
| +    if (FrameHost* frameHost = document.frameHost()) | 
| +        return frameHost->visualViewport().shouldDisableDesktopWorkarounds(); | 
| +    return false; | 
| +} | 
| + | 
| +} // namespace | 
| + | 
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) | 
| +{ | 
| +    TRACE_EVENT0("blink", "DocumentStatisticsCollector::collectStatistics"); | 
| + | 
| +    WebDistillabilityFeatures features = WebDistillabilityFeatures(); | 
| + | 
| +    if (!document.frame() || !document.frame()->isMainFrame()) | 
| +        return features; | 
| + | 
| +    ASSERT(document.hasFinishedParsing()); | 
| + | 
| +    HTMLElement* body = document.body(); | 
| +    HTMLElement* head = document.head(); | 
| + | 
| +    if (!body || !head) | 
| +        return features; | 
| + | 
| +    if (isMobileFriendly(document)) { | 
| +        features.isMobileFriendly = true; | 
| +        // We only trigger Reader Mode on non-mobile-friendly pages for now. | 
| +        return features; | 
| +    } | 
| + | 
| +    double startTime = monotonicallyIncreasingTime(); | 
| + | 
| +    // This should be cheap since collectStatistics is only called right after layout. | 
| +    document.updateLayoutTreeIfNeeded(); | 
| + | 
| +    // Traverse the DOM tree and collect statistics. | 
| +    collectFeatures(*body, features); | 
| +    features.openGraph = hasOpenGraphArticle(*head); | 
| + | 
| +    double elapsedTime = monotonicallyIncreasingTime() - startTime; | 
| +    Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", static_cast<int>(1e6 * elapsedTime), 1, 1000000, 50); | 
| + | 
| +    return features; | 
| +} | 
| + | 
| +} | 
|  |