Chromium Code Reviews| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..a8fe9e84c1de43e79c15495d012786401fb60f75 |
| --- /dev/null |
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| @@ -0,0 +1,233 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "config.h" |
| +#include "DocumentStatisticsCollector.h" |
| + |
| +#include "core/HTMLNames.h" |
| +#include "core/InputTypeNames.h" |
| +#include "core/css/CSSComputedStyleDeclaration.h" |
|
esprehn
2015/11/03 07:45:10
don't need this.
wychen
2015/11/03 08:59:49
Done.
|
| +#include "core/dom/ElementTraversal.h" |
| +#include "core/dom/NodeComputedStyle.h" |
| +#include "core/editing/iterators/TextIterator.h" |
|
esprehn
2015/11/03 07:45:10
don't need this.
wychen
2015/11/03 08:59:49
Done.
|
| +#include "core/frame/FrameHost.h" |
| +#include "core/html/HTMLHeadElement.h" |
| +#include "core/html/HTMLInputElement.h" |
| +#include "core/html/HTMLMetaElement.h" |
| +#include "public/platform/Platform.h" |
| +#include "public/platform/WebDistillability.h" |
| +#include "wtf/text/StringBuilder.h" |
| +#include "wtf/text/StringImpl.h" |
|
esprehn
2015/11/03 07:45:10
you don't need StringImpl, StringBuilder
wychen
2015/11/03 08:59:48
Done.
Just curious, did you use IDE to check unne
|
| + |
| +namespace blink { |
| + |
| +using namespace HTMLNames; |
| + |
| +namespace { |
| + |
| +// Saturate the length of a paragraph to save time. |
| +const int kTextContentLengthSaturation = 1000; |
| + |
| +// Sums the lengths of the text nodes found among |root| and its descendants. |
| +// The walk stops as soon as the running total exceeds |
| +// kTextContentLengthSaturation, in which case that cap is returned instead. |
| +unsigned textContentLengthSaturated(Element& root) |
| +{ |
| +    unsigned accumulated = 0; |
| +    // This skips shadow dom intentionally, to match the JavaScript implementation. |
|
esprehn
2015/11/03 07:45:10
Why?
wychen
2015/11/03 08:59:48
Done.
|
| +    for (Node& descendant : NodeTraversal::inclusiveDescendantsOf(root)) { |
| +        if (descendant.isTextNode()) { |
| +            accumulated += toText(descendant).length(); |
| +            // No need to keep walking once the cap has been passed. |
| +            if (accumulated > kTextContentLengthSaturation) |
| +                return kTextContentLengthSaturation; |
| +        } |
| +    } |
| +    return accumulated; |
| +} |
| + |
| +// Returns true when |element| has a computed style that is not display:none, |
| +// not visibility:hidden, and has a non-zero opacity. An element without a |
| +// computed style is reported as not visible. |
| +bool isVisible(Element& element) |
| +{ |
| +    const ComputedStyle* style = element.computedStyle(); |
| +    if (!style) |
| +        return false; |
| +    // An element can have a computed style and still be display:none, so this |
| +    // has to be a runtime check rather than an assertion. |
| +    if (style->display() == NONE) |
| +        return false; |
|
esprehn
2015/11/03 07:45:10
this assert is wrong, you can have a style and be
wychen
2015/11/03 08:59:49
Done.
|
| +    return ( |
| +        style->visibility() != HIDDEN |
| +        && style->opacity() != 0 |
| +    ); |
| +} |
| + |
| +// Returns true if at least one entry of |words| is found, ignoring case, |
| +// inside the class attribute or the id attribute of |element|. |
| +bool matchAttributes(Element& element, const Vector<String>& words) |
| +{ |
| +    const String& classValue = element.getClassAttribute(); |
| +    const String& idValue = element.getIdAttribute(); |
| +    for (const String& candidate : words) { |
| +        // The class attribute is checked first, then the id attribute. |
| +        if (classValue.findIgnoringCase(candidate) != WTF::kNotFound) |
| +            return true; |
| +        if (idValue.findIgnoringCase(candidate) != WTF::kNotFound) |
| +            return true; |
| +    } |
| +    return false; |
| +} |
| + |
| +// Recursively walks the children of |root| and accumulates statistics into |
| +// |features|: the total length of text nodes, the number of elements, anchors, |
| +// forms, text and password inputs, <p> and <pre> elements, and the three |
| +// saturating "moz" scores derived from the text length of <p>/<pre> elements. |
| +// A <p>/<pre> element only contributes to the scores when it is not under a |
| +// list item, is visible, and its class/id do not match an "unlikely" word |
| +// (unless they also match a "highly likely" word). |
| +// underListItem denotes that at least one of the ancestors is an <li> element. |
| +void collectFeatures(Element& root, WebDistillabilityFeatures& features, bool underListItem = false) |
| +{ |
| +    // Both word lists are filled lazily on the first call. |
| +    DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); |
| +    if (unlikelyCandidates.isEmpty()) { |
| +        auto words = { |
| +            "banner", |
| +            "combx", |
| +            "comment", |
| +            "community", |
| +            "disqus", |
| +            "extra", |
| +            "foot", |
| +            "header", |
| +            "menu", |
| +            "related", |
| +            "remark", |
| +            "rss", |
| +            "share", |
| +            "shoutbox", |
| +            "sidebar", |
| +            "skyscraper", |
| +            "sponsor", |
| +            "ad-break", |
| +            "agegate", |
| +            "pagination", |
| +            "pager", |
| +            "popup" |
| +        }; |
| +        for (auto word : words) { |
| +            unlikelyCandidates.append(word); |
| +        } |
| +    } |
| +    DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); |
| +    if (highlyLikelyCandidates.isEmpty()) { |
| +        auto words = { |
| +            "and", |
| +            "article", |
| +            "body", |
| +            "column", |
| +            "main", |
| +            "shadow" |
| +        }; |
| +        for (auto word : words) { |
|
esprehn
2015/11/03 07:45:10
I'd wrap this one too.
wychen
2015/11/03 08:59:48
Done.
|
| +            highlyLikelyCandidates.append(word); |
| +        } |
| +    } |
| +    // Filter out short P elements. The threshold is set to around 2 English sentences. |
| +    const unsigned kParagraphLengthThreshold = 140; |
| + |
| +    // Saturate the scores to save time. The max is the score of 6 long paragraphs. |
| +    const double kMozScoreSaturation = 6 * sqrt(kTextContentLengthSaturation - kParagraphLengthThreshold); |
| +    const double kMozScoreAllSqrtSaturation = 6 * sqrt(kTextContentLengthSaturation); |
| +    const double kMozScoreAllLinearSaturation = 6 * kTextContentLengthSaturation; |
| + |
| +    for (Node& node : NodeTraversal::childrenOf(root)) { |
| +        if (node.isTextNode()) { |
| +            features.textContentLength += toText(node).length(); |
|
esprehn
2015/11/03 07:45:10
this is going to add the length of every inline <s
wychen
2015/11/03 08:59:48
It is possible that innerTextLength/textContentLen
|
| +            continue; |
| +        } |
| +        if (!node.isElementNode()) { |
| +            continue; |
| +        } |
| + |
| +        features.elementCount++; |
| +        Element& element = toElement(node); |
| +        if (element.hasTagName(aTag)) { |
| +            features.anchorCount++; |
| +        } else if (element.hasTagName(formTag)) { |
| +            features.formCount++; |
| +        } else if (element.hasTagName(inputTag)) { |
| +            const HTMLInputElement& input = toHTMLInputElement(element); |
| +            // Per review, type() is always lowercase, so a plain == suffices. |
| +            if (input.type() == InputTypeNames::text) { |
|
esprehn
2015/11/03 07:45:10
ditto == InputTypeNames::text
wychen
2015/11/03 08:59:48
Good catch! I guess AtomicString == is faster.
|
| +                features.textInputCount++; |
| +            } else if (input.type() == InputTypeNames::password) { |
|
esprehn
2015/11/03 07:45:10
this is always lowercase, you can just do == ::pas
wychen
2015/11/03 08:59:48
Done.
|
| +                features.passwordInputCount++; |
| +            } |
| +        } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
| +            if (element.hasTagName(pTag)) { |
| +                features.pCount++; |
| +            } else { |
| +                features.preCount++; |
| +            } |
| +            // Score the paragraph only while at least one score is still below |
| +            // its saturation value; the cheap checks come before the style and |
| +            // attribute lookups. |
| +            if (!underListItem |
| +                && (features.mozScore < kMozScoreSaturation |
| +                    || features.mozScoreAllSqrt < kMozScoreAllSqrtSaturation |
| +                    || features.mozScoreAllLinear < kMozScoreAllLinearSaturation) |
| +                && isVisible(element) |
|
esprehn
2015/11/03 07:45:10
this is a crazy set of conditions, in blink we try
wychen
2015/11/03 08:59:48
Done.
|
| +                && (!matchAttributes(element, unlikelyCandidates) || matchAttributes(element, highlyLikelyCandidates)) |
| +                ) { |
| +                unsigned length = textContentLengthSaturated(element); |
| +                if (length >= kParagraphLengthThreshold) { |
| +                    features.mozScore += sqrt(length - kParagraphLengthThreshold); |
| +                    features.mozScore = std::min(features.mozScore, kMozScoreSaturation); |
| +                } |
| +                features.mozScoreAllSqrt += sqrt(length); |
| +                features.mozScoreAllSqrt = std::min(features.mozScoreAllSqrt, kMozScoreAllSqrtSaturation); |
| + |
| +                features.mozScoreAllLinear += length; |
| +                features.mozScoreAllLinear = std::min(features.mozScoreAllLinear, kMozScoreAllLinearSaturation); |
| +            } |
| +        } |
| +        // Only the subtree of an <li> is "under a list item". The flag is computed |
| +        // per child instead of overwriting underListItem, which would otherwise |
| +        // leak to every later sibling of the <li>. |
| +        collectFeatures(element, features, underListItem || element.hasTagName(liTag)); |
| +    } |
| +} |
| + |
| +// Scans the direct element children of |head| for a <meta> element whose name |
| +// or "property" attribute equals "og:type" and whose content equals "article" |
| +// (content is compared ignoring case). Returns true on the first such element. |
| +bool hasOpenGraphArticle(const Element& head) |
| +{ |
| +    DEFINE_STATIC_LOCAL(AtomicString, ogType, ("og:type")); |
| +    DEFINE_STATIC_LOCAL(AtomicString, propertyAttr, ("property")); |
| +    const Element* current = ElementTraversal::firstChild(head); |
| +    while (current) { |
| +        if (isHTMLMetaElement(*current)) { |
| +            const HTMLMetaElement& metaElement = toHTMLMetaElement(*current); |
| +            // The type may be declared through either name= or property=. |
| +            const bool declaresOgType = metaElement.name() == ogType || metaElement.getAttribute(propertyAttr) == ogType; |
| +            if (declaresOgType && equalIgnoringCase(metaElement.content(), "article")) |
| +                return true; |
| +        } |
| +        current = ElementTraversal::nextSibling(*current); |
| +    } |
| +    return false; |
| +} |
| + |
| +// Reports whether the visual viewport of |document|'s frame host says desktop |
| +// workarounds should be disabled. Returns false when there is no frame host. |
| +bool isMobileFriendly(Document& document) |
| +{ |
| +    if (FrameHost* host = document.frameHost()) |
| +        return host->visualViewport().shouldDisableDesktopWorkarounds(); |
| +    return false; |
| +} |
| + |
| +} // namespace |
| + |
| +// Collects the distillability features of |document|. A value-initialized |
| +// result is returned when the document has no frame, is not in the main frame, |
| +// or lacks a <body> or <head>. For a mobile-friendly document only the |
| +// isMobileFriendly flag is set. Otherwise the DOM under <body> is traversed, |
| +// <head> is checked for an Open Graph article tag, and the time spent is |
| +// reported to the "WebCore.DistillabilityUs" histogram. |
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
| +{ |
| +    WebDistillabilityFeatures features = WebDistillabilityFeatures(); |
| + |
| +    const bool inMainFrame = document.frame() && document.frame()->isMainFrame(); |
| +    if (!inMainFrame) |
| +        return features; |
| + |
| +    ASSERT(document.hasFinishedParsing()); |
| + |
|
esprehn
2015/11/03 07:45:10
TRACE_EVENT0("DocumentStatisticsCollector::collect
wychen
2015/11/03 08:59:49
Done.
|
| +    HTMLElement* body = document.body(); |
|
esprehn
2015/11/03 07:45:10
needs a trace macro
wychen
2015/11/03 08:59:48
I don't understand this comment. Did you mean DEFI
|
| +    HTMLElement* head = document.head(); |
| + |
| +    // Both elements are required by the steps below. |
| +    if (!(body && head)) |
| +        return features; |
| + |
| +    if (isMobileFriendly(document)) { |
| +        features.isMobileFriendly = true; |
| +        return features; |
| +    } |
| + |
| +    const double traversalStart = monotonicallyIncreasingTime(); |
| + |
| +    // Traverse the DOM tree and collect statistics. |
| +    collectFeatures(*body, features); |
| +    features.openGraph = hasOpenGraphArticle(*head); |
| + |
| +    const double elapsed = monotonicallyIncreasingTime() - traversalStart; |
| +    const int histogramSample = static_cast<int>(1e6 * elapsed); |
| +    Platform::current()->histogramCustomCounts("WebCore.DistillabilityUs", histogramSample, 1, 1000000, 50); |
| + |
| +    return features; |
| +} |
| + |
| +} |