Chromium Code Reviews| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..198407839a2ed689af80f49f41c919082ada4c3a |
| --- /dev/null |
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| @@ -0,0 +1,230 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "config.h" |
| +#include "DocumentStatisticsCollector.h" |
| + |
| +#include "core/HTMLNames.h" |
| +#include "core/css/CSSComputedStyleDeclaration.h" |
| +#include "core/editing/iterators/TextIterator.h" |
| +#include "core/html/HTMLHeadElement.h" |
| +#include "core/html/HTMLInputElement.h" |
| +#include "core/html/HTMLMetaElement.h" |
| + |
| +// TODO(wychen): The following lines will be gone before landing. |
| +#include "core/inspector/ConsoleMessage.h" |
| + |
| +#include "public/platform/WebDistillability.h" |
| +#include "wtf/text/StringBuilder.h" |
| +#include "wtf/text/StringImpl.h" |
| + |
| +namespace blink { |
| + |
| +using namespace HTMLNames; |
| + |
| +namespace { |
| + |
| +unsigned trimmedTextContentLength(Element& root) |
|
esprehn
2015/10/26 21:43:09
Do pages usually have enough whitespace for this t
wychen
2015/10/27 23:52:12
Trimming the white spaces should make this less no
|
| +{ |
| + int firstNonWhite = -1, lastNonWhite = -1; |
| + unsigned position = 0; |
| + // TODO(wychen): scan backwards for lastNonWhite should be much faster in practice. |
| + for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) { |
|
esprehn
2015/10/26 21:43:08
This doesn't understand shadow dom. ex.
<p><conte
wychen
2015/10/27 23:52:12
The visibility model used here is the same as Java
|
| + if (!node.isTextNode()) { |
| + continue; |
| + } |
| + const String& text = toText(node).data(); |
| + for (unsigned i = 0; i < text.length(); i++) { |
| + if (!isSpaceOrNewline(text[i])) { |
| + if (firstNonWhite < 0) { |
| + firstNonWhite = lastNonWhite = i + position; |
| + } else { |
| + lastNonWhite = i + position; |
| + } |
| + } |
| + } |
| + position += text.length(); |
| + } |
| + if (firstNonWhite < 0) { |
| + return 0; |
| + } |
| + return lastNonWhite - firstNonWhite + 1; |
| +} |
| + |
| +unsigned innerTextLength(Element& root) |
| +{ |
| + unsigned length = 0; |
| + EphemeralRange range = EphemeralRange::rangeOfContents(root); |
| + TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPosition(), TextIteratorForInnerText); |
|
esprehn
2015/10/26 21:43:09
Why do you care about innerText length at all? Tha
wychen
2015/10/27 23:52:12
The length ratio between innerText and textContent
|
| + for (; !it.atEnd(); it.advance()) { |
| + length += it.length(); |
| + } |
| + return length; |
| +} |
| + |
| +bool isVisible(Element& element) |
| +{ |
| + const blink::ComputedStyle* style = element.ensureComputedStyle(); |
|
esprehn
2015/10/26 21:43:08
this forces a style computation on elements that w
wychen
2015/10/27 23:52:12
This statistics collection happens right after the
|
| + return !( |
|
esprehn
2015/10/26 21:43:09
run demorgans
wychen
2015/10/27 23:52:12
Done.
|
| + style->display() == NONE |
| + || style->visibility() == HIDDEN |
| + || style->opacity() == 0 |
| + ); |
| +} |
| + |
| +bool matchName(Element& element, const Vector<String>& words) |
| +{ |
| + if (element.hasClass()) { |
| + const String& hay = element.getClassAttribute(); |
| + for (const String& word: words) { |
|
esprehn
2015/10/26 21:43:09
missing space
wychen
2015/10/27 23:52:12
Done.
|
| + if (hay.findIgnoringCase(word) != WTF::kNotFound) { |
| + return true; |
| + } |
| + } |
| + } |
| + if (UNLIKELY(element.hasID())) { |
|
esprehn
2015/10/26 21:43:09
remove UNLIKELY
wychen
2015/10/27 23:52:12
Done.
|
| + const String& hay = element.getIdAttribute(); |
|
esprehn
2015/10/26 21:43:09
these already have a hasID() check in them, so I'd
wychen
2015/10/27 23:52:11
Done.
|
| + for (const String& word: words) { |
|
esprehn
2015/10/26 21:43:08
missing space
wychen
2015/10/27 23:52:12
Done.
|
| + if (hay.findIgnoringCase(word) != WTF::kNotFound) { |
| + return true; |
| + } |
| + } |
| + } |
| + return false; |
| +} |
| + |
| +// underListItem denotes that at least one of the ancesters is <li> element. |
| +void walk(Element& root, WebDistillabilityFeatures& features, bool underListItem = false) |
|
esprehn
2015/10/26 21:43:08
needs a better name. collectFeatures?
wychen
2015/10/27 23:52:12
Done.
|
| +{ |
| + DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ()); |
| + if (unlikelyCandidates.size() == 0) { |
|
esprehn
2015/10/26 21:43:09
isEmpty()
wychen
2015/10/27 23:52:11
Done.
|
| + for (auto word : {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}) { |
| + unlikelyCandidates.append(word); |
| + } |
| + } |
| + DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ()); |
| + if (highlyLikelyCandidates.size() == 0) { |
|
esprehn
2015/10/26 21:43:08
isEmpty()
wychen
2015/10/27 23:52:12
Done.
|
| + for (auto word : {"and", "article", "body", "column", "main", "shadow"}) { |
| + highlyLikelyCandidates.append(word); |
| + } |
| + } |
| + const unsigned kParagraphLengthThreshold = 140; |
|
esprehn
2015/10/26 21:43:08
why did you pick 140? Add a comment.
wychen
2015/10/27 23:52:12
Done.
|
| + |
| + for (Node& node : NodeTraversal::childrenOf(root)) { |
| + if (node.isTextNode()) { |
| + features.textContentLength += toText(node).length(); |
| + continue; |
| + } |
| + if (!node.isElementNode()) { |
| + continue; |
| + } |
| + |
| + features.elementCount++; |
| + Element& element = toElement(node); |
| + if (element.hasTagName(aTag)) { |
| + features.anchorCount++; |
| + } else if (element.hasTagName(formTag)) { |
| + features.formCount++; |
| + } else if (element.hasTagName(inputTag)) { |
| + const HTMLInputElement& input = toHTMLInputElement(element); |
| + if (equalIgnoringCase(input.type(), "text")) { |
|
esprehn
2015/10/26 21:43:09
input.type() == InputTypeNames::text
wychen
2015/10/27 23:52:12
Done.
|
| + features.textInputCount++; |
| + } else if (equalIgnoringCase(input.type(), "password")) { |
|
esprehn
2015/10/26 21:43:08
== InputTypeNames::password
wychen
2015/10/27 23:52:12
Done.
|
| + features.passwordInputCount++; |
| + } |
| + } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
| + if (element.hasTagName(pTag)) { |
| + features.pCount++; |
| + } else { |
| + features.preCount++; |
| + } |
| + if (!underListItem && isVisible(element) |
| + && (!matchName(element, unlikelyCandidates) || matchName(element, highlyLikelyCandidates))) { |
|
esprehn
2015/10/26 21:43:09
matchAttributes? It's not really related to name a
wychen
2015/10/27 23:52:12
Done.
|
| + unsigned length = trimmedTextContentLength(element); |
| + if (length >= kParagraphLengthThreshold) { |
| + features.mozScore += sqrt(length - kParagraphLengthThreshold); |
| + } |
| + features.mozScoreAllSqrt += sqrt(length); |
| + features.mozScoreAllLinear += length; |
| + } |
| + } |
| + walk(element, features, element.hasTagName(liTag) || underListItem); |
|
esprehn
2015/10/26 21:43:09
this checks hasTagName(liTag) for every element, e
wychen
2015/10/27 23:52:12
Done.
|
| + } |
| +} |
| + |
| +bool hasOpenGraphArticle(const Element& head) |
| +{ |
| + for (const Node& node : NodeTraversal::childrenOf(head)) { |
|
esprehn
2015/10/26 21:43:09
for (const Element* child = ElementTraversal::firs
wychen
2015/10/27 23:52:12
Done.
|
| + if (!node.isElementNode()) |
| + continue; |
| + const Element& element = toElement(node); |
| + if (!isHTMLMetaElement(element)) |
| + continue; |
| + const HTMLMetaElement& meta = toHTMLMetaElement(element); |
| + if (meta.name() == "og:type" || element.getAttribute("property") == "og:type") { |
|
esprehn
2015/10/26 21:43:09
You want to declare static local AtomicString vari
wychen
2015/10/27 23:52:12
Done.
|
| + if (equalIgnoringCase(meta.content(), "article")) { |
| + return true; |
| + } |
| + } |
| + } |
| + return false; |
| +} |
| + |
| +} // namespace |
| + |
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
| +{ |
| + WebDistillabilityFeatures features = WebDistillabilityFeatures(); |
| + |
| + if (!document.frame() || !document.frame()->isMainFrame()) |
| + return features; |
| + |
| + ASSERT(document.hasFinishedParsing()); |
| + |
| + if (!document.body() || !document.head()) |
| + return features; |
| + |
| + // First, traverse the DOM tree and collect statistics. |
| + walk(*document.body(), features); |
| + features.openGraph = hasOpenGraphArticle(*document.head()); |
| + |
| + // Next, traverse the Layout tree and collect statistics on innerText length. |
|
esprehn
2015/10/26 21:43:08
this needs to do document->updateLayout() so it's
wychen
2015/10/27 23:52:12
Done. I'm curious when should updateLayoutIgnorePe
|
| + features.innerTextLength += innerTextLength(*document.body()); |
|
esprehn
2015/10/26 21:43:09
this really seems unnecessary, you can just collec
wychen
2015/10/27 23:52:12
There seems to be much more than visibility in Tex
|
| + |
| + // The following DISTILLER_NDEBUG section would be gone when landing. |
| +#ifndef DISTILLER_NDEBUG |
| + StringBuilder message; |
| + message.append("openGraph: "); |
| + message.appendNumber(features.openGraph); |
| + message.append(", elementCount: "); |
| + message.appendNumber(features.elementCount); |
| + message.append(", anchorCount: "); |
| + message.appendNumber(features.anchorCount); |
| + message.append(", formCount: "); |
| + message.appendNumber(features.formCount); |
| + message.append(", textInputCount: "); |
| + message.appendNumber(features.textInputCount); |
| + message.append(", passwordInputCount: "); |
| + message.appendNumber(features.passwordInputCount); |
| + message.append(", pCount: "); |
| + message.appendNumber(features.pCount); |
| + message.append(", innerTextLength: "); |
| + message.appendNumber(features.innerTextLength); |
| + message.append(", textContentLength: "); |
| + message.appendNumber(features.textContentLength); |
| + message.append(", mozScore: "); |
| + message.appendNumber(features.mozScore); |
| + message.append(", mozScoreAllSqrt: "); |
| + message.appendNumber(features.mozScoreAllSqrt); |
| + message.append(", mozScoreAllLinear: "); |
| + message.appendNumber(features.mozScoreAllLinear); |
| + |
| + RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString()); |
| + document.addConsoleMessage(consoleMessage); |
| +#endif |
| + |
| + return features; |
| +} |
| + |
| +} |