Chromium Code Reviews| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..4328e8ddd4732ee5fd26a11bea22485f8b02e9d3 |
| --- /dev/null |
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| @@ -0,0 +1,211 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "config.h" |
| +#include "DocumentStatisticsCollector.h" |
| + |
| +#include "core/HTMLNames.h" |
| +#include "core/css/CSSComputedStyleDeclaration.h" |
| +#include "core/editing/EphemeralRange.h" |
| +#include "core/editing/iterators/TextIterator.h" |
| +#include "core/editing/iterators/WordAwareIterator.h" |
| +#include "core/html/HTMLHeadElement.h" |
| +#include "core/inspector/ConsoleMessage.h" |
| +#include "platform/text/TextBreakIterator.h" |
| +#include "public/platform/WebDistillability.h" |
| +#include "wtf/text/StringBuilder.h" |
| + |
| +using namespace WTF; |
| +using namespace Unicode; |
| + |
| +namespace blink { |
| + |
| +using namespace HTMLNames; |
| + |
| +namespace { |
| + |
| +unsigned trimmedTextContentLength(Element& root) |
| +{ |
| +    // Length of |root|'s textContent() after stripWhiteSpace() has been applied. |
| +    // TODO(wychen): count the length without allocating the string. |
| +    String trimmedText = root.textContent().stripWhiteSpace(); |
| +    return trimmedText.length(); |

dglazkov
2015/10/22 16:30:31
The TODO needs to be addressed before landing.
wychen
2015/10/23 02:51:30
Will do.
|
| +} |
| + |
| +// Sums the lengths of the chunks produced by a TextIterator running in |
| +// TextIteratorForInnerText mode over the contents of |root|. |
| +unsigned innerTextLength(Element& root) |
| +{ |
| +    EphemeralRange contents = EphemeralRange::rangeOfContents(root); |
| +    TextIteratorAlgorithm<EditingStrategy> iterator(contents.startPosition(), contents.endPosition(), TextIteratorForInnerText); |
| +    unsigned total = 0; |
| +    while (!iterator.atEnd()) { |
| +        total += iterator.length(); |
| +        iterator.advance(); |
| +    } |
| +    return total; |
| +} |
| + |
| +// Walks the element tree below the document's body and accumulates element |
| +// counts, text lengths and the paragraph-based "moz" scores into the |
| +// WebDistillabilityFeatures instance supplied by the caller. |
| +class ExtractFeatureWalker { |

dglazkov
2015/10/22 16:30:31
This can just be a separate class, no need to hide
wychen
2015/10/23 02:51:30
Do you mean creating new cpp/h files for this clas
|
| +public: |
| +    // Both references are stored, not copied; they must outlive the walker. |
| +    ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) : |
| +        m_document(document), |
| +        m_features(features) |
| +    { |
| +        unlikelyCandidates = {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}; |

dglazkov
2015/10/22 16:30:31
Should these be static? Why initialize them in con
wychen
2015/10/23 02:51:30
Made static.
|
| +        okMaybeItsACandidate = {"and", "article", "body", "column", "main", "shadow"}; |
| +    } |
| + |
| +    // Returns false when the computed style of |elem| reports display "none", |
| +    // visibility "hidden" or opacity "0"; returns true otherwise. |
| +    bool isVisible(Element& elem) |
| +    { |
| +        RefPtr<CSSStyleDeclaration> style = |
| +            m_document.domWindow()->getComputedStyle(&elem, String()); |
| +        return !( |
| +            style->getPropertyValue("display") == "none" |
| +            || style->getPropertyValue("visibility") == "hidden" |
| +            || style->getPropertyValue("opacity") == "0" |
| +        ); |
| +    } |
| + |
| +    // Returns true if any entry of |words| occurs as a substring of the |
| +    // lower-cased "class id" text of |elem|. |
| +    bool matchName(Element& elem, const std::vector<String>& words) |
| +    { |
| +        String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower(); |

dglazkov
2015/10/22 16:30:31
We have lots of style machinery to do this correct
wychen
2015/10/23 02:51:30
Could you elaborate how to use StyleResolver to im
|
| +        for (const String& word : words) { |
| +            // find() returns an index, not a bool: a miss yields kNotFound |
| +            // (non-zero) and a hit at the very start yields 0, so the result |
| +            // has to be compared against kNotFound explicitly. |
| +            if (hay.find(word) != kNotFound) { |
| +                return true; |
| +            } |
| +        } |
| +        return false; |
| +    } |
| + |
| +    // Entry point: collects features for everything below the document body. |
| +    // Does nothing when the document has no body. |
| +    void walk() |
| +    { |
| +        Element* body = m_document.body(); |
| +        if (!body) { |
| +            return; |
| +        } |
| +        walk(*body, false); |
| +    } |
| + |
| +private: |
| +    // Recursively visits the children of |root|. |underLi| is true when |root| |
| +    // is an <li> or has an <li> ancestor within the walked subtree; <p>/<pre> |
| +    // elements in that situation are counted but do not contribute to the |
| +    // moz scores. |
| +    void walk(Element& root, bool underLi = false) |
| +    { |
| +        for (Node& node : NodeTraversal::childrenOf(root)) { |
| +            if (node.isTextNode()) { |
| +                String text = toText(node).data(); |
| +                m_features.textContentLength += text.length(); |
| +                continue; |
| +            } |
| +            if (!node.isElementNode()) { |
| +                continue; |
| +            } |
| + |
| +            m_features.numElements++; |
| +            Element& element = toElement(node); |
| +            if (element.hasTagName(aTag)) { |
| +                m_features.numAnchors++; |
| +            } else if (element.hasTagName(formTag)) { |
| +                m_features.numForms++; |
| +            } else if (element.hasTagName(inputTag)) { |
| +                // Look the attribute up and lower-case it only once. |
| +                const String type = element.getAttribute("type").lower(); |
| +                if (type == "text") { |
| +                    m_features.numTextInput++; |
| +                } else if (type == "password") { |
| +                    m_features.numPasswordInput++; |
| +                } |
| +            } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
| +                m_features.numPPRE++; |
| +                if (!underLi && isVisible(element) |
| +                    && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) { |
| +                    unsigned len = trimmedTextContentLength(element); |
| +                    // The guard keeps the unsigned subtraction from wrapping. |
| +                    if (len >= 140) { |
| +                        m_features.mozScore += sqrt(len - 140); |
| +                    } |
| +                    m_features.mozScoreAllSqrt += sqrt(len); |
| +                    m_features.mozScoreAllLinear += len; |
| +                } |
| +            } |
| +            walk(element, element.hasTagName(liTag) || underLi); |
| +        } |
| +    } |
| + |
| +    // Substrings of class/id that mark an element as an unlikely candidate. |
| +    std::vector<String> unlikelyCandidates; |
| +    // Substrings of class/id that override a match in unlikelyCandidates. |
| +    std::vector<String> okMaybeItsACandidate; |
| +    Document& m_document; |
| +    WebDistillabilityFeatures& m_features; |
| +}; |
| + |
| +// Returns true if a direct <meta> child of |head| whose "name" or "property" |
| +// attribute is "og:type" has a "content" attribute equal to "article", |
| +// compared case-insensitively. |
| +bool hasOGArticle(const Element& head) |
| +{ |
| +    for (const Node& node : NodeTraversal::childrenOf(head)) { |

dglazkov
2015/10/22 16:30:31
If you're traversing things in the method above, m
wychen
2015/10/23 02:51:30
hasOGArticle traverses head, while walk() traverse
|
| +        if (!node.isElementNode()) |
| +            continue; |
| +        const Element& element = toElement(node); |
| +        if (!element.hasTagName(metaTag)) |
| +            continue; |
| +        const bool isOGType = (element.getAttribute("name") == "og:type") || (element.getAttribute("property") == "og:type"); |
| +        if (!isOGType) |
| +            continue; |
| +        // Compare in place instead of allocating an upper-cased UTF-8 copy. |
| +        if (equalIgnoringCase(element.getAttribute("content"), "article")) |
| +            return true; |
| +    } |
| +    return false; |
| +} |
| + |
| +} // namespace |
| + |
| +// A freshly constructed collector is not ready: collectStatistics() returns |
| +// its initial feature values while m_readyToCollect is false. |
| +DocumentStatisticsCollector::DocumentStatisticsCollector() : m_readyToCollect(false) |
| +{ |
| +} |
| + |
| +// Gathers distillability features for |document|. |
| +// |
| +// Returns the initial (unpopulated) feature set when the collector is not |
| +// ready, when the document is not attached to the main frame, when parsing |
| +// has not finished, or when the document has no body. Otherwise walks the |
| +// body to fill in the element counts and scores, measures the inner text |
| +// length, and checks the head for an Open Graph "article" declaration. |
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
| +{ |
| +    WebDistillabilityFeatures features({0}); |
| +    if (!m_readyToCollect) |
| +        return features; |
| + |
| +    if (!document.frame() || !document.frame()->isMainFrame()) |
| +        return features; |
| + |
| +    if (!document.hasFinishedParsing()) |
| +        return features; |
| + |
| +    // ASSERT is compiled out of release builds, so a missing body has to be |
| +    // handled explicitly rather than dereferenced below. |
| +    Element* body = document.body(); |
| +    if (!body) |
| +        return features; |
| + |
| +    // First, traverse the DOM tree and collect statistics. |
| +    ExtractFeatureWalker walker(document, features); |
| +    walker.walk(); |
| + |
| +    // Next, traverse the Layout tree and collect statistics on innerText length. |
| +    features.innerTextLength += innerTextLength(*body); |
| + |
| +    // A document without a head simply has no Open Graph metadata. |
| +    HTMLHeadElement* head = document.head(); |
| +    features.openGraph = head && hasOGArticle(*head); |
| + |
| +    // The following DISTILLER_NDEBUG section is to be removed before landing. |
| +#ifndef DISTILLER_NDEBUG |
| +    StringBuilder message; |
| +    message.append("openGraph: "); |
| +    message.appendNumber(features.openGraph); |
| +    message.append(", numElements: "); |
| +    message.appendNumber(features.numElements); |
| +    message.append(", numAnchors: "); |
| +    message.appendNumber(features.numAnchors); |
| +    message.append(", numForms: "); |
| +    message.appendNumber(features.numForms); |
| +    message.append(", numTextInput: "); |
| +    message.appendNumber(features.numTextInput); |
| +    message.append(", numPasswordInput: "); |
| +    message.appendNumber(features.numPasswordInput); |
| +    message.append(", numPPRE: "); |
| +    message.appendNumber(features.numPPRE); |
| +    message.append(", innerTextLength: "); |
| +    message.appendNumber(features.innerTextLength); |
| +    message.append(", textContentLength: "); |
| +    message.appendNumber(features.textContentLength); |
| +    message.append(", mozScore: "); |
| +    message.appendNumber(features.mozScore); |
| +    message.append(", mozScoreAllSqrt: "); |
| +    message.appendNumber(features.mozScoreAllSqrt); |
| +    message.append(", mozScoreAllLinear: "); |
| +    message.appendNumber(features.mozScoreAllLinear); |
| + |
| +    RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString()); |
| +    document.addConsoleMessage(consoleMessage); |
| +#endif |
| + |
| +    return features; |
| +} |
| + |
| +} |