Chromium Code Reviews| Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| new file mode 100644 |
| index 0000000000000000000000000000000000000000..2bc6f51ecf34caa996cac44f3962360c111b6390 |
| --- /dev/null |
| +++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp |
| @@ -0,0 +1,204 @@ |
| +// Copyright 2015 The Chromium Authors. All rights reserved. |
| +// Use of this source code is governed by a BSD-style license that can be |
| +// found in the LICENSE file. |
| + |
| +#include "config.h" |
| +#include "DocumentStatisticsCollector.h" |
| + |
| +#include "core/HTMLNames.h" |
| +#include "core/css/CSSComputedStyleDeclaration.h" |
| +#include "core/editing/EphemeralRange.h" |
| +#include "core/editing/iterators/TextIterator.h" |
| +#include "core/editing/iterators/WordAwareIterator.h" |
| +#include "core/html/HTMLHeadElement.h" |
| +#include "core/inspector/ConsoleMessage.h" |
| +#include "platform/text/TextBreakIterator.h" |
| +#include "public/platform/WebDistillability.h" |
| +#include "wtf/text/StringBuilder.h" |
| + |
| +using namespace WTF; |
|
esprehn
2015/10/23 04:59:14
don't using namespace WTF
wychen
2015/10/26 20:40:51
Done.
|
| +using namespace Unicode; |
|
esprehn
2015/10/23 04:59:14
why do you need this?
wychen
2015/10/26 20:40:51
Done.
|
| + |
| +namespace blink { |
| + |
| +using namespace HTMLNames; |
| + |
| +namespace { |
| + |
| +unsigned trimmedTextContentLength(Element& root) |
| +{ |
| + // TODO(wychen): count the length without allocating the string. |
| + return root.textContent().stripWhiteSpace().length(); |
|
esprehn
2015/10/23 04:59:14
this is very expensive, you're creating a huge str
wychen
2015/10/23 18:56:35
Would this be acceptable if it's rewritten so that
wychen
2015/10/26 20:40:52
Done.
|
| +} |
| + |
| +unsigned innerTextLength(Element& root) |
| +{ |
| + unsigned length = 0; |
| + EphemeralRange range = EphemeralRange::rangeOfContents(root); |
|
esprehn
2015/10/23 04:59:15
Why do you need innerText length? This is going to
wychen
2015/10/23 18:56:35
This is one of the features we fed to the machine
|
| + TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPosition(), TextIteratorForInnerText); |
| + for (; !it.atEnd(); it.advance()) { |
| + length += it.length(); |
| + } |
| + return length; |
| +} |
| + |
| +class ExtractFeatureWalker { |
|
wychen
2015/10/23 02:55:47
Do you mean creating new cpp/h files for this clas
|
| +public: |
| + ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) : |
| + m_document(document), |
| + m_features(features) |
| + { |
| + } |
| + |
| + bool isVisible(Element& elem) |
|
esprehn
2015/10/23 04:59:14
element, don't abbreviate in blink
wychen
2015/10/26 20:40:51
Done.
|
| + { |
| + RefPtr<CSSStyleDeclaration> style = |
| + m_document.domWindow()->getComputedStyle(&elem, String()); |
|
esprehn
2015/10/23 04:59:15
don't call through the domWindow(), you should alm
wychen
2015/10/26 20:40:52
Done.
|
| + return !( |
| + style->getPropertyValue("display") == "none" |
|
esprehn
2015/10/23 04:59:14
You want to use ComputedStyle and the API surface
wychen
2015/10/26 20:40:51
Done.
|
| + || style->getPropertyValue("visibility") == "hidden" |
| + || style->getPropertyValue("opacity") == "0" |
|
esprehn
2015/10/23 04:59:14
these properties are not inherited, this code does
wychen
2015/10/23 18:56:35
Opacity is indeed not inherited, but visibility is
|
| + ); |
| + } |
| + |
| + bool matchName(Element& elem, const std::vector<String>& words) |
|
esprehn
2015/10/23 04:59:15
don't use std::vector in blink, don't abbreviate.
wychen
2015/10/23 18:56:34
I'll use Vector instead.
|
| + { |
| + String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower(); |
|
wychen
2015/10/23 02:55:47
treat
esprehn
2015/10/23 04:59:14
don't combine concat the strings like this, also d
wychen
2015/10/23 18:56:35
Since partial matching is expected, I'll be using
|
| + for (const String& word: words) { |
| + if (hay.find(word)) { |
| + return true; |
| + } |
| + } |
| + return false; |
| + } |
| + |
| + void walk() |
| + { |
| + walk(*m_document.body(), false); |
|
esprehn
2015/10/23 04:59:15
what is false?
wychen
2015/10/23 18:56:35
The second argument should've been removed since t
|
| + m_features.openGraph = hasOGArticle(*m_document.head()); |
|
esprehn
2015/10/23 04:59:14
don't abbreviate, what is OG?
wychen
2015/10/23 18:56:35
Is hasOpenGraphArticle() a self-explanatory name?
|
| + } |
| + |
| +private: |
| + void walk(Element& root, bool underLi = false) |
|
esprehn
2015/10/23 04:59:15
what is underLi?
wychen
2015/10/26 20:40:51
Added comment.
|
| + { |
| + DEFINE_STATIC_LOCAL(std::vector<String>, unlikelyCandidates, ({"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"})); |
|
esprehn
2015/10/23 04:59:14
don't use std::vector, use WTF types, also I'm pre
wychen
2015/10/23 18:56:35
If we only want exact matching, this would be a br
|
| + DEFINE_STATIC_LOCAL(std::vector<String>, okMaybeItsACandidate, ({"and", "article", "body", "column", "main", "shadow"})); |
|
esprehn
2015/10/23 04:59:14
ditto
|
| + |
| + for (Node& node : NodeTraversal::childrenOf(root)) { |
|
esprehn
2015/10/23 04:59:15
this doesn't handle shadow dom
wychen
2015/10/23 18:56:35
Could you elaborate how shadow dom should be handl
|
| + if (node.isTextNode()) { |
| + String text = toText(node).data(); |
| + m_features.textContentLength += text.length(); |
|
esprehn
2015/10/23 04:59:14
toText(node).length()
wychen
2015/10/26 20:40:52
Done.
|
| + continue; |
| + } |
| + if (!node.isElementNode()) { |
| + continue; |
| + } |
| + |
| + m_features.numElements++; |
| + Element& element = toElement(node); |
| + if (element.hasTagName(aTag)) { |
| + m_features.numAnchors++; |
|
esprehn
2015/10/23 04:59:15
anchorCount
formCount
etc.
don't abbreviate wit
wychen
2015/10/26 20:40:51
Done.
|
| + } else if (element.hasTagName(formTag)) { |
| + m_features.numForms++; |
| + } else if (element.hasTagName(inputTag)) { |
| + if (element.getAttribute("type").lower() == "text") { |
|
esprehn
2015/10/23 04:59:14
typeAttr, you also want to use equalIgnoringCase.
wychen
2015/10/26 20:40:51
Done.
|
| + m_features.numTextInput++; |
| + } else if (element.getAttribute("type").lower() == "pasword") { |
|
esprehn
2015/10/23 04:59:14
ditto
wychen
2015/10/26 20:40:51
Done.
|
| + m_features.numPasswordInput++; |
| + } |
| + } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) { |
| + m_features.numPPRE++; |
|
esprehn
2015/10/23 04:59:14
Lets count these separately, then you can sum on t
wychen
2015/10/26 20:40:51
Done.
|
| + if (!underLi && isVisible(element) |
|
esprehn
2015/10/23 04:59:14
underListItem, also this isVisible thing doesn't w
wychen
2015/10/26 20:40:52
Done.
|
| + && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) { |
|
esprehn
2015/10/23 04:59:14
okMaybeItsACandidate needs a better name, both set
wychen
2015/10/23 18:56:35
Does highlyLikelyCandidates sound better?
|
| + unsigned len = trimmedTextContentLength(element); |
| + if (len >= 140) { |
|
esprehn
2015/10/23 05:17:30
if you only care about this we can write something
wychen
2015/10/26 20:40:52
I've saturated the length so that we could early o
|
| + m_features.mozScore += sqrt(len - 140); |
|
esprehn
2015/10/23 04:59:14
why 140? this needs a comment in the code or a con
wychen
2015/10/26 20:40:51
Done.
|
| + } |
| + m_features.mozScoreAllSqrt += sqrt(len); |
| + m_features.mozScoreAllLinear += len; |
| + } |
| + } |
| + walk(element, element.hasTagName(liTag) || underLi); |
|
esprehn
2015/10/23 04:59:14
why do you care about ancestor li tags, what's the
wychen
2015/10/23 18:56:35
This is part of the scoring heuristics. <p> or <pr
|
| + } |
| + } |
| + |
| + bool hasOGArticle(const Element& head) |
| + { |
| + for (const Node& node : NodeTraversal::childrenOf(head)) { |
|
esprehn
2015/10/23 04:59:15
again this doesn't handle shadow dom, but this is
wychen
2015/10/26 20:40:52
Acknowledged.
|
| + if (!node.isElementNode()) |
| + continue; |
| + const Element& element = toElement(node); |
| + if (!element.hasTagName(metaTag)) |
| + continue; |
| + if ((element.getAttribute("name") == ("og:type")) || (element.getAttribute("property") == ("og:type"))) { |
|
esprehn
2015/10/23 04:59:14
toHTMLMetaElement and then use ->name(), also remo
wychen
2015/10/26 20:40:52
Done, but getAttribute("property") is still there.
|
| + WTF::CString content = element.getAttribute("content").upper().utf8(); |
|
esprehn
2015/10/23 04:59:14
don't convert to CString, you almost never want th
wychen
2015/10/26 20:40:51
Done.
|
| + if ((content) == "ARTICLE") { |
|
esprehn
2015/10/23 04:59:14
You want to use ->content() and equalIgnoringCase
wychen
2015/10/26 20:40:51
Done.
|
| + return true; |
| + } |
| + } |
| + } |
| + return false; |
| + }; |
| + |
| +private: |
| + Document& m_document; |
| + WebDistillabilityFeatures& m_features; |
| +}; |
| + |
| +} // namespace |
| + |
| +WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document) |
| +{ |
| + WebDistillabilityFeatures features({0}); |
|
esprehn
2015/10/23 04:59:14
Can we add a constructor for the struct instead an
wychen
2015/10/26 20:40:52
I'll use an initializer instead of adding a constr
|
| + |
| + if (!document.frame() || !document.frame()->isMainFrame()) |
| + return features; |
| + |
| + if (!document.hasFinishedParsing()) |
|
esprehn
2015/10/23 04:59:15
how does this ever get called when we're still par
wychen
2015/10/26 20:40:51
Done.
|
| + return features; |
| + |
| + ASSERT(document.body()); |
|
esprehn
2015/10/23 04:59:15
this doesn't have to be true, you can remove the b
wychen
2015/10/26 20:40:52
Done.
|
| + |
| + // First, traverse the DOM tree and collect statistics. |
| + ExtractFeatureWalker walker(document, features); |
|
esprehn
2015/10/23 04:59:15
why do you need a walker at all instead of just st
wychen
2015/10/26 20:40:51
Done.
|
| + walker.walk(); |
|
esprehn
2015/10/23 04:59:14
just use recursive functions
wychen
2015/10/26 20:40:51
Done.
|
| + |
| + // Next, traverse the Layout tree and collect statistics on innerText length. |
| + features.innerTextLength += innerTextLength(*document.body()); |
|
esprehn
2015/10/23 04:59:14
you can remove the body, also this doesn't have to
wychen
2015/10/23 18:56:35
The ratio between innerText.length() and textConte
wychen
2015/10/26 20:40:51
Done.
|
| + |
| + // The following DISTILLER_NDEBUG section would be gone when landing. |
| +#ifndef DISTILLER_NDEBUG |
| + StringBuilder message; |
| + message.append("openGraph: "); |
| + message.appendNumber(features.openGraph); |
| + message.append(", numElements: "); |
| + message.appendNumber(features.numElements); |
| + message.append(", numAnchors: "); |
| + message.appendNumber(features.numAnchors); |
| + message.append(", numForms: "); |
| + message.appendNumber(features.numForms); |
| + message.append(", numTextInput: "); |
| + message.appendNumber(features.numTextInput); |
| + message.append(", numPasswordInput: "); |
| + message.appendNumber(features.numPasswordInput); |
| + message.append(", numPPRE: "); |
| + message.appendNumber(features.numPPRE); |
| + message.append(", innerTextLength: "); |
| + message.appendNumber(features.innerTextLength); |
| + message.append(", textContentLength: "); |
| + message.appendNumber(features.textContentLength); |
| + message.append(", mozScore: "); |
| + message.appendNumber(features.mozScore); |
| + message.append(", mozScoreAllSqrt: "); |
| + message.appendNumber(features.mozScoreAllSqrt); |
| + message.append(", mozScoreAllLinear: "); |
| + message.appendNumber(features.mozScoreAllLinear); |
| + |
| + RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString()); |
| + document.addConsoleMessage(consoleMessage); |
|
esprehn
2015/10/23 04:59:14
just fprintf, no reason to console log your debug
wychen
2015/10/26 20:40:52
I just wanted to quickly get message to adb log.
|
| +#endif |
| + |
| + return features; |
| +} |
| + |
| +} |