Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(78)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2bc6f51ecf34caa996cac44f3962360c111b6390
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -0,0 +1,204 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "DocumentStatisticsCollector.h"
+
+#include "core/HTMLNames.h"
+#include "core/css/CSSComputedStyleDeclaration.h"
+#include "core/editing/EphemeralRange.h"
+#include "core/editing/iterators/TextIterator.h"
+#include "core/editing/iterators/WordAwareIterator.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/inspector/ConsoleMessage.h"
+#include "platform/text/TextBreakIterator.h"
+#include "public/platform/WebDistillability.h"
+#include "wtf/text/StringBuilder.h"
+
+using namespace WTF;
esprehn 2015/10/23 04:59:14 don't using namespace WTF
wychen 2015/10/26 20:40:51 Done.
+using namespace Unicode;
esprehn 2015/10/23 04:59:14 why do you need this?
wychen 2015/10/26 20:40:51 Done.
+
+namespace blink {
+
+using namespace HTMLNames;
+
+namespace {
+
+unsigned trimmedTextContentLength(Element& root)
+{
+ // TODO(wychen): count the length without allocating the string.
+ return root.textContent().stripWhiteSpace().length();
esprehn 2015/10/23 04:59:14 this is very expensive, you're creating a huge str
wychen 2015/10/23 18:56:35 Would this be acceptable if it's rewritten so that
wychen 2015/10/26 20:40:52 Done.
+}
+
+unsigned innerTextLength(Element& root)
+{
+ unsigned length = 0;
+ EphemeralRange range = EphemeralRange::rangeOfContents(root);
esprehn 2015/10/23 04:59:15 Why do you need innerText length? This is going to
wychen 2015/10/23 18:56:35 This is one of the features we fed to the machine
+ TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPosition(), TextIteratorForInnerText);
+ for (; !it.atEnd(); it.advance()) {
+ length += it.length();
+ }
+ return length;
+}
+
+class ExtractFeatureWalker {
wychen 2015/10/23 02:55:47 Do you mean creating new cpp/h files for this clas
+public:
+ ExtractFeatureWalker(Document& document, WebDistillabilityFeatures& features) :
+ m_document(document),
+ m_features(features)
+ {
+ }
+
+ bool isVisible(Element& elem)
esprehn 2015/10/23 04:59:14 element, don't abbreviate in blink
wychen 2015/10/26 20:40:51 Done.
+ {
+ RefPtr<CSSStyleDeclaration> style =
+ m_document.domWindow()->getComputedStyle(&elem, String());
esprehn 2015/10/23 04:59:15 don't call through the domWindow(), you should alm
wychen 2015/10/26 20:40:52 Done.
+ return !(
+ style->getPropertyValue("display") == "none"
esprehn 2015/10/23 04:59:14 You want to use ComputedStyle and the API surface
wychen 2015/10/26 20:40:51 Done.
+ || style->getPropertyValue("visibility") == "hidden"
+ || style->getPropertyValue("opacity") == "0"
esprehn 2015/10/23 04:59:14 these properties are not inherited, this code does
wychen 2015/10/23 18:56:35 Opacity is indeed not inherited, but visibility is
+ );
+ }
+
+ bool matchName(Element& elem, const std::vector<String>& words)
esprehn 2015/10/23 04:59:15 don't use std::vector in blink, don't abbreviate.
wychen 2015/10/23 18:56:34 I'll use Vector instead.
+ {
+ String hay = elem.getClassAttribute().lower() + " " + elem.getIdAttribute().lower();
wychen 2015/10/23 02:55:47 treat
esprehn 2015/10/23 04:59:14 don't combine concat the strings like this, also d
wychen 2015/10/23 18:56:35 Since partial matching is expected, I'll be using
+ for (const String& word: words) {
+ if (hay.find(word)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ void walk()
+ {
+ walk(*m_document.body(), false);
esprehn 2015/10/23 04:59:15 what is false?
wychen 2015/10/23 18:56:35 The second argument should've been removed since t
+ m_features.openGraph = hasOGArticle(*m_document.head());
esprehn 2015/10/23 04:59:14 don't abbreviate, what is OG?
wychen 2015/10/23 18:56:35 Is hasOpenGraphArticle() a self-explanatory name?
+ }
+
+private:
+ void walk(Element& root, bool underLi = false)
esprehn 2015/10/23 04:59:15 what is underLi?
wychen 2015/10/26 20:40:51 Added comment.
+ {
+ DEFINE_STATIC_LOCAL(std::vector<String>, unlikelyCandidates, ({"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}));
esprehn 2015/10/23 04:59:14 don't use std::vector, use WTF types, also I'm pre
wychen 2015/10/23 18:56:35 If we only want exact matching, this would be a br
+ DEFINE_STATIC_LOCAL(std::vector<String>, okMaybeItsACandidate, ({"and", "article", "body", "column", "main", "shadow"}));
esprehn 2015/10/23 04:59:14 ditto
+
+ for (Node& node : NodeTraversal::childrenOf(root)) {
esprehn 2015/10/23 04:59:15 this doesn't handle shadow dom
wychen 2015/10/23 18:56:35 Could you elaborate how shadow dom should be handl
+ if (node.isTextNode()) {
+ String text = toText(node).data();
+ m_features.textContentLength += text.length();
esprehn 2015/10/23 04:59:14 toText(node).length()
wychen 2015/10/26 20:40:52 Done.
+ continue;
+ }
+ if (!node.isElementNode()) {
+ continue;
+ }
+
+ m_features.numElements++;
+ Element& element = toElement(node);
+ if (element.hasTagName(aTag)) {
+ m_features.numAnchors++;
esprehn 2015/10/23 04:59:15 anchorCount formCount etc. don't abbreviate wit
wychen 2015/10/26 20:40:51 Done.
+ } else if (element.hasTagName(formTag)) {
+ m_features.numForms++;
+ } else if (element.hasTagName(inputTag)) {
+ if (element.getAttribute("type").lower() == "text") {
esprehn 2015/10/23 04:59:14 typeAttr, you also want to use equalIgnoringCase.
wychen 2015/10/26 20:40:51 Done.
+ m_features.numTextInput++;
+ } else if (element.getAttribute("type").lower() == "pasword") {
esprehn 2015/10/23 04:59:14 ditto
wychen 2015/10/26 20:40:51 Done.
+ m_features.numPasswordInput++;
+ }
+ } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+ m_features.numPPRE++;
esprehn 2015/10/23 04:59:14 Lets count these separately, then you can sum on t
wychen 2015/10/26 20:40:51 Done.
+ if (!underLi && isVisible(element)
esprehn 2015/10/23 04:59:14 underListItem, also this isVisible thing doesn't w
wychen 2015/10/26 20:40:52 Done.
+ && (!matchName(element, unlikelyCandidates) || matchName(element, okMaybeItsACandidate))) {
esprehn 2015/10/23 04:59:14 okMaybeItsACandidate needs a better name, both set
wychen 2015/10/23 18:56:35 Does highlyLikelyCandidates sound better?
+ unsigned len = trimmedTextContentLength(element);
+ if (len >= 140) {
esprehn 2015/10/23 05:17:30 if you only care about this we can write something
wychen 2015/10/26 20:40:52 I've saturated the length so that we could early o
+ m_features.mozScore += sqrt(len - 140);
esprehn 2015/10/23 04:59:14 why 140? this needs a comment in the code or a con
wychen 2015/10/26 20:40:51 Done.
+ }
+ m_features.mozScoreAllSqrt += sqrt(len);
+ m_features.mozScoreAllLinear += len;
+ }
+ }
+ walk(element, element.hasTagName(liTag) || underLi);
esprehn 2015/10/23 04:59:14 why do you care about ancestor li tags, what's the
wychen 2015/10/23 18:56:35 This is part of the scoring heuristics. <p> or <pr
+ }
+ }
+
+ bool hasOGArticle(const Element& head)
+ {
+ for (const Node& node : NodeTraversal::childrenOf(head)) {
esprehn 2015/10/23 04:59:15 again this doesn't handle shadow dom, but this is
wychen 2015/10/26 20:40:52 Acknowledged.
+ if (!node.isElementNode())
+ continue;
+ const Element& element = toElement(node);
+ if (!element.hasTagName(metaTag))
+ continue;
+ if ((element.getAttribute("name") == ("og:type")) || (element.getAttribute("property") == ("og:type"))) {
esprehn 2015/10/23 04:59:14 toHTMLMetaElement and then use ->name(), also remo
wychen 2015/10/26 20:40:52 Done, but getAttribute("property") is still there.
+ WTF::CString content = element.getAttribute("content").upper().utf8();
esprehn 2015/10/23 04:59:14 don't convert to CString, you almost never want th
wychen 2015/10/26 20:40:51 Done.
+ if ((content) == "ARTICLE") {
esprehn 2015/10/23 04:59:14 You want to use ->content() and equalIgnoringCase
wychen 2015/10/26 20:40:51 Done.
+ return true;
+ }
+ }
+ }
+ return false;
+ };
+
+private:
+ Document& m_document;
+ WebDistillabilityFeatures& m_features;
+};
+
+} // namespace
+
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+ WebDistillabilityFeatures features({0});
esprehn 2015/10/23 04:59:14 Can we add a constructor for the struct instead an
wychen 2015/10/26 20:40:52 I'll use an initializer instead of adding a constr
+
+ if (!document.frame() || !document.frame()->isMainFrame())
+ return features;
+
+ if (!document.hasFinishedParsing())
esprehn 2015/10/23 04:59:15 how does this ever get called when we're still par
wychen 2015/10/26 20:40:51 Done.
+ return features;
+
+ ASSERT(document.body());
esprehn 2015/10/23 04:59:15 this doesn't have to be true, you can remove the b
wychen 2015/10/26 20:40:52 Done.
+
+ // First, traverse the DOM tree and collect statistics.
+ ExtractFeatureWalker walker(document, features);
esprehn 2015/10/23 04:59:15 why do you need a walker at all instead of just st
wychen 2015/10/26 20:40:51 Done.
+ walker.walk();
esprehn 2015/10/23 04:59:14 just use recursive functions
wychen 2015/10/26 20:40:51 Done.
+
+ // Next, traverse the Layout tree and collect statistics on innerText length.
+ features.innerTextLength += innerTextLength(*document.body());
esprehn 2015/10/23 04:59:14 you can remove the body, also this doesn't have to
wychen 2015/10/23 18:56:35 The ratio between innerText.length() and textConte
wychen 2015/10/26 20:40:51 Done.
+
+ // The following DISTILLER_NDEBUG section would be gone when landing.
+#ifndef DISTILLER_NDEBUG
+ StringBuilder message;
+ message.append("openGraph: ");
+ message.appendNumber(features.openGraph);
+ message.append(", numElements: ");
+ message.appendNumber(features.numElements);
+ message.append(", numAnchors: ");
+ message.appendNumber(features.numAnchors);
+ message.append(", numForms: ");
+ message.appendNumber(features.numForms);
+ message.append(", numTextInput: ");
+ message.appendNumber(features.numTextInput);
+ message.append(", numPasswordInput: ");
+ message.appendNumber(features.numPasswordInput);
+ message.append(", numPPRE: ");
+ message.appendNumber(features.numPPRE);
+ message.append(", innerTextLength: ");
+ message.appendNumber(features.innerTextLength);
+ message.append(", textContentLength: ");
+ message.appendNumber(features.textContentLength);
+ message.append(", mozScore: ");
+ message.appendNumber(features.mozScore);
+ message.append(", mozScoreAllSqrt: ");
+ message.appendNumber(features.mozScoreAllSqrt);
+ message.append(", mozScoreAllLinear: ");
+ message.appendNumber(features.mozScoreAllLinear);
+
+ RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString());
+ document.addConsoleMessage(consoleMessage);
esprehn 2015/10/23 04:59:14 just fprintf, no reason to console log your debug
wychen 2015/10/26 20:40:52 I just wanted to quickly get message to adb log.
+#endif
+
+ return features;
+}
+
+}

Powered by Google App Engine
This is Rietveld 408576698