Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(10)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, add tests Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..198407839a2ed689af80f49f41c919082ada4c3a
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollector.cpp
@@ -0,0 +1,230 @@
+// Copyright 2015 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "config.h"
+#include "DocumentStatisticsCollector.h"
+
+#include "core/HTMLNames.h"
+#include "core/css/CSSComputedStyleDeclaration.h"
+#include "core/editing/iterators/TextIterator.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/html/HTMLInputElement.h"
+#include "core/html/HTMLMetaElement.h"
+
+// TODO(wychen): The following lines will be gone before landing.
+#include "core/inspector/ConsoleMessage.h"
+
+#include "public/platform/WebDistillability.h"
+#include "wtf/text/StringBuilder.h"
+#include "wtf/text/StringImpl.h"
+
+namespace blink {
+
+using namespace HTMLNames;
+
+namespace {
+
+unsigned trimmedTextContentLength(Element& root)
esprehn 2015/10/26 21:43:09 Do pages usually have enough whitespace for this t
wychen 2015/10/27 23:52:12 Trimming the white spaces should make this less no
+{
+ int firstNonWhite = -1, lastNonWhite = -1;
+ unsigned position = 0;
+ // TODO(wychen): scan backwards for lastNonWhite should be much faster in practice.
+ for (Node& node : NodeTraversal::inclusiveDescendantsOf(root)) {
esprehn 2015/10/26 21:43:08 This doesn't understand shadow dom. ex. <p><conte
wychen 2015/10/27 23:52:12 The visibility model used here is the same as Java
+ if (!node.isTextNode()) {
+ continue;
+ }
+ const String& text = toText(node).data();
+ for (unsigned i = 0; i < text.length(); i++) {
+ if (!isSpaceOrNewline(text[i])) {
+ if (firstNonWhite < 0) {
+ firstNonWhite = lastNonWhite = i + position;
+ } else {
+ lastNonWhite = i + position;
+ }
+ }
+ }
+ position += text.length();
+ }
+ if (firstNonWhite < 0) {
+ return 0;
+ }
+ return lastNonWhite - firstNonWhite + 1;
+}
+
+unsigned innerTextLength(Element& root)
+{
+ unsigned length = 0;
+ EphemeralRange range = EphemeralRange::rangeOfContents(root);
+ TextIteratorAlgorithm<EditingStrategy> it(range.startPosition(), range.endPosition(), TextIteratorForInnerText);
esprehn 2015/10/26 21:43:09 Why do you care about innerText length at all? Tha
wychen 2015/10/27 23:52:12 The length ratio between innerText and textContent
+ for (; !it.atEnd(); it.advance()) {
+ length += it.length();
+ }
+ return length;
+}
+
+bool isVisible(Element& element)
+{
+ const blink::ComputedStyle* style = element.ensureComputedStyle();
esprehn 2015/10/26 21:43:08 this forces a style computation on elements that w
wychen 2015/10/27 23:52:12 This statistics collection happens right after the
+ return !(
esprehn 2015/10/26 21:43:09 run demorgans
wychen 2015/10/27 23:52:12 Done.
+ style->display() == NONE
+ || style->visibility() == HIDDEN
+ || style->opacity() == 0
+ );
+}
+
+bool matchName(Element& element, const Vector<String>& words)
+{
+ if (element.hasClass()) {
+ const String& hay = element.getClassAttribute();
+ for (const String& word: words) {
esprehn 2015/10/26 21:43:09 missing space
wychen 2015/10/27 23:52:12 Done.
+ if (hay.findIgnoringCase(word) != WTF::kNotFound) {
+ return true;
+ }
+ }
+ }
+ if (UNLIKELY(element.hasID())) {
esprehn 2015/10/26 21:43:09 remove UNLIKELY
wychen 2015/10/27 23:52:12 Done.
+ const String& hay = element.getIdAttribute();
esprehn 2015/10/26 21:43:09 these already have a hasID() check in them, so I'd
wychen 2015/10/27 23:52:11 Done.
+ for (const String& word: words) {
esprehn 2015/10/26 21:43:08 missing space
wychen 2015/10/27 23:52:12 Done.
+ if (hay.findIgnoringCase(word) != WTF::kNotFound) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+// underListItem denotes that at least one of the ancesters is <li> element.
+void walk(Element& root, WebDistillabilityFeatures& features, bool underListItem = false)
esprehn 2015/10/26 21:43:08 needs a better name. collectFeatures?
wychen 2015/10/27 23:52:12 Done.
+{
+ DEFINE_STATIC_LOCAL(Vector<String>, unlikelyCandidates, ());
+ if (unlikelyCandidates.size() == 0) {
esprehn 2015/10/26 21:43:09 isEmpty()
wychen 2015/10/27 23:52:11 Done.
+ for (auto word : {"banner", "combx", "comment", "community", "disqus", "extra", "foot", "header", "menu", "related", "remark", "rss", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "ad-break", "agegate", "pagination", "pager", "popup"}) {
+ unlikelyCandidates.append(word);
+ }
+ }
+ DEFINE_STATIC_LOCAL(Vector<String>, highlyLikelyCandidates, ());
+ if (highlyLikelyCandidates.size() == 0) {
esprehn 2015/10/26 21:43:08 isEmpty()
wychen 2015/10/27 23:52:12 Done.
+ for (auto word : {"and", "article", "body", "column", "main", "shadow"}) {
+ highlyLikelyCandidates.append(word);
+ }
+ }
+ const unsigned kParagraphLengthThreshold = 140;
esprehn 2015/10/26 21:43:08 why did you pick 140? Add a comment.
wychen 2015/10/27 23:52:12 Done.
+
+ for (Node& node : NodeTraversal::childrenOf(root)) {
+ if (node.isTextNode()) {
+ features.textContentLength += toText(node).length();
+ continue;
+ }
+ if (!node.isElementNode()) {
+ continue;
+ }
+
+ features.elementCount++;
+ Element& element = toElement(node);
+ if (element.hasTagName(aTag)) {
+ features.anchorCount++;
+ } else if (element.hasTagName(formTag)) {
+ features.formCount++;
+ } else if (element.hasTagName(inputTag)) {
+ const HTMLInputElement& input = toHTMLInputElement(element);
+ if (equalIgnoringCase(input.type(), "text")) {
esprehn 2015/10/26 21:43:09 input.type() == InputTypeNames::text
wychen 2015/10/27 23:52:12 Done.
+ features.textInputCount++;
+ } else if (equalIgnoringCase(input.type(), "password")) {
esprehn 2015/10/26 21:43:08 == InputTypeNames::password
wychen 2015/10/27 23:52:12 Done.
+ features.passwordInputCount++;
+ }
+ } else if (element.hasTagName(pTag) || element.hasTagName(preTag)) {
+ if (element.hasTagName(pTag)) {
+ features.pCount++;
+ } else {
+ features.preCount++;
+ }
+ if (!underListItem && isVisible(element)
+ && (!matchName(element, unlikelyCandidates) || matchName(element, highlyLikelyCandidates))) {
esprehn 2015/10/26 21:43:09 matchAttributes? It's not really related to name a
wychen 2015/10/27 23:52:12 Done.
+ unsigned length = trimmedTextContentLength(element);
+ if (length >= kParagraphLengthThreshold) {
+ features.mozScore += sqrt(length - kParagraphLengthThreshold);
+ }
+ features.mozScoreAllSqrt += sqrt(length);
+ features.mozScoreAllLinear += length;
+ }
+ }
+ walk(element, features, element.hasTagName(liTag) || underListItem);
esprehn 2015/10/26 21:43:09 this checks hasTagName(liTag) for every element, e
wychen 2015/10/27 23:52:12 Done.
+ }
+}
+
+bool hasOpenGraphArticle(const Element& head)
+{
+ for (const Node& node : NodeTraversal::childrenOf(head)) {
esprehn 2015/10/26 21:43:09 for (const Element* child = ElementTraversal::firs
wychen 2015/10/27 23:52:12 Done.
+ if (!node.isElementNode())
+ continue;
+ const Element& element = toElement(node);
+ if (!isHTMLMetaElement(element))
+ continue;
+ const HTMLMetaElement& meta = toHTMLMetaElement(element);
+ if (meta.name() == "og:type" || element.getAttribute("property") == "og:type") {
esprehn 2015/10/26 21:43:09 You want to declare static local AtomicString vari
wychen 2015/10/27 23:52:12 Done.
+ if (equalIgnoringCase(meta.content(), "article")) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+} // namespace
+
+WebDistillabilityFeatures DocumentStatisticsCollector::collectStatistics(Document& document)
+{
+ WebDistillabilityFeatures features = WebDistillabilityFeatures();
+
+ if (!document.frame() || !document.frame()->isMainFrame())
+ return features;
+
+ ASSERT(document.hasFinishedParsing());
+
+ if (!document.body() || !document.head())
+ return features;
+
+ // First, traverse the DOM tree and collect statistics.
+ walk(*document.body(), features);
+ features.openGraph = hasOpenGraphArticle(*document.head());
+
+ // Next, traverse the Layout tree and collect statistics on innerText length.
esprehn 2015/10/26 21:43:08 this needs to do document->updateLayout() so it's
wychen 2015/10/27 23:52:12 Done. I'm curious when should updateLayoutIgnorePe
+ features.innerTextLength += innerTextLength(*document.body());
esprehn 2015/10/26 21:43:09 this really seems unnecessary, you can just collec
wychen 2015/10/27 23:52:12 There seems to be much more than visibility in Tex
+
+ // The following DISTILLER_NDEBUG section would be gone when landing.
+#ifndef DISTILLER_NDEBUG
+ StringBuilder message;
+ message.append("openGraph: ");
+ message.appendNumber(features.openGraph);
+ message.append(", elementCount: ");
+ message.appendNumber(features.elementCount);
+ message.append(", anchorCount: ");
+ message.appendNumber(features.anchorCount);
+ message.append(", formCount: ");
+ message.appendNumber(features.formCount);
+ message.append(", textInputCount: ");
+ message.appendNumber(features.textInputCount);
+ message.append(", passwordInputCount: ");
+ message.appendNumber(features.passwordInputCount);
+ message.append(", pCount: ");
+ message.appendNumber(features.pCount);
+ message.append(", innerTextLength: ");
+ message.appendNumber(features.innerTextLength);
+ message.append(", textContentLength: ");
+ message.appendNumber(features.textContentLength);
+ message.append(", mozScore: ");
+ message.appendNumber(features.mozScore);
+ message.append(", mozScoreAllSqrt: ");
+ message.appendNumber(features.mozScoreAllSqrt);
+ message.append(", mozScoreAllLinear: ");
+ message.appendNumber(features.mozScoreAllLinear);
+
+ RefPtrWillBeRawPtr<ConsoleMessage> consoleMessage = ConsoleMessage::create(ConsoleAPIMessageSource, DebugMessageLevel, message.toString());
+ document.addConsoleMessage(consoleMessage);
+#endif
+
+ return features;
+}
+
+}

Powered by Google App Engine
This is Rietveld 408576698