Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(166)

Unified Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, add tests Created 5 years, 2 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp
diff --git a/third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..986f6bd1ff4a2f357c234b2a1256d56495352dfd
--- /dev/null
+++ b/third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2015, Google Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
esprehn 2015/10/26 21:43:09 Use the modern short copyright.
wychen 2015/10/27 23:52:12 Done.
+ * met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ * * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "config.h"
+#include "core/dom/DocumentStatisticsCollector.h"
+
+#include "core/dom/Document.h"
+#include "core/dom/DocumentVisibilityObserver.h"
+#include "core/frame/FrameView.h"
+#include "core/html/HTMLHeadElement.h"
+#include "core/html/HTMLLinkElement.h"
+#include "core/testing/DummyPageHolder.h"
+#include "public/platform/WebDistillability.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace blink {
+
+class DocumentStatisticsCollectorTest : public ::testing::Test { // Fixture: each test works on the Document of a DummyPageHolder.
+protected:
+ void SetUp() override; // Creates m_dummyPageHolder; defined after the class.
+
+#if ENABLE(OILPAN)
+ void TearDown() override // Only compiled when ENABLE(OILPAN) is set.
+ {
+ Heap::collectAllGarbage(); // Asks the heap to collect all garbage once the test has finished.
+ }
+#endif
+
+ Document& document() const { return m_dummyPageHolder->document(); } // The page holder's Document.
+
+ void setHtmlInnerHTML(const char*); // Sets the document element's inner HTML, then updates all lifecycle phases.
esprehn 2015/10/26 21:43:09 const String&
wychen 2015/10/27 23:52:12 Done.
+
+private:
+ OwnPtr<DummyPageHolder> m_dummyPageHolder; // Assigned in SetUp().
+};
+
+void DocumentStatisticsCollectorTest::SetUp()
+{
+    // Build the dummy page used by the test, passing an 800x600 IntSize
+    // (presumably the initial view size -- confirm against DummyPageHolder).
+    const IntSize pageSize(800, 600);
+    m_dummyPageHolder = DummyPageHolder::create(pageSize);
+}
+
+void DocumentStatisticsCollectorTest::setHtmlInnerHTML(const char* htmlContent) // htmlContent is decoded as UTF-8 below.
+{
+ document().documentElement()->setInnerHTML(String::fromUTF8(htmlContent), ASSERT_NO_EXCEPTION); // Replace the markup under the document element.
esprehn 2015/10/26 21:43:09 from fromtUTF8
wychen 2015/10/27 23:52:12 I'm not quite sure I understand this comment. For
+ document().view()->updateAllLifecyclePhases(); // Kept deliberately: per the author's reply below, omitting it trips an assertion.
esprehn 2015/10/26 21:43:09 remove this, you don't need it.
wychen 2015/10/27 23:52:12 Without this line, there's an assertion error: ASS
+}
+
+// Verifies that a <meta property='og:type'> tag whose content is "article"
+// sets the openGraph feature, even when the word is written in mixed case.
+TEST_F(DocumentStatisticsCollectorTest, HasOpenGraphArticle)
+{
+    setHtmlInnerHTML(
+        "<head>"
+        // Note the case-insensitive matching of the word "article".
+        " <meta property='og:type' content='arTiclE' />"
+        "</head>"
+    );
+    const WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
+
+    // openGraph is a yes/no feature, so assert on it directly instead of
+    // comparing it against a boolean literal.
+    EXPECT_TRUE(features.openGraph);
+}
+
+// Verifies that an og:type meta tag whose content is something other than
+// "article" ('movie' here) leaves the openGraph feature unset.
+TEST_F(DocumentStatisticsCollectorTest, NoOpenGraphArticle)
+{
+    setHtmlInnerHTML(
+        "<head>"
+        " <meta property='og:type' content='movie' />"
+        "</head>"
+    );
+    const WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
+
+    // openGraph is a yes/no feature, so assert on it directly instead of
+    // comparing it against a boolean literal.
+    EXPECT_FALSE(features.openGraph);
+}
+
+// Verifies the per-tag counters reported for a small, fixed piece of markup
+// that contains no Open Graph metadata.
+TEST_F(DocumentStatisticsCollectorTest, CountElements)
+{
+    setHtmlInnerHTML(
+        "<form>"
+        " <input type='text'>"
+        " <input type='password'>"
+        "</form>"
+        "<pre></pre>"
+        "<p><a> </a></p>"
+        "<ul><li><p><a> </a></p></li></ul>"
+    );
+    const WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
+
+    // No og:type meta tag was supplied.
+    EXPECT_FALSE(features.openGraph);
+
+    // The markup above spells out ten elements:
+    // form, 2 x input, pre, 2 x p, 2 x a, ul and li.
+    EXPECT_EQ(10u, features.elementCount);
+    EXPECT_EQ(2u, features.anchorCount);
+    EXPECT_EQ(1u, features.formCount);
+    EXPECT_EQ(1u, features.textInputCount);
+    EXPECT_EQ(1u, features.passwordInputCount);
+    EXPECT_EQ(2u, features.pCount);
+    EXPECT_EQ(1u, features.preCount);
+}
+
+// Verifies the three text-length scores. The trailing comment on each
+// paragraph gives its trimmed text length and, where applicable, why it is
+// expected to be left out of the scores.
+TEST_F(DocumentStatisticsCollectorTest, CountScore)
+{
+    setHtmlInnerHTML(
+        "<p class='menu' id='article'> 1 </p>" // trimmedTextContentLength = 1
+        "<ul><li><p>12</p></li></ul>" // trimmedTextContentLength = 2, skipped because under li
+        "<p class='menu'>123</p>" // trimmedTextContentLength = 3, skipped because unlikelyCandidates
+        "<p>"
+        "12345678901234567890123456789012345678901234567890"
+        "12345678901234567890123456789012345678901234567890"
+        "12345678901234567890123456789012345678901234"
+        "</p>" // trimmedTextContentLength = 144
+        "<p style='display:none'>12345</p>" // trimmedTextContentLength = 5, skipped because invisible
+        "<div style='visibility:hidden'><p>123456</p></div>" // trimmedTextContentLength = 6, skipped because invisible
+        "<p style='opacity:0'>1234567</p>" // trimmedTextContentLength = 7, skipped because invisible
+        "<p> <a href='#'> 12345 </a> 9 <b> </b> </p>" // trimmedTextContentLength = 9
+    );
+    const WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
+
+    // Length subtracted from a paragraph before it contributes to mozScore;
+    // only the 144-character paragraph exceeds it.
+    const unsigned kParagraphLengthThreshold = 140;
+
+    // Expected value first, actual value second, matching the argument order
+    // of the other assertions in this file.
+    EXPECT_DOUBLE_EQ(sqrt(144 - kParagraphLengthThreshold), features.mozScore);
+    // The three counted paragraphs have lengths 1, 144 and 9.
+    EXPECT_DOUBLE_EQ(1 + sqrt(144) + sqrt(9), features.mozScoreAllSqrt);
+    EXPECT_DOUBLE_EQ(1 + 144 + 9, features.mozScoreAllLinear);
+}
+
+} // namespace blink

Powered by Google App Engine
This is Rietveld 408576698