Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(21)

Side by Side Diff: third_party/WebKit/Source/core/dom/DocumentStatisticsCollectorTest.cpp

Issue 1419033004: Add feature extraction for distillability to Blink (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: address comments, add tests Created 5 years, 1 month ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 /*
2 * Copyright (c) 2015, Google Inc. All rights reserved.
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are
esprehn 2015/10/26 21:43:09 Use the modern short copyright.
wychen 2015/10/27 23:52:12 Done.
6 * met:
7 *
8 * * Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * * Redistributions in binary form must reproduce the above
11 * copyright notice, this list of conditions and the following disclaimer
12 * in the documentation and/or other materials provided with the
13 * distribution.
14 * * Neither the name of Google Inc. nor the names of its
15 * contributors may be used to endorse or promote products derived from
16 * this software without specific prior written permission.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 */
30
31 #include "config.h"
32 #include "core/dom/DocumentStatisticsCollector.h"
33
34 #include "core/dom/Document.h"
35 #include "core/dom/DocumentVisibilityObserver.h"
36 #include "core/frame/FrameView.h"
37 #include "core/html/HTMLHeadElement.h"
38 #include "core/html/HTMLLinkElement.h"
39 #include "core/testing/DummyPageHolder.h"
40 #include "public/platform/WebDistillability.h"
41 #include <gmock/gmock.h>
42 #include <gtest/gtest.h>
43
44 namespace blink {
45
46 class DocumentStatisticsCollectorTest : public ::testing::Test {
47 protected:
48 void SetUp() override;
49
50 #if ENABLE(OILPAN)
51 void TearDown() override
52 {
53 Heap::collectAllGarbage();
54 }
55 #endif
56
57 Document& document() const { return m_dummyPageHolder->document(); }
58
59 void setHtmlInnerHTML(const char*);
esprehn 2015/10/26 21:43:09 const String&
wychen 2015/10/27 23:52:12 Done.
60
61 private:
62 OwnPtr<DummyPageHolder> m_dummyPageHolder;
63 };
64
65 void DocumentStatisticsCollectorTest::SetUp()
66 {
67 m_dummyPageHolder = DummyPageHolder::create(IntSize(800, 600));
68 }
69
70 void DocumentStatisticsCollectorTest::setHtmlInnerHTML(const char* htmlContent)
71 {
72 document().documentElement()->setInnerHTML(String::fromUTF8(htmlContent), ASSERT_NO_EXCEPTION);
esprehn 2015/10/26 21:43:09 from fromtUTF8
wychen 2015/10/27 23:52:12 I'm not quite sure I understand this comment. For
73 document().view()->updateAllLifecyclePhases();
esprehn 2015/10/26 21:43:09 remove this, you don't need it.
wychen 2015/10/27 23:52:12 Without this line, there's an assertion error: ASS
74 }
75
76 // This test checks open graph articles can be recognized.
77 TEST_F(DocumentStatisticsCollectorTest, HasOpenGraphArticle)
78 {
79 setHtmlInnerHTML(
80 "<head>"
81 // Note the case-insensitive matching of the word "article".
82 " <meta property='og:type' content='arTiclE' />"
83 "</head>"
84 );
85 WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
86
87 EXPECT_EQ(true, features.openGraph);
88 }
89
90 // This test checks non-existence of open graph articles can be recognized.
91 TEST_F(DocumentStatisticsCollectorTest, NoOpenGraphArticle)
92 {
93 setHtmlInnerHTML(
94 "<head>"
95 " <meta property='og:type' content='movie' />"
96 "</head>"
97 );
98 WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
99
100 EXPECT_EQ(false, features.openGraph);
101 }
102
103 // This test checks element counts are correct.
104 TEST_F(DocumentStatisticsCollectorTest, CountElements)
105 {
106 setHtmlInnerHTML(
107 "<form>"
108 " <input type='text'>"
109 " <input type='password'>"
110 "</form>"
111 "<pre></pre>"
112 "<p><a> </a></p>"
113 "<ul><li><p><a> </a></p></li></ul>"
114 );
115 WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
116
117 EXPECT_EQ(false, features.openGraph);
118
119 EXPECT_EQ(10u, features.elementCount);
120 EXPECT_EQ(2u, features.anchorCount);
121 EXPECT_EQ(1u, features.formCount);
122 EXPECT_EQ(1u, features.textInputCount);
123 EXPECT_EQ(1u, features.passwordInputCount);
124 EXPECT_EQ(2u, features.pCount);
125 EXPECT_EQ(1u, features.preCount);
126 }
127
128 // This test checks score calculations are correct.
129 TEST_F(DocumentStatisticsCollectorTest, CountScore)
130 {
131 setHtmlInnerHTML(
132 "<p class='menu' id='article'> 1 </p>" // trimmedTextContentLength = 1
133 "<ul><li><p>12</p></li></ul>" // trimmedTextContentLength = 2, skipped because under li
134 "<p class='menu'>123</p>" // trimmedTextContentLength = 3, skipped because unlikelyCandidates
135 "<p>"
136 "12345678901234567890123456789012345678901234567890"
137 "12345678901234567890123456789012345678901234567890"
138 "12345678901234567890123456789012345678901234"
139 "</p>" // trimmedTextContentLength = 144
140 "<p style='display:none'>12345</p>" // trimmedTextContentLength = 5, skipped because invisible
141 "<div style='visibility:hidden'><p>123456</p></div>" // trimmedTextContentLength = 6, skipped because invisible
142 "<p style='opacity:0'>1234567</p>" // trimmedTextContentLength = 7, skipped because invisible
143 "<p> <a href='#'> 12345 </a> 9 <b> </b> </p>" // trimmedTextContentLength = 9
144 );
145 WebDistillabilityFeatures features = DocumentStatisticsCollector::collectStatistics(document());
146
147 const unsigned kParagraphLengthThreshold = 140;
148
149 EXPECT_DOUBLE_EQ(features.mozScore, sqrt(144 - kParagraphLengthThreshold));
150 EXPECT_DOUBLE_EQ(features.mozScoreAllSqrt, 1 + sqrt(144) + sqrt(9));
151 EXPECT_DOUBLE_EQ(features.mozScoreAllLinear, 1 + 144 + 9);
152 }
153
154 } // namespace blink
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698