Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(178)

Side by Side Diff: javatests/org/chromium/distiller/PageParameterParserTest.java

Issue 1178633002: implement parser for new pagination algorithm (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr chris's comments, fixes for dataset Created 5 years, 4 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 package org.chromium.distiller;
6
7 import com.google.gwt.dom.client.BaseElement;
8 import com.google.gwt.dom.client.Document;
9
10 public class PageParameterParserTest extends DomDistillerJsTestCase {
11 private static final String BASE_URL = "http://www.test.com/";
12 private static final String TEST_URL = BASE_URL + "foo/bar";
13
14 public void testBasic() {
15 PageParamInfo info = processDocument(
16 "1<br>" +
17 "<a href=\"/foo/bar/2\">2</a>");
18 assertEquals(2, info.mAllPageInfo.size());
19
20 info = processDocument(
21 "1<br>" +
22 "<a href=\"/foo/bar/2\">2</a>" +
23 "<a href=\"/foo/bar/3\">3</a>");
24 assertEquals(3, info.mAllPageInfo.size());
25 }
26
27 public void testRejectOnlyPage2LinkWithoutCurrentPageText() {
28 // Although there is a digital outlink to 2nd page, there is no plain te xt "1"
29 // before it, so there is no pagination.
30 PageParamInfo info = processDocument(
31 "If there were a '1', pagination should be detected. But there isn't ." +
32 "<a href=\"/foo/bar/2\">2</a>" +
33 "Main content");
34 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
35 }
36
37 public void testRejectNonAdjacentOutlinks() {
38 PageParamInfo info = processDocument(
39 "1<br>" +
40 "Unrelated terms<br>" +
41 "<a href=\"/foo/bar/2\">2</a>" +
42 "Unrelated terms<br>" +
43 "<a href=\"/foo/bar/3\">3</a>" +
44 "<a href=\"/foo/bar/all\">All</a>");
45 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
46 }
47
48 public void testAcceptAdjacentOutlinks() {
49 PageParamInfo info = processDocumentWithoutBase(
50 "Unrelated link: <a href=\"http://www.test.com/other/2\">2</a>" +
51 "<p>Main content</p>" +
52 "1<br>" +
53 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
54 "<a href=\"http://www.test.com/foo/bar/3\">3</a>",
55 TEST_URL);
56 assertEquals(3, info.mAllPageInfo.size());
57 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
58 assertEquals(1, page.mPageNum);
59 assertEquals(BASE_URL + "foo/bar", page.mUrl);
60 page = info.mAllPageInfo.get(1);
61 assertEquals(2, page.mPageNum);
62 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
63 page = info.mAllPageInfo.get(2);
64 assertEquals(3, page.mPageNum);
65 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
66 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
67 }
68
69 public void testAcceptDuplicatePatterns() {
70 PageParamInfo info = processDocument(
71 "1<br>" +
72 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
73 "<a href=\"http://www.test.com/foo/bar/3\">3</a>" +
74 "<p>Main content</p>" +
75 "1<br>" +
76 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
77 "<a href=\"http://www.test.com/foo/bar/3\">3</a>");
78 assertEquals(3, info.mAllPageInfo.size());
79 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
80 assertEquals(1, page.mPageNum);
81 assertEquals(BASE_URL + "foo/bar", page.mUrl);
82 page = info.mAllPageInfo.get(1);
83 assertEquals(2, page.mPageNum);
84 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
85 page = info.mAllPageInfo.get(2);
86 assertEquals(3, page.mPageNum);
87 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
88 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
89 }
90
91 public void testPreferPageNumber() {
92 PageParamInfo info = processDocument(
93 "<a href=\"http://www.test.com/foo/bar/size-25\">25</a>" +
94 "<a href=\"http://www.test.com/foo/bar/size-50\">50</a>" +
95 "<a href=\"http://www.test.com/foo/bar/size-100\">100</a>" +
96 "<p>Main content</p>" +
97 "1<br>" +
98 "<a href=\"http://www.test.com/foo/bar/2\">2</a>" +
99 "<a href=\"http://www.test.com/foo/bar/3\">3</a>");
100 assertEquals(PageParamInfo.Type.PAGE_NUMBER, info.mType);
101 assertEquals(3, info.mAllPageInfo.size());
102 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
103 assertEquals(1, page.mPageNum);
104 assertEquals(BASE_URL + "foo/bar", page.mUrl);
105 page = info.mAllPageInfo.get(1);
106 assertEquals(2, page.mPageNum);
107 assertEquals(BASE_URL + "foo/bar/2", page.mUrl);
108 page = info.mAllPageInfo.get(2);
109 assertEquals(3, page.mPageNum);
110 assertEquals(BASE_URL + "foo/bar/3", page.mUrl);
111 assertEquals(BASE_URL + "foo/bar/2", info.mNextPagingUrl);
112 }
113
114 public void testRejectMultiplePageNumberPatterns() {
115 PageParamInfo info = processDocumentWithoutBase(
116 "<a href=\"http://www.google.com/test/list.php?start=10\">2</a>" +
117 "<a href=\"http://www.google.com/test/list.php?start=20\">3</a>" +
118 "<a href=\"http://www.google.com/test/list.php?start=30\">4</a>" +
119 "<p>Main content</p>" +
120 "<a href=\"http://www.google.com/test/list.php?offset=10\">2</a>" +
121 "<a href=\"http://www.google.com/test/list.php?offset=20\">3</a>" +
122 "<a href=\"http://www.google.com/test/list.php?offset=30\">4</a>" +
123 "<a href=\"http://www.google.com/test/list.php?offset=all\">All</a>" ,
124 "http://www.google.com/test/list.php");
125
126 assertEquals(PageParamInfo.Type.PAGE_NUMBER, info.mType);
127 assertEquals(4, info.mAllPageInfo.size());
128 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
129 assertEquals(1, page.mPageNum);
130 assertEquals("http://www.google.com/test/list.php", page.mUrl);
131 page = info.mAllPageInfo.get(1);
132 assertEquals(2, page.mPageNum);
133 assertEquals("http://www.google.com/test/list.php?start=10", page.mUrl);
134 page = info.mAllPageInfo.get(2);
135 assertEquals(3, page.mPageNum);
136 assertEquals("http://www.google.com/test/list.php?start=20", page.mUrl);
137 page = info.mAllPageInfo.get(3);
138 assertEquals(4, page.mPageNum);
139 assertEquals("http://www.google.com/test/list.php?start=30", page.mUrl);
140 assertTrue(info.mFormula != null);
141 assertEquals(10, info.mFormula.mCoefficient);
142 assertEquals(-10, info.mFormula.mDelta);
143 assertEquals("http://www.google.com/test/list.php?start=10", info.mNextP agingUrl);
144 }
145
146 public void testInvalidAndVoidLinks() {
147 PageParamInfo info = processDocument(
148 "1<br>" +
149 "<a href=\"javascript:void(0)\">2</a>");
150 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
151 }
152
153 public void testDifferentHostLinks() {
154 PageParamInfo info = processDocumentWithoutBase(
155 "1<br>" +
156 "<a href=\"http://www.foo.com/foo/bar/2\">2</a>",
157 TEST_URL);
158 PageParameterDetectorTest.expectEmptyPageParamInfo(info);
159 }
160
161 public void testWhitespaceSibling() {
162 PageParamInfo info = processDocument(
163 "1<br>" +
164 " " +
165 "<a href=\"/foo/bar/2\">2</a>");
166 assertEquals(2, info.mAllPageInfo.size());
167 }
168
169 public void testPunctuationSibling() {
170 PageParamInfo info = processDocument(
171 "<a href=\"/foo/bar/1\">1</a>" +
172 "," +
173 "<a href=\"/foo/bar/2\">2</a>");
174 assertEquals(2, info.mAllPageInfo.size());
175 }
176
177 public void testParentSibling0() {
wychen 2015/09/21 23:08:03 Should we add tests for things like this to test s
kuan 2015/10/02 15:59:17 Done. fyi, i already had testPuncationSibling() t
178 PageParamInfo info = processDocumentWithoutBase(
179 "<div>begin" +
180 "<strong>1</strong>" +
181 "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
182 "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
183 "end</div>",
184 TEST_URL);
185 assertEquals(3, info.mAllPageInfo.size());
186 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
187 assertEquals(1, page.mPageNum);
188 assertEquals(TEST_URL, page.mUrl);
189 page = info.mAllPageInfo.get(1);
190 assertEquals(2, page.mPageNum);
191 assertEquals(TEST_URL + "/2", page.mUrl);
192 page = info.mAllPageInfo.get(2);
193 assertEquals(3, page.mPageNum);
194 assertEquals(TEST_URL + "/3", page.mUrl);
195 assertEquals("http://www.test.com/foo/bar/2", info.mNextPagingUrl);
196 }
197
198 public void testParentSibling1() {
199 PageParamInfo info = processDocumentWithoutBase(
200 "<div>begin" +
201 "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
202 "<strong>2</strong>" +
203 "<div><a href=\"http://www.test.com/foo/bar/3\">3</a></div>" +
204 "end</div>",
205 "http://www.test.com/foo/bar/2");
206 assertEquals(2, info.mAllPageInfo.size());
207 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
208 assertEquals(1, page.mPageNum);
209 assertEquals(TEST_URL, page.mUrl);
210 page = info.mAllPageInfo.get(1);
211 assertEquals(3, page.mPageNum);
212 assertEquals(TEST_URL + "/3", page.mUrl);
213 assertEquals("http://www.test.com/foo/bar/3", info.mNextPagingUrl);
214 }
215
216 public void testParentSibling2() {
217 PageParamInfo info = processDocumentWithoutBase(
218 "<div>begin" +
219 "<div><a href=\"http://www.test.com/foo/bar\">1</a></div>" +
220 "<div><a href=\"http://www.test.com/foo/bar/2\">2</a></div>" +
221 "<strong>3</strong>" +
222 "end</div>",
223 "http://www.test.com/foo/bar/3");
224 assertEquals(2, info.mAllPageInfo.size());
225 PageParamInfo.PageInfo page = info.mAllPageInfo.get(0);
226 assertEquals(1, page.mPageNum);
227 assertEquals(TEST_URL, page.mUrl);
228 page = info.mAllPageInfo.get(1);
229 assertEquals(2, page.mPageNum);
230 assertEquals(TEST_URL + "/2", page.mUrl);
231 assertTrue(info.mNextPagingUrl.isEmpty());
232 }
233
234 private PageParamInfo processDocument(String content) {
235 // Create and add a <base> element so that all anchors are based off it.
236 BaseElement baseTag = Document.get().createBaseElement();
237 baseTag.setHref(BASE_URL);
238 mHead.appendChild(baseTag);
239
240 // Append content to body.
241 mBody.setInnerHTML(content);
242
243 PageParamInfo info = PageParameterParser.parse(TEST_URL, null);
244 mHead.removeChild(baseTag);
245 return info;
246 }
247
248 private PageParamInfo processDocumentWithoutBase(String content, String orig inalUrl) {
249 // Append content to body.
250 mBody.setInnerHTML(content);
251 return PageParameterParser.parse(originalUrl, null);
252 }
253
254 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698