Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(462)

Side by Side Diff: java/org/chromium/distiller/PageParamInfo.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: addr all comments Created 5 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2015 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import java.util.ArrayList; 7 import java.util.ArrayList;
8 import java.util.HashSet;
8 import java.util.List; 9 import java.util.List;
10 import java.util.Set;
9 11
10 /** 12 /**
11 * This class stores information about the page parameter detected from potentia l pagination URLs 13 * This class stores information about the page parameter detected from potentia l pagination URLs
12 * with numeric anchor text: 14 * with numeric anchor text:
13 * - type of page parameter detected 15 * - type of page parameter detected
14 * - list of pagination URLs with their page numbers 16 * - list of pagination URLs with their page numbers
15 * - coefficient and delta values of the linear formula formed by the pagination URLs: 17 * - coefficient and delta values of the linear formula formed by the pagination URLs:
16 * pageParamValue = coefficient * pageNum + delta 18 * pageParamValue = coefficient * pageNum + delta.
17 * - detected single page URL.
18 */ 19 */
19 class PageParamInfo implements Cloneable { 20 class PageParamInfo implements Cloneable {
21 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;
22
23 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;
cjhopman 2015/04/24 02:38:56 can these be private?
kuan 2015/04/24 17:38:00 i'm using them in PageParamInfoTest.java.
24 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;
20 25
21 /** 26 /**
22 * Stores potential pagination info: 27 * Stores potential pagination info:
23 * - page number represented as original plain text in document URL 28 * - page number represented as original plain text in document URL
24 * - if the info is extracted from an anchor, its HRef. 29 * - if the info is extracted from an anchor, its HRef.
25 */ 30 */
26 static class PageInfo implements Comparable<PageInfo> { 31 static class PageInfo implements Comparable<PageInfo> {
27 int mPageNum; 32 int mPageNum;
28 String mUrl; 33 String mUrl;
29 34
(...skipping 13 matching lines...) Expand all
43 48
44 @Override 49 @Override
45 public String toString() { // For debugging. 50 public String toString() { // For debugging.
46 return new String("pg" + mPageNum + ": " + mUrl); 51 return new String("pg" + mPageNum + ": " + mUrl);
47 } 52 }
48 } 53 }
49 54
50 /** 55 /**
51 * Stores the coefficient and delta values of the linear formula: 56 * Stores the coefficient and delta values of the linear formula:
52 * pageParamValue = coefficient * pageNum + delta. 57 * pageParamValue = coefficient * pageNum + delta.
53 * See PageParamterDetector.getPageParamLinearFormula() for details. 58 * See getLinearFormula() for details.
54 */ 59 */
55 static class LinearFormula { 60 static class LinearFormula {
56 final int mCoefficient; 61 final int mCoefficient;
57 final int mDelta; 62 final int mDelta;
58 63
59 LinearFormula(int coefficient, int delta) { 64 LinearFormula(int coefficient, int delta) {
60 mCoefficient = coefficient; 65 mCoefficient = coefficient;
61 mDelta = delta; 66 mDelta = delta;
62 } 67 }
63 68
64 @Override 69 @Override
65 public String toString() { // For debugging. 70 public String toString() { // For debugging.
66 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt a); 71 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt a);
67 } 72 }
68 } 73 }
69 74
70 /** 75 /**
71 * Types of page parameter values in paging URLs. 76 * Types of page parameter values in paging URLs.
72 */ 77 */
73 static enum Type { 78 static enum Type {
74 UNSET, // Initialized type to indicate empty PageParamInfo. 79 UNSET, // Initialized type to indicate empty PageParamInfo.
75 PAGE_NUMBER, // Value is a page number. 80 PAGE_NUMBER, // Value is a page number.
76 UNKNOWN, // None of the above. 81 UNKNOWN, // None of the above.
77 } 82 }
78 83
79 Type mType = Type.UNSET; 84 Type mType = Type.UNSET;
cjhopman 2015/04/24 02:38:56 All these private?
kuan 2015/04/24 17:38:00 i'll be using them in upcoming PageParameterDetect
80 List<PageInfo> mAllPageInfo = new ArrayList<PageInfo>(); 85 List<PageInfo> mAllPageInfo = new ArrayList<PageInfo>();
81 LinearFormula mFormula = null; 86 LinearFormula mFormula = null;
82 String mSinglePageUrl = ""; 87
88 PageParamInfo() {
89 }
90
91 PageParamInfo(Type type, LinearFormula formula) {
92 mType = type;
93 mFormula = formula;
94 }
95
96 /**
97 * Evaluates if the given list of PageLinkInfo's is a list of paging URLs:
98 * - page numbers in list of PageLinkInfo's must be adjacent
99 * - page numbers in list of ascending numbers must either
100 * - be consecutive and form a page number sequence, or
101 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b
102 * - if there's only 1 PageLinkInfo, the first ascending number must be page 1, first page URL
103 * must match page pattern, and the only outlink must be 2nd or 3rd page.
104 *
105 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.
106 *
107 * @param allLinkInfo the list of PageLinkInfo's to evaluate
108 * @param pagePattern the URL pattern to use
109 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
110 * @param firstPageUrl the URL of the PageInfo with mPageNum=1
111 */
112 static PageParamInfo evaluate(PageParameterDetector.PagePattern pagePattern,
113 List<PageLinkInfo> allLinkInfo, List<PageInfo> ascendingNumbers, Str ing firstPageUrl) {
114 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {
115 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers);
116 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;
117
118 LinearFormula linearFormula = getLinearFormula(allLinkInfo);
119
120 // Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page
121 // number sequence.
122 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null;
cjhopman 2015/04/24 02:38:56 nit: can this be "== 0" like when checking a flag
kuan 2015/04/24 17:38:00 Done. duh, what i did was weird :(
123 if (!isPageNumberSequence(ascendingNumbers)) return null;
124 PageParamInfo pageParamInfo = new PageParamInfo(Type.PAGE_NUMBER, li nearFormula);
cjhopman 2015/04/24 02:38:56 Can the PageParamInfo constructor take a List<Page
kuan 2015/04/24 17:38:00 Done.
125 final List<PageInfo> allPageInfo = pageParamInfo.mAllPageInfo;
126 for (PageLinkInfo link : allLinkInfo) {
127 allPageInfo.add(new PageInfo(link.mPageNum,
128 ascendingNumbers.get(link.mPosInAscendingList).mUrl));
129 }
130 return pageParamInfo;
131 }
132
133 // Most of news article have no more than 3 pages and the first page pro bably doesn't have
134 // any page parameter. If the first page url matches the the page patte rn, we treat it as
135 // the first page of this pattern.
136 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {
137 final PageLinkInfo onlyLink = allLinkInfo.get(0);
138 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&
139 onlyLink.mPosInAscendingList == 1;
140 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&
141 onlyLink.mPosInAscendingList == 2 &&
142 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3
143 // elements; check if previous element is previous page.
144 ascendingNumbers.get(1).mPageNum == 2;
145 // 1 PageLinkInfo means ascendingNumbers has >= 1 element.
146 if (ascendingNumbers.get(0).mPageNum == 1 &&
147 (secondPageIsOutlink || thirdPageIsOutlink) &&
148 pagePattern.isPagingUrl(firstPageUrl)) {
149 // Has valid PageParamInfo, create and populate it.
150 int coefficient;
151 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;
152 if (delta == 0 || delta == 1) {
153 coefficient = 1;
154 } else {
155 coefficient = onlyLink.mPageParamValue;
156 delta = 0;
157 }
158 PageParamInfo pageParamInfo = new PageParamInfo(Type.PAGE_NUMBER ,
159 new LinearFormula(coefficient, delta));
160 final List<PageInfo> allPageInfo = pageParamInfo.mAllPageInfo;
161 allPageInfo.add(new PageInfo(1, firstPageUrl));
162 allPageInfo.add(new PageInfo(onlyLink.mPageNum,
163 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) );
164 return pageParamInfo;
165 }
166 }
167
168 return null;
169 }
83 170
84 /** 171 /**
85 * Returns a new PageParamInfo that is a clone of this object. 172 * Returns a new PageParamInfo that is a clone of this object.
86 */ 173 */
87 public Object clone() { 174 public Object clone() {
88 PageParamInfo info = new PageParamInfo(); 175 PageParamInfo info = new PageParamInfo(this.mType, this.mFormula);
89 info.mType = this.mType;
90 info.mAllPageInfo = this.mAllPageInfo; 176 info.mAllPageInfo = this.mAllPageInfo;
91 info.mFormula = this.mFormula;
92 info.mSinglePageUrl = this.mSinglePageUrl;
93 return info; 177 return info;
94 } 178 }
95 179
96 /** 180 /**
97 * Evaluates which PageParamInfo is better based on the properties of PagePa ramInfo. 181 * Evaluates which PageParamInfo is better based on the properties of PagePa ramInfo.
98 * We prefer the one if the list of PageParameterDetector.LinkInfo forms a l inear formula, see 182 * We prefer the one if the list of PageLinkInfo forms a linear formula, see getLinearFormula().
99 * PageParamterDetector.getPageParamLinearFormula().
100 * 183 *
101 * Returns 1 if this is better, -1 if other is better and 0 if they are equa l. 184 * Returns 1 if this is better, -1 if other is better and 0 if they are equa l.
102 * 185 *
103 */ 186 */
104 int compareTo(PageParamInfo other) { 187 int compareTo(PageParamInfo other) {
105 // We prefer the one where the LinkInfo array forms a linear formula, se e isLinearFormula. 188 // We prefer the one where the LinkInfo array forms a linear formula, se e isLinearFormula.
106 if (mFormula != null && other.mFormula == null) return 1; 189 if (mFormula != null && other.mFormula == null) return 1;
107 if (mFormula == null && other.mFormula != null) return -1; 190 if (mFormula == null && other.mFormula != null) return -1;
108 191
109 if (mType == other.mType) return 0; 192 if (mType == other.mType) return 0;
110 193
111 // For different page param types, we prefer PAGE_NUMBER. 194 // For different page param types, we prefer PAGE_NUMBER.
112 if (mType == Type.PAGE_NUMBER) return 1; 195 if (mType == Type.PAGE_NUMBER) return 1;
113 if (other.mType == Type.PAGE_NUMBER) return -1; 196 if (other.mType == Type.PAGE_NUMBER) return -1;
114 // Can't decide as they have unknown page type. 197 // Can't decide as they have unknown page type.
115 return 0; 198 return 0;
116 } 199 }
117 200
118 @Override 201 @Override
119 public String toString() { // For debugging. 202 public String toString() { // For debugging.
120 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo. size()); 203 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo. size());
121 for (PageInfo page : mAllPageInfo) { 204 for (PageInfo page : mAllPageInfo) {
122 str += "\n " + page.toString(); 205 str += "\n " + page.toString();
123 } 206 }
124 str += "\nsinglePageUrl: " + mSinglePageUrl + 207 str += "\nformula: " + (mFormula == null ? "null" : mFormula.toString());
125 "\nformula: " + (mFormula == null ? "null" : mFormula.toString()) ;
126 return str; 208 return str;
127 } 209 }
128 210
211 /**
212 * Detects if page numbers in list of PageLinkInfo's are adjacent, and if pa ge numbers in list
213 * of PageInfo's are consecutive.
214 *
215 * For adjacency, the page numbers in list of PageLinkInfo's must either be adjacent, or
216 * separated by at most 1 plain text number which must represent the current page number in one
217 * of the PageInfo's.
218 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list
219 * of PageLinkInfo's, or between a PageLinkInfo and a plain text number.
220 *
221 * Returns a int value that is a combination of bits:
222 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent
223 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive.
224 *
225 * @param allLinkInfo the list of PageLinkInfo's to evaluate
226 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
227 */
228 static int arePageNumsAdjacentAndConsecutive(List<PageLinkInfo> allLinkInfo,
229 List<PageInfo> ascendingNumbers) {
230 int result = 0;
231
232 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is
233 // current page number respresented in plain text.
234 int firstPos = -1;
235 int lastPos = -1;
236 int gapPos = -1;
237 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique.
238 for (PageLinkInfo linkInfo : allLinkInfo) {
239 final int currPos = linkInfo.mPosInAscendingList;
240 if (lastPos == -1) {
241 firstPos = currPos;
242 } else if (currPos != lastPos + 1) {
243 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6
244 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not
245 // adjacent.
246 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result;
247 gapPos = currPos - 1;
248 }
249 // Make sure page param value, i.e. page number represented in plain text, is unique.
250 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;
251 lastPos = currPos;
252 } // for all LinkInfo's
253
254 result |= PAGE_NUM_ADJACENT_MASK;
255
256 // Now, determine if page numbers in ascendingNumbers are consecutive.
257
258 // First, handle the gap.
259 if (gapPos != -1) {
260 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt;
261 // The "gap" should represent current page number in plain text.
262 // Check if its adjacent page numbers are consecutive.
263 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.
264 // This can eliminate links affecting the number of items on a page.
265 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;
266 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&
267 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {
268 return result | PAGE_NUM_CONSECUTIVE_MASK;
269 }
270 return result;
271 }
272
273 // There is no gap. Check if at least one of the following cases is sat isfied:
274 // Case #1: "[1] [2] ..." or "1 [2] ... ".
275 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&
276 ascendingNumbers.get(1).mPageNum == 2) {
277 return result | PAGE_NUM_CONSECUTIVE_MASK;
278 }
279 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.
280 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&
281 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) {
282 return result | PAGE_NUM_CONSECUTIVE_MASK;
283 }
284 // Case #3: "... [n-1] [n]" or "... [n - 1] n".
285 final int numbersSize = ascendingNumbers.size();
286 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) &&
287 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==
288 ascendingNumbers.get(numbersSize - 1).mPageNum) {
289 return result | PAGE_NUM_CONSECUTIVE_MASK;
290 }
291 // Case #4: "... [i-1] [i] [i+1] ...".
292 for (int i = firstPos + 1; i < lastPos; i++) {
293 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) {
294 return result | PAGE_NUM_CONSECUTIVE_MASK;
295 }
296 }
297
298 // Otherwise, there's no pair of consecutive values.
299 return result;
300 }
301
302 /**
303 * Determines if the list of PageLinkInfo's form a linear formula:
304 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0).
305 *
306 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2
307 * PageLinkInfo's, and then validated against the remaining PageLinkInfo's.
308 * The order of page numbers doesn't matter.
309 *
310 * Returns LinearFormula, containing the coefficient and delta, if the page
311 * parameter forumla could be determined. Otherwise, returns null.
312 *
313 * @param allLinkInfo the list of PageLinkInfo's to evaluate
314 */
315 // TODO(kuan): As this gets rolled out, reassesss the necessity of non-1 coe fficient support.
316 private static LinearFormula getLinearFormula(List<PageLinkInfo> allLinkInfo ) {
317 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;
318
319 final PageLinkInfo firstLink = allLinkInfo.get(0);
320 final PageLinkInfo secondLink = allLinkInfo.get(1);
321
322 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) {
323 return null;
324 }
325
326 int deltaX = secondLink.mPageNum - firstLink.mPageNum;
327 if (deltaX == 0) return null;
328
329 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;
330 int coefficient = deltaY / deltaX;
331 if (coefficient == 0) return null;
332
333 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ;
334 if (delta != 0 && delta != -coefficient) return null;
335
336 // Check if the remaining elements are on the same linear map.
337 for (int i = 2; i < allLinkInfo.size(); i++) {
338 final PageLinkInfo link = allLinkInfo.get(i);
339 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null;
340 }
341
342 return new LinearFormula(coefficient, delta);
343 }
344
345 /**
346 * Returns true if page numbers in list of PageInfo's form a sequence, based on a pipeline of
347 * rules:
348 * - first PageInfo must have a URL unless it is the first page
349 * - there's only one plain number without URL in list
350 * - if only two pages, they must be siblings - 2nd page number must follow 1st
351 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be
352 * head/tail or have URLs.
353 *
354 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
355 */
356 private static boolean isPageNumberSequence(List<PageInfo> ascendingNumbers) {
357 if (ascendingNumbers.size() <= 1) return false;
358
359 // The first one must have a URL unless it is the first page.
360 final PageInfo firstPage = ascendingNumbers.get(0);
361 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;
362
363 // There's only one plain number without URL in ascending numbers group.
364 boolean hasPlainNum = false;
365 for (PageInfo page : ascendingNumbers) {
366 if (page.mUrl.isEmpty()) {
367 if (hasPlainNum) return false;
368 hasPlainNum = true;
369 }
370 }
371
372 // If there are only two pages, they must be siblings.
373 if (ascendingNumbers.size() == 2) {
374 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;
375 }
376
377 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e.
378 for (int i = 1; i < ascendingNumbers.size(); i++) {
379 // If two adjacent numbers are not consecutive, we accept them only when:
380 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n].
381 // 2) both of them have URLs.
382 final PageInfo currPage = ascendingNumbers.get(i);
383 final PageInfo prevPage = ascendingNumbers.get(i - 1);
384 if (currPage.mPageNum - prevPage.mPageNum != 1) {
385 if (i != 1 && i != ascendingNumbers.size() - 1) return false;
386 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse;
387 }
388 }
389
390 return true;
391 }
392
129 } 393 }
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698