OLD | NEW |
---|---|
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import java.util.ArrayList; | 7 import java.util.ArrayList; |
8 import java.util.HashSet; | |
8 import java.util.List; | 9 import java.util.List; |
10 import java.util.Set; | |
9 | 11 |
10 /** | 12 /** |
11 * This class stores information about the page parameter detected from potentia l pagination URLs | 13 * This class stores information about the page parameter detected from potentia l pagination URLs |
12 * with numeric anchor text: | 14 * with numeric anchor text: |
13 * - type of page parameter detected | 15 * - type of page parameter detected |
14 * - list of pagination URLs with their page numbers | 16 * - list of pagination URLs with their page numbers |
15 * - coefficient and delta values of the linear formula formed by the pagination URLs: | 17 * - coefficient and delta values of the linear formula formed by the pagination URLs: |
16 * pageParamValue = coefficient * pageNum + delta | 18 * pageParamValue = coefficient * pageNum + delta. |
17 * - detected single page URL. | |
18 */ | 19 */ |
19 class PageParamInfo implements Cloneable { | 20 class PageParamInfo implements Cloneable { |
21 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; | |
22 | |
23 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; | |
cjhopman
2015/04/24 02:38:56
can these be private?
kuan
2015/04/24 17:38:00
i'm using them in PageParamInfoTest.java.
| |
24 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; | |
20 | 25 |
21 /** | 26 /** |
22 * Stores potential pagination info: | 27 * Stores potential pagination info: |
23 * - page number represented as original plain text in document URL | 28 * - page number represented as original plain text in document URL |
24 * - if the info is extracted from an anchor, its HRef. | 29 * - if the info is extracted from an anchor, its HRef. |
25 */ | 30 */ |
26 static class PageInfo implements Comparable<PageInfo> { | 31 static class PageInfo implements Comparable<PageInfo> { |
27 int mPageNum; | 32 int mPageNum; |
28 String mUrl; | 33 String mUrl; |
29 | 34 |
(...skipping 13 matching lines...) Expand all Loading... | |
43 | 48 |
44 @Override | 49 @Override |
45 public String toString() { // For debugging. | 50 public String toString() { // For debugging. |
46 return new String("pg" + mPageNum + ": " + mUrl); | 51 return new String("pg" + mPageNum + ": " + mUrl); |
47 } | 52 } |
48 } | 53 } |
49 | 54 |
50 /** | 55 /** |
51 * Stores the coefficient and delta values of the linear formula: | 56 * Stores the coefficient and delta values of the linear formula: |
52 * pageParamValue = coefficient * pageNum + delta. | 57 * pageParamValue = coefficient * pageNum + delta. |
53 * See PageParamterDetector.getPageParamLinearFormula() for details. | 58 * See getLinearFormula() for details. |
54 */ | 59 */ |
55 static class LinearFormula { | 60 static class LinearFormula { |
56 final int mCoefficient; | 61 final int mCoefficient; |
57 final int mDelta; | 62 final int mDelta; |
58 | 63 |
59 LinearFormula(int coefficient, int delta) { | 64 LinearFormula(int coefficient, int delta) { |
60 mCoefficient = coefficient; | 65 mCoefficient = coefficient; |
61 mDelta = delta; | 66 mDelta = delta; |
62 } | 67 } |
63 | 68 |
64 @Override | 69 @Override |
65 public String toString() { // For debugging. | 70 public String toString() { // For debugging. |
66 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt a); | 71 return new String("coefficient=" + mCoefficient + ", delta=" + mDelt a); |
67 } | 72 } |
68 } | 73 } |
69 | 74 |
70 /** | 75 /** |
71 * Types of page parameter values in paging URLs. | 76 * Types of page parameter values in paging URLs. |
72 */ | 77 */ |
73 static enum Type { | 78 static enum Type { |
74 UNSET, // Initialized type to indicate empty PageParamInfo. | 79 UNSET, // Initialized type to indicate empty PageParamInfo. |
75 PAGE_NUMBER, // Value is a page number. | 80 PAGE_NUMBER, // Value is a page number. |
76 UNKNOWN, // None of the above. | 81 UNKNOWN, // None of the above. |
77 } | 82 } |
78 | 83 |
79 Type mType = Type.UNSET; | 84 Type mType = Type.UNSET; |
cjhopman
2015/04/24 02:38:56
All these private?
kuan
2015/04/24 17:38:00
i'll be using them in upcoming PageParameterDetect
| |
80 List<PageInfo> mAllPageInfo = new ArrayList<PageInfo>(); | 85 List<PageInfo> mAllPageInfo = new ArrayList<PageInfo>(); |
81 LinearFormula mFormula = null; | 86 LinearFormula mFormula = null; |
82 String mSinglePageUrl = ""; | 87 |
88 PageParamInfo() { | |
89 } | |
90 | |
91 PageParamInfo(Type type, LinearFormula formula) { | |
92 mType = type; | |
93 mFormula = formula; | |
94 } | |
95 | |
96 /** | |
97 * Evaluates if the given list of PageLinkInfo's is a list of paging URLs: | |
98 * - page numbers in list of PageLinkInfo's must be adjacent | |
99 * - page numbers in list of ascending numbers must either | |
100 * - be consecutive and form a page number sequence, or | |
101 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b | |
102 * - if there's only 1 PageLinkInfo, the first ascending number must be page 1, first page URL | |
103 * must match page pattern, and the only outlink must be 2nd or 3rd page. | |
104 * | |
105 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. | |
106 * | |
107 * @param allLinkInfo the list of PageLinkInfo's to evaluate | |
108 * @param pagePattern the URL pattern to use | |
109 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
110 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 | |
111 */ | |
112 static PageParamInfo evaluate(PageParameterDetector.PagePattern pagePattern, | |
113 List<PageLinkInfo> allLinkInfo, List<PageInfo> ascendingNumbers, Str ing firstPageUrl) { | |
114 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { | |
115 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers); | |
116 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; | |
117 | |
118 LinearFormula linearFormula = getLinearFormula(allLinkInfo); | |
119 | |
120 // Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page | |
121 // number sequence. | |
122 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null; | |
cjhopman
2015/04/24 02:38:56
nit: can this be "== 0" like when checking a flag
kuan
2015/04/24 17:38:00
Done. duh, what i did was weird :(
| |
123 if (!isPageNumberSequence(ascendingNumbers)) return null; | |
124 PageParamInfo pageParamInfo = new PageParamInfo(Type.PAGE_NUMBER, li nearFormula); | |
cjhopman
2015/04/24 02:38:56
Can the PageParamInfo constructor take a List<Page
kuan
2015/04/24 17:38:00
Done.
| |
125 final List<PageInfo> allPageInfo = pageParamInfo.mAllPageInfo; | |
126 for (PageLinkInfo link : allLinkInfo) { | |
127 allPageInfo.add(new PageInfo(link.mPageNum, | |
128 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); | |
129 } | |
130 return pageParamInfo; | |
131 } | |
132 | |
133 // Most of news article have no more than 3 pages and the first page pro bably doesn't have | |
134 // any page parameter. If the first page url matches the the page patte rn, we treat it as | |
135 // the first page of this pattern. | |
136 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { | |
137 final PageLinkInfo onlyLink = allLinkInfo.get(0); | |
138 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && | |
139 onlyLink.mPosInAscendingList == 1; | |
140 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && | |
141 onlyLink.mPosInAscendingList == 2 && | |
142 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3 | |
143 // elements; check if previous element is previous page. | |
144 ascendingNumbers.get(1).mPageNum == 2; | |
145 // 1 PageLinkInfo means ascendingNumbers has >= 1 element. | |
146 if (ascendingNumbers.get(0).mPageNum == 1 && | |
147 (secondPageIsOutlink || thirdPageIsOutlink) && | |
148 pagePattern.isPagingUrl(firstPageUrl)) { | |
149 // Has valid PageParamInfo, create and populate it. | |
150 int coefficient; | |
151 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; | |
152 if (delta == 0 || delta == 1) { | |
153 coefficient = 1; | |
154 } else { | |
155 coefficient = onlyLink.mPageParamValue; | |
156 delta = 0; | |
157 } | |
158 PageParamInfo pageParamInfo = new PageParamInfo(Type.PAGE_NUMBER , | |
159 new LinearFormula(coefficient, delta)); | |
160 final List<PageInfo> allPageInfo = pageParamInfo.mAllPageInfo; | |
161 allPageInfo.add(new PageInfo(1, firstPageUrl)); | |
162 allPageInfo.add(new PageInfo(onlyLink.mPageNum, | |
163 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) ); | |
164 return pageParamInfo; | |
165 } | |
166 } | |
167 | |
168 return null; | |
169 } | |
83 | 170 |
84 /** | 171 /** |
85 * Returns a new PageParamInfo that is a clone of this object. | 172 * Returns a new PageParamInfo that is a clone of this object. |
86 */ | 173 */ |
87 public Object clone() { | 174 public Object clone() { |
88 PageParamInfo info = new PageParamInfo(); | 175 PageParamInfo info = new PageParamInfo(this.mType, this.mFormula); |
89 info.mType = this.mType; | |
90 info.mAllPageInfo = this.mAllPageInfo; | 176 info.mAllPageInfo = this.mAllPageInfo; |
91 info.mFormula = this.mFormula; | |
92 info.mSinglePageUrl = this.mSinglePageUrl; | |
93 return info; | 177 return info; |
94 } | 178 } |
95 | 179 |
96 /** | 180 /** |
97 * Evaluates which PageParamInfo is better based on the properties of PagePa ramInfo. | 181 * Evaluates which PageParamInfo is better based on the properties of PagePa ramInfo. |
98 * We prefer the one if the list of PageParameterDetector.LinkInfo forms a l inear formula, see | 182 * We prefer the one if the list of PageLinkInfo forms a linear formula, see getLinearFormula(). |
99 * PageParamterDetector.getPageParamLinearFormula(). | |
100 * | 183 * |
101 * Returns 1 if this is better, -1 if other is better and 0 if they are equa l. | 184 * Returns 1 if this is better, -1 if other is better and 0 if they are equa l. |
102 * | 185 * |
103 */ | 186 */ |
104 int compareTo(PageParamInfo other) { | 187 int compareTo(PageParamInfo other) { |
105 // We prefer the one where the LinkInfo array forms a linear formula, se e isLinearFormula. | 188 // We prefer the one where the LinkInfo array forms a linear formula, se e isLinearFormula. |
106 if (mFormula != null && other.mFormula == null) return 1; | 189 if (mFormula != null && other.mFormula == null) return 1; |
107 if (mFormula == null && other.mFormula != null) return -1; | 190 if (mFormula == null && other.mFormula != null) return -1; |
108 | 191 |
109 if (mType == other.mType) return 0; | 192 if (mType == other.mType) return 0; |
110 | 193 |
111 // For different page param types, we prefer PAGE_NUMBER. | 194 // For different page param types, we prefer PAGE_NUMBER. |
112 if (mType == Type.PAGE_NUMBER) return 1; | 195 if (mType == Type.PAGE_NUMBER) return 1; |
113 if (other.mType == Type.PAGE_NUMBER) return -1; | 196 if (other.mType == Type.PAGE_NUMBER) return -1; |
114 // Can't decide as they have unknown page type. | 197 // Can't decide as they have unknown page type. |
115 return 0; | 198 return 0; |
116 } | 199 } |
117 | 200 |
118 @Override | 201 @Override |
119 public String toString() { // For debugging. | 202 public String toString() { // For debugging. |
120 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo. size()); | 203 String str = new String("Type: " + mType + "\nPageInfo: " + mAllPageInfo. size()); |
121 for (PageInfo page : mAllPageInfo) { | 204 for (PageInfo page : mAllPageInfo) { |
122 str += "\n " + page.toString(); | 205 str += "\n " + page.toString(); |
123 } | 206 } |
124 str += "\nsinglePageUrl: " + mSinglePageUrl + | 207 str += "\nformula: " + (mFormula == null ? "null" : mFormula.toString()); |
125 "\nformula: " + (mFormula == null ? "null" : mFormula.toString()) ; | |
126 return str; | 208 return str; |
127 } | 209 } |
128 | 210 |
211 /** | |
212 * Detects if page numbers in list of PageLinkInfo's are adjacent, and if pa ge numbers in list | |
213 * of PageInfo's are consecutive. | |
214 * | |
215 * For adjacency, the page numbers in list of PageLinkInfo's must either be adjacent, or | |
216 * separated by at most 1 plain text number which must represent the current page number in one | |
217 * of the PageInfo's. | |
218 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list | |
219 * of PageLinkInfo's, or between a PageLinkInfo and a plain text number. | |
220 * | |
221 * Returns a int value that is a combination of bits: | |
222 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent | |
223 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive. | |
224 * | |
225 * @param allLinkInfo the list of PageLinkInfo's to evaluate | |
226 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
227 */ | |
228 static int arePageNumsAdjacentAndConsecutive(List<PageLinkInfo> allLinkInfo, | |
229 List<PageInfo> ascendingNumbers) { | |
230 int result = 0; | |
231 | |
232 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is | |
233 // current page number respresented in plain text. | |
234 int firstPos = -1; | |
235 int lastPos = -1; | |
236 int gapPos = -1; | |
237 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique. | |
238 for (PageLinkInfo linkInfo : allLinkInfo) { | |
239 final int currPos = linkInfo.mPosInAscendingList; | |
240 if (lastPos == -1) { | |
241 firstPos = currPos; | |
242 } else if (currPos != lastPos + 1) { | |
243 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 | |
244 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not | |
245 // adjacent. | |
246 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result; | |
247 gapPos = currPos - 1; | |
248 } | |
249 // Make sure page param value, i.e. page number represented in plain text, is unique. | |
250 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; | |
251 lastPos = currPos; | |
252 } // for all LinkInfo's | |
253 | |
254 result |= PAGE_NUM_ADJACENT_MASK; | |
255 | |
256 // Now, determine if page numbers in ascendingNumbers are consecutive. | |
257 | |
258 // First, handle the gap. | |
259 if (gapPos != -1) { | |
260 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt; | |
261 // The "gap" should represent current page number in plain text. | |
262 // Check if its adjacent page numbers are consecutive. | |
263 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. | |
264 // This can eliminate links affecting the number of items on a page. | |
265 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; | |
266 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && | |
267 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { | |
268 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
269 } | |
270 return result; | |
271 } | |
272 | |
273 // There is no gap. Check if at least one of the following cases is sat isfied: | |
274 // Case #1: "[1] [2] ..." or "1 [2] ... ". | |
275 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && | |
276 ascendingNumbers.get(1).mPageNum == 2) { | |
277 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
278 } | |
279 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. | |
280 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && | |
281 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) { | |
282 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
283 } | |
284 // Case #3: "... [n-1] [n]" or "... [n - 1] n". | |
285 final int numbersSize = ascendingNumbers.size(); | |
286 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && | |
287 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == | |
288 ascendingNumbers.get(numbersSize - 1).mPageNum) { | |
289 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
290 } | |
291 // Case #4: "... [i-1] [i] [i+1] ...". | |
292 for (int i = firstPos + 1; i < lastPos; i++) { | |
293 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) { | |
294 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
295 } | |
296 } | |
297 | |
298 // Otherwise, there's no pair of consecutive values. | |
299 return result; | |
300 } | |
301 | |
302 /** | |
303 * Determines if the list of PageLinkInfo's form a linear formula: | |
304 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0). | |
305 * | |
306 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2 | |
307 * PageLinkInfo's, and then validated against the remaining PageLinkInfo's. | |
308 * The order of page numbers doesn't matter. | |
309 * | |
310 * Returns LinearFormula, containing the coefficient and delta, if the page | |
311 * parameter forumla could be determined. Otherwise, returns null. | |
312 * | |
313 * @param allLinkInfo the list of PageLinkInfo's to evaluate | |
314 */ | |
315 // TODO(kuan): As this gets rolled out, reassesss the necessity of non-1 coe fficient support. | |
316 private static LinearFormula getLinearFormula(List<PageLinkInfo> allLinkInfo ) { | |
317 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; | |
318 | |
319 final PageLinkInfo firstLink = allLinkInfo.get(0); | |
320 final PageLinkInfo secondLink = allLinkInfo.get(1); | |
321 | |
322 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) { | |
323 return null; | |
324 } | |
325 | |
326 int deltaX = secondLink.mPageNum - firstLink.mPageNum; | |
327 if (deltaX == 0) return null; | |
328 | |
329 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; | |
330 int coefficient = deltaY / deltaX; | |
331 if (coefficient == 0) return null; | |
332 | |
333 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ; | |
334 if (delta != 0 && delta != -coefficient) return null; | |
335 | |
336 // Check if the remaining elements are on the same linear map. | |
337 for (int i = 2; i < allLinkInfo.size(); i++) { | |
338 final PageLinkInfo link = allLinkInfo.get(i); | |
339 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null; | |
340 } | |
341 | |
342 return new LinearFormula(coefficient, delta); | |
343 } | |
344 | |
345 /** | |
346 * Returns true if page numbers in list of PageInfo's form a sequence, based on a pipeline of | |
347 * rules: | |
348 * - first PageInfo must have a URL unless it is the first page | |
349 * - there's only one plain number without URL in list | |
350 * - if only two pages, they must be siblings - 2nd page number must follow 1st | |
351 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be | |
352 * head/tail or have URLs. | |
353 * | |
354 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
355 */ | |
356 private static boolean isPageNumberSequence(List<PageInfo> ascendingNumbers) { | |
357 if (ascendingNumbers.size() <= 1) return false; | |
358 | |
359 // The first one must have a URL unless it is the first page. | |
360 final PageInfo firstPage = ascendingNumbers.get(0); | |
361 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; | |
362 | |
363 // There's only one plain number without URL in ascending numbers group. | |
364 boolean hasPlainNum = false; | |
365 for (PageInfo page : ascendingNumbers) { | |
366 if (page.mUrl.isEmpty()) { | |
367 if (hasPlainNum) return false; | |
368 hasPlainNum = true; | |
369 } | |
370 } | |
371 | |
372 // If there are only two pages, they must be siblings. | |
373 if (ascendingNumbers.size() == 2) { | |
374 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; | |
375 } | |
376 | |
377 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e. | |
378 for (int i = 1; i < ascendingNumbers.size(); i++) { | |
379 // If two adjacent numbers are not consecutive, we accept them only when: | |
380 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n]. | |
381 // 2) both of them have URLs. | |
382 final PageInfo currPage = ascendingNumbers.get(i); | |
383 final PageInfo prevPage = ascendingNumbers.get(i - 1); | |
384 if (currPage.mPageNum - prevPage.mPageNum != 1) { | |
385 if (i != 1 && i != ascendingNumbers.size() - 1) return false; | |
386 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse; | |
387 } | |
388 } | |
389 | |
390 return true; | |
391 } | |
392 | |
129 } | 393 } |
OLD | NEW |