OLD | NEW |
---|---|
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
9 | 9 |
10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
45 * <a href=http: *a/b?c=1&p=30>4</a> | 45 * <a href=http: *a/b?c=1&p=30>4</a> |
46 * <a href=http: *a/b?c=1&p=40>5</a> | 46 * <a href=http: *a/b?c=1&p=40>5</a> |
47 * <a href=http: *a/b?c=1&p=all>single page</a> | 47 * <a href=http: *a/b?c=1&p=all>single page</a> |
48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so | 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!]. | 49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!]. |
50 * Then, this class extracts the single page based on page parameter info. The single page url is | 50 * Then, this class extracts the single page based on page parameter info. The single page url is |
51 * http: *a/b?c=1&p=all. | 51 * http: *a/b?c=1&p=all. |
52 */ | 52 */ |
53 public class PageParameterDetector { | 53 public class PageParameterDetector { |
54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
55 private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER .length(); | |
56 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; | |
57 | |
58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; | |
59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; | |
55 | 60 |
56 /** | 61 /** |
57 * Stores information about the link (anchor) after the page parameter is de tected: | 62 * Stores information about the link (anchor) after the page parameter is de tected: |
58 * - the page number (as represented by the original plain text) for the lin k | 63 * - the page number (as represented by the original plain text) for the lin k |
59 * - the original page parameter numeric component in the URL (this componen t would be replaced | 64 * - the original page parameter numeric component in the URL (this componen t would be replaced |
60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | 65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
61 * - the position of this link in the list of ascending numbers. | 66 * - the position of this link in the list of ascending numbers. |
62 */ | 67 */ |
63 static class LinkInfo { | 68 static class LinkInfo { |
64 private int mPageNum; | 69 private int mPageNum; |
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); | 177 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); |
173 if (value >= 0) { | 178 if (value >= 0) { |
174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + | 179 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + |
175 urlStr.substring(matchEnd), | 180 urlStr.substring(matchEnd), |
176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | 181 new LinkInfo(pageNum, value, posInAscendingNumbers)); |
177 } | 182 } |
178 } // while there're matches | 183 } // while there're matches |
179 } // extractPageParamCandidatesFromPath | 184 } // extractPageParamCandidatesFromPath |
180 | 185 |
181 /** | 186 /** |
187 * Validates the page pattern according to the current document URL through a pipeline of rules: | |
188 * - for query page parameter, pattern and URL must have same path component s. | |
189 * - for path page parameter, | |
190 * - pattern and URL must have same number of path components. | |
191 * - if only 1 path component, both must have long-enough common prefix an d suffix. | |
192 * - else all pattern's components, except for page parameter, must be sam e as url's. | |
193 * - lastly, pattern's components cannot be calendar digits. | |
194 * | |
195 * Returns true if page pattern is valid. | |
196 * | |
197 * @param docUrl the current document URL | |
198 * @param pagePattern the page pattern to validate | |
199 */ | |
200 static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) { | |
201 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); | |
202 if (pageParamPos == -1) return false; | |
203 | |
204 ParsedUrl patternUrl = ParsedUrl.create(pagePattern); | |
205 | |
206 // If page parameter is a query, page pattern and doc URL must have the same path. | |
207 if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) { | |
208 return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimme dPath()); | |
209 } | |
210 | |
211 final String[] urlPathComponents = docUrl.getPathComponents(); | |
212 final String[] patternPathComponents = patternUrl.getPathComponents(); | |
213 final int urlPathComponentsLen = urlPathComponents.length; | |
214 final int patternPathComponentsLen = patternPathComponents.length; | |
215 | |
216 // If the page param is inside of path components, both the pattern and doc URL must have | |
217 // the similar path. | |
218 if (urlPathComponentsLen > patternPathComponentsLen) return false; | |
cjhopman
2015/03/27 00:16:12
why ">" and not "!="?
kuan
2015/03/31 17:17:50
because pattern can hv more path components than d
| |
219 | |
220 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must | |
221 // be at least half of the entire component in doc URL, e.g doc URL is | |
222 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]". | |
223 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { | |
224 final String urlComponent = urlPathComponents[0]; | |
225 final String patternComponent = patternPathComponents[0]; | |
226 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent); | |
227 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) + | |
228 commonPrefixLen) * 2 >= urlComponent.length(); | |
229 } | |
230 | |
231 // Get index of page parameter. | |
232 int paramIndex = 0; | |
233 for (; paramIndex < patternPathComponentsLen; paramIndex++) { | |
234 if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDE R)) break; | |
235 } | |
236 | |
237 // Except for the component containing the page param, the other compone nts of doc URL must | |
cjhopman
2015/03/27 00:16:12
Can this be extracted to a separate function.
kuan
2015/03/31 17:17:50
Done.
| |
238 // be part of pattern's path. But pattern may have more components, e.g . doc URL is | |
239 // /thread/12 and pattern is /thread/12/page/[*!]. | |
240 boolean passedPageParamComponent = false; | |
241 for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathCompon entsLen; i++, j++) { | |
cjhopman
2015/03/27 00:16:12
I'm not really sure I follow the logic here (and a
cjhopman
2015/03/27 00:18:21
It won't reject that example actually. Still, how
kuan
2015/03/31 17:17:50
this would be invalid - pattern has extra "page" p
cjhopman
2015/04/07 00:45:48
I guess that the behavior doesn't seem to match th
kuan
2015/04/10 22:41:27
i've added ur examples, with explanations, to the
| |
242 if (i == paramIndex && !passedPageParamComponent) { | |
243 passedPageParamComponent = true; | |
244 // Repeat current path component if doc URL has less components (as per comments | |
245 // just above, doc URL may have less components). | |
246 if (urlPathComponentsLen < patternPathComponentsLen) i--; | |
247 continue; | |
248 } | |
249 | |
250 if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j]) ) return false; | |
251 } | |
252 | |
253 // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which wou ld be a | |
254 // false-positive. | |
255 if (paramIndex >= 2 && | |
cjhopman
2015/03/27 00:16:12
Extract this to another function
kuan
2015/03/31 17:17:50
Done.
| |
256 // Only if param is the entire path component. This handles som e cases erroneously | |
257 // considered false-positives e.g. first page is | |
258 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467.html, | |
cjhopman
2015/03/27 00:16:12
why do we require that it be ordered yyyy/mm/dd fo
kuan
2015/03/31 17:17:50
i would think so. how else do we detect calendar
| |
259 // and second page is | |
260 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467_Page2.html, | |
261 // would be considered false-positives otherwise because of "201 4" and "07". | |
262 patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHO LDER_LEN) { | |
263 int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1 ]); | |
264 if (month > 0 && month <= 12) { | |
265 int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]); | |
266 if (year > 1970 && year < 3000) return false; | |
267 } | |
268 } | |
269 | |
270 return true; | |
271 } // isPagePatternValid | |
272 | |
273 /** | |
274 * Evaluates if the given list of LinkInfo's is a list of paging URLs: | |
275 * - page numbers in list of LinkInfo's must be adjacent | |
276 * - page numbers in list of ascending numbers must either | |
277 * - be consecutive and form a page number sequence, or | |
278 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b | |
279 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must | |
280 * match page pattern, and the only outlink must be 2nd or 3rd page. | |
281 * | |
282 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. | |
283 * | |
284 * @param allLinkInfo the list of LinkInfo's to evaluate | |
285 * @param pagePattern the URL pattern to use | |
286 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
287 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 | |
288 */ | |
289 private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkI nfo> allLinkInfo, | |
290 List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) { | |
291 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { | |
292 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers); | |
293 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; | |
294 | |
295 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo); | |
296 | |
297 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page | |
298 // number sequence. | |
299 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null; | |
300 if (!isPageNumberSeq(ascendingNumbers)) return null; | |
301 PageParamInfo pageParamInfo = new PageParamInfo(); | |
302 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
303 pageParamInfo.mFormula = linearFormula; | |
304 for (LinkInfo link : allLinkInfo) { | |
305 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum, | |
306 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); | |
307 } | |
308 return pageParamInfo; | |
309 } | |
310 | |
311 // Most of news article have no more than 3 pages and the first page pro bably doesn't have | |
312 // any page parameter. If the first page url matches the the page patte rn, we treat it as | |
313 // the first page of this pattern. | |
314 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { | |
315 final LinkInfo onlyLink = allLinkInfo.get(0); | |
316 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && | |
317 onlyLink.mPosInAscendingList == 1; | |
318 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && | |
319 onlyLink.mPosInAscendingList == 2 && | |
320 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3 | |
321 // elements; check if previous element is previous page. | |
322 ascendingNumbers.get(1).mPageNum == 2; | |
323 // 1 LinkInfo means ascendingNumbers has >= 1 element. | |
324 if (ascendingNumbers.get(0).mPageNum == 1 && | |
325 (secondPageIsOutlink || thirdPageIsOutlink) && | |
326 isPagingUrl(firstPageUrl, pagePattern)) { | |
327 // Has valid PageParamInfo, create and populate it. | |
328 PageParamInfo pageParamInfo = new PageParamInfo(); | |
329 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
330 int coefficient; | |
331 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; | |
332 if (delta == 0 || delta == 1) { | |
333 coefficient = 1; | |
334 } else { | |
335 coefficient = onlyLink.mPageParamValue; | |
336 delta = 0; | |
337 } | |
338 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta); | |
339 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl)); | |
340 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum, | |
341 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) ); | |
342 return pageParamInfo; | |
343 } | |
344 } | |
345 | |
346 return null; | |
347 } // getPageParamInfo | |
348 | |
349 /** | |
182 * Returns true if given name is backlisted as a known bad page param name. | 350 * Returns true if given name is backlisted as a known bad page param name. |
183 */ | 351 */ |
184 private static boolean isPageParamNameBad(String name) { | 352 private static boolean isPageParamNameBad(String name) { |
185 initBadPageParamNames(); | 353 initBadPageParamNames(); |
186 return sBadPageParamNames.contains(name.toLowerCase()); | 354 return sBadPageParamNames.contains(name.toLowerCase()); |
187 } // isPageParamNameBad | 355 } // isPageParamNameBad |
188 | 356 |
189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | 357 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | 358 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. |
191 | 359 |
192 /** | 360 /** |
193 * Returns true if: | 361 * Returns true if: |
194 * - the digitStart to digitEnd of urlStr is the last path component, and | 362 * - the digitStart to digitEnd of urlStr is the last path component, and |
195 * - the entire path component is numeric, and | 363 * - the entire path component is numeric, and |
196 * - the previous path component is a bad page param name. | 364 * - the previous path component is a bad page param name. |
197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad | 365 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad |
198 * page param. | 366 * page param. |
199 */ | 367 */ |
200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 368 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart, |
201 int digitStart, int digitEnd) { | 369 int digitEnd) { |
202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | 370 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. |
203 pathStart < digitStart - 1) { // Not the first path component. | 371 pathStart < digitStart - 1) { // Not the first path component. |
204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | 372 String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
205 // Checks that this is the last path component, and trailing charact ers, if available, | 373 // Checks that this is the last path component, and trailing charact ers, if available, |
206 // are (s)htm(l) extensions. | 374 // are (s)htm(l) extensions. |
207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); | 375 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); |
208 if (sExtRegExp.test(postMatch)) { | 376 if (sExtRegExp.test(postMatch)) { |
209 // Entire component is numeric, get previous path component. | 377 // Entire component is numeric, get previous path component. |
210 if (sLastPathComponentRegExp == null) { | 378 if (sLastPathComponentRegExp == null) { |
211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; | 379 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; |
212 } | 380 } |
213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | 381 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
214 urlStr.substring(pathStart + 1, digitStart)); | 382 urlStr.substring(pathStart + 1, digitStart)); |
215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && | 383 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && |
216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | 384 isPageParamNameBad(prevPathComponent.getGroup(1))) { |
217 return true; | 385 return true; |
218 } | 386 } |
219 } // last numeric path component | 387 } // last numeric path component |
220 } | 388 } |
221 | 389 |
222 return false; | 390 return false; |
223 } // isLastNumericPathComponentBad | 391 } // isLastNumericPathComponentBad |
224 | 392 |
393 private static int getLongestCommonPrefixLength(String str1, String str2) { | |
394 if (str1.isEmpty() || str2.isEmpty()) return 0; | |
395 | |
396 int limit = Math.min(str1.length(), str2.length()); | |
397 int i = 0; | |
398 for (; i < limit; i++) { | |
399 if (str1.charAt(i) != str2.charAt(i)) break; | |
400 } | |
401 return i; | |
402 } // getLongestCommonPrefixLength | |
cjhopman
2015/03/27 00:16:12
Let's remove all these comments marking what funct
kuan
2015/03/31 17:17:50
Done.
| |
403 | |
404 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) { | |
405 int commonSuffixLen = 0; | |
406 for (int i = str1.length() - 1, j = str2.length() - 1; | |
407 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { | |
408 if (str1.charAt(i) != str2.charAt(i)) break; | |
409 } | |
410 return commonSuffixLen; | |
411 } // getLongestCommonSuffixLength | |
412 | |
413 /** | |
414 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of | |
415 * PageParamInfo.PageInfo's are consecutive. | |
416 * | |
417 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated | |
418 * by at most 1 plain text number which must represent the current page numb er in one of the | |
419 * PageParamInfo.PageInfo's. | |
420 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list | |
421 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are | |
422 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20..."). | |
423 * | |
424 * Returns a int value that is a combination of bits: | |
425 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent | |
426 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive. | |
427 * | |
428 * @param allLinkInfo the list of LinkInfo's to evaluate | |
429 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
430 */ | |
431 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, | |
432 List<PageParamInfo.PageInfo> ascendingNumbers) { | |
433 int result = 0; | |
434 | |
435 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is | |
436 // current page number respresented in plain text. | |
437 int firstPos = -1; | |
438 int lastPos = -1; | |
439 int gapPos = -1; | |
440 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique. | |
441 for (LinkInfo linkInfo : allLinkInfo) { | |
442 final int currPos = linkInfo.mPosInAscendingList; | |
443 if (lastPos == -1) { | |
444 firstPos = currPos; | |
445 } else if (currPos != lastPos + 1) { | |
446 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 | |
447 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not | |
448 // adjacent. | |
449 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result; | |
450 gapPos = currPos - 1; | |
451 } | |
452 // Make sure page param value, i.e. page number represented in plain text, is unique. | |
453 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; | |
454 lastPos = currPos; | |
455 } // for all LinkInfo's | |
456 | |
457 result |= PAGE_NUM_ADJACENT_MASK; | |
458 | |
459 // Now, determine if page numbers in ascendingNumbers are consecutive. | |
460 | |
461 // First, handle the gap. | |
462 if (gapPos != -1) { | |
463 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt; | |
464 // The "gap" should represent current page number in plain text. | |
465 // Check if its adjacent page numbers are consecutive. | |
466 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. | |
467 // This can eliminate links affecting the number of items on a page. | |
468 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; | |
469 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && | |
470 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { | |
471 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
472 } | |
473 return result; | |
474 } | |
475 | |
476 // There is no gap. Check if at least one of the following cases is sat isfied: | |
477 // Case #1: "[1] [2] ..." or "1 [2] ... ". | |
478 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && | |
479 ascendingNumbers.get(1).mPageNum == 2) { | |
480 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
481 } | |
482 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. | |
483 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && | |
484 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) { | |
485 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
486 } | |
487 // Case #3: "... [n-1] [n]" or "... [n - 1] n". | |
488 final int numbersSize = ascendingNumbers.size(); | |
489 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && | |
490 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == | |
491 ascendingNumbers.get(numbersSize - 1).mPageNum) { | |
492 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
493 } | |
494 // Case #4: "... [i-1] [i] [i+1] ...". | |
495 for (int i = firstPos + 1; i < lastPos; i++) { | |
496 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) { | |
497 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
498 } | |
499 } | |
500 | |
501 // Otherwise, there's no pair of consecutive values. | |
502 return result; | |
503 } // arePageNumsAdjacentAndConsecutive | |
504 | |
505 /** | |
506 * | |
507 * Determines if the list of LinkInfo's form a linear formula: | |
508 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0). | |
cjhopman
2015/03/27 00:16:11
Do we really need this complicated linear formula?
kuan
2015/03/31 17:17:50
it's true we don't really care about the actual va
cjhopman
2015/04/07 00:45:48
i just want you to be sure if it's necessary or un
kuan
2015/04/10 22:41:27
i'm wary of removing it now, including the non-1 c
kuan
2015/04/13 17:21:38
to clarify the example above, the pagination URLs
| |
509 * | |
510 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2 | |
511 * LinkInfo's, and then validated against the remaining LinkInfo's. | |
512 * The order of page numbers doesn't matter. | |
513 * | |
514 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta , if the page | |
515 * parameter forumla could be determined. Otherwise, returns null. | |
516 * | |
517 * @param allLinkInfo the list of LinkInfo's to evaluate | |
518 */ | |
519 private static PageParamInfo.LinearFormula getPageParamLinearFormula( | |
520 List<LinkInfo> allLinkInfo) { | |
521 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; | |
522 | |
523 final LinkInfo firstLink = allLinkInfo.get(0); | |
524 final LinkInfo secondLink = allLinkInfo.get(1); | |
525 | |
526 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) { | |
527 return null; | |
528 } | |
529 | |
530 int deltaX = secondLink.mPageNum - firstLink.mPageNum; | |
531 if (deltaX == 0) return null; | |
532 | |
533 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; | |
534 int coefficient = deltaY / deltaX; | |
535 if (coefficient == 0) return null; | |
536 | |
537 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ; | |
538 if (delta != 0 && delta != -coefficient) return null; | |
539 | |
540 // Check if the remaining elements are on the same linear map. | |
541 for (int i = 2; i < allLinkInfo.size(); i++) { | |
542 final LinkInfo link = allLinkInfo.get(i); | |
543 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null; | |
544 } | |
545 | |
546 return new PageParamInfo.LinearFormula(coefficient, delta); | |
547 } // getPageParamLinearFormula | |
548 | |
549 /** | |
550 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on | |
551 * a pipeline of rules: | |
552 * - first PageInfo must have a URL unless it is the first page | |
553 * - there's only one plain number without URL in list | |
554 * - if only two pages, they must be siblings | |
cjhopman
2015/03/27 00:16:12
what's a sibling?
kuan
2015/03/31 17:17:50
Done.
| |
555 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be | |
556 * head/tail or have URLs. | |
557 * | |
558 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
559 */ | |
560 private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendin gNumbers) { | |
cjhopman
2015/03/27 00:16:12
Try to avoid abbreviations in function names: s/Se
kuan
2015/03/31 17:17:50
Done.
| |
561 if (ascendingNumbers.size() <= 1) return false; | |
562 | |
563 // The first one must have a URL unless it is the first page. | |
564 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); | |
565 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; | |
566 | |
567 // There's only one plain number without URL in ascending numbers group. | |
568 boolean hasPlainNum = false; | |
569 for (PageParamInfo.PageInfo page : ascendingNumbers) { | |
570 if (page.mUrl.isEmpty()) { | |
571 if (hasPlainNum) return false; | |
572 hasPlainNum = true; | |
573 } | |
574 } | |
575 | |
576 // If there are only two pages, they must be siblings. | |
577 if (ascendingNumbers.size() == 2) { | |
578 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; | |
579 } | |
580 | |
581 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e. | |
582 for (int i = 1; i < ascendingNumbers.size(); i++) { | |
583 // If two adjacent numbers are not consecutive, we accept them only when: | |
584 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n]. | |
585 // 2) both of them have URLs. | |
586 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); | |
587 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); | |
588 if (currPage.mPageNum - prevPage.mPageNum != 1) { | |
589 if (i != 1 && i != ascendingNumbers.size() - 1) return false; | |
590 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse; | |
591 } | |
592 } | |
593 | |
594 return true; | |
595 } // isPageNumberSeq | |
596 | |
597 private static RegExp sSlashExtRegExp = null; // Match either '/' or ".htm( l)". | |
cjhopman
2015/03/27 00:16:12
This name needs to be more descriptive.
kuan
2015/03/31 17:17:50
Done.
| |
598 | |
599 /** | |
600 * Returns true if a URL matches the generated page pattern based on a pipel ine of rules: | |
601 * - suffix (part of pattern after page param placeholder) must be same, and | |
602 * - for query page parameter, | |
603 * - scheme, host, and path must be same, and | |
604 * - query components, except that for page number, must be same in order and value, and | |
605 * - query value must be a plain number. | |
606 * - for path page parameter that is part of a path component, | |
607 * - if the first different character in path component is suffix, it must be a page parameter | |
608 * separator, followed by the page parameter in the pattern | |
609 * - else if it's page parameter, it and possible following digits must be a plain number. | |
610 * - for path page parameter that is the entire path component, | |
611 * - if URL has no page number param and previous path component, everythi ng else matches, or | |
612 * - if prefix is the same, URL doesn't have anything else | |
613 * - else url must have '/' at the same position as pattern's page paramet er path component, | |
614 * followed by a plain number. | |
615 * | |
616 * @param url the URL to evaluate | |
617 * @param pagePattern the URL page pattern to match with | |
618 */ | |
619 static boolean isPagingUrl(String url, String pagePattern) { | |
620 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); | |
621 if (pageParamPos == -1) return false; | |
622 | |
623 int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1); | |
624 if (queryComponentStartPos == -1) { // Page number is the first query. | |
625 queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1); | |
626 } | |
627 | |
628 final int urlLen = url.length(); | |
629 final int patternLen = pagePattern.length(); | |
630 boolean isDynamicParam = queryComponentStartPos > 0 && | |
631 pagePattern.charAt(pageParamPos - 1) == '='; | |
632 | |
633 // Both url and patterm must have the same suffix, if available. | |
634 int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN; | |
635 if (suffixLen != 0) { | |
636 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'. | |
637 if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen, | |
638 compareLen)) { | |
639 return false; | |
640 } | |
641 } | |
642 | |
643 final int suffixPos = urlLen - suffixLen; | |
644 | |
645 if (isDynamicParam) { | |
646 // If page parameter is dynamic, the url matches the pattern only wh en: | |
647 // 1. has same prefix (scheme, host, path) | |
648 // 2. has same query components with same value (except page numbe r query) in the same | |
649 // order. | |
650 // Examples: | |
651 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&query C=v3 | |
652 // Returns true for: | |
653 // - http://foo.com/a/b/?queryA=v1&queryC=v3 | |
654 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3 | |
655 // Otherwise, returns false. | |
656 // | |
657 // If page pattern is http://foo.com/a/b?page=[*!]&query=a | |
658 // Returns true for: | |
659 // - http://foo.com/a/b?query=a | |
660 // - http://foo.com/a/b?page=2&query=a | |
661 // Otherwise, returns false. | |
662 // | |
663 // If page pattern is http://foo.com/a/b?page=[*!] | |
664 // Returns true for: | |
665 // - http://foo.com/a/b/ | |
666 // - http://foo.com/a/b.html | |
667 // - http://foo.com/a/b.htm | |
668 // - http://foo.com/a/b?page=2 | |
669 // Otherwise, returns false. | |
670 | |
671 // Both url and pattern must have the same prefix. | |
672 if (suffixPos < queryComponentStartPos || | |
673 !url.regionMatches(0, pagePattern, 0, queryComponentStartPos )) { | |
674 return false; | |
675 } | |
676 | |
677 // If the url doesn't have page number query, it is fine. | |
678 if (queryComponentStartPos == suffixPos) return true; | |
679 | |
680 // If the only difference in the page param query component of url and pattern is "/", | |
681 // ".htm" or ".html", it is fine. | |
682 String diffPart = url.substring(queryComponentStartPos, suffixPos).t oLowerCase(); | |
683 if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/| (.html?)$", "i"); | |
684 if (sSlashExtRegExp.test(diffPart)) return true; | |
685 | |
686 // Both url and pattern must have the same query name. | |
687 if (!url.regionMatches(queryComponentStartPos, pagePattern, queryCom ponentStartPos, | |
688 pageParamPos - queryComponentStartPos)) { | |
689 return false; | |
690 } | |
691 | |
692 return isPlainNumber(url.substring(pageParamPos, suffixPos)); | |
693 } // isDynamicParam | |
694 | |
695 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: | |
696 // - www.foo.com/a/abc-2.html | |
697 // - www.foo.com/a/abc.html. | |
698 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: | |
699 // - www.foo.com/a/2/abc.html | |
700 // - www.foo.com/a/abc.html | |
701 // - www.foo.com/abc.html. | |
702 int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPo s); | |
703 if (pageParamPathComponentPos == -1) return false; | |
704 | |
705 // Handle case where page param is part of the path component (as oppose d to being the | |
706 // entire path component). | |
707 if (pagePattern.charAt(pageParamPos - 1) != '/') { | |
708 // The page param path component of both url and pattern must have the same prefix. | |
709 if (urlLen < pageParamPathComponentPos + suffixLen || | |
710 !url.regionMatches(0, pagePattern, 0, pageParamPathComponent Pos)) { | |
711 return false; | |
712 } | |
713 | |
714 // Find the first different character in page param path component just before | |
715 // placeholder or suffix, then check if it's acceptable. | |
716 int firstDiffPos = pageParamPathComponentPos; | |
717 int maxPos = Math.min(pageParamPos, suffixPos); | |
718 for (; firstDiffPos < maxPos; firstDiffPos++) { | |
719 if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos) ) break; | |
720 } | |
721 if (firstDiffPos == suffixPos) { // First different character is th e suffix. | |
722 if (firstDiffPos + 1 == pageParamPos && | |
723 isPageParamSeparator(pagePattern.charAt(firstDiffPos))) { | |
724 return true; | |
725 } | |
726 } else if (firstDiffPos == pageParamPos) { // First different chara cter is page param. | |
727 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true; | |
728 } | |
729 | |
730 return false; | |
731 } // page param is part of the (not entire) path component. | |
732 | |
733 // Handle case where page param is the entire path component. | |
734 int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/', | |
735 pageParamPathComponentPos - 1); | |
736 if (prevPageParamPathComponentPos != -1) { | |
737 // The url doesn't have page number param and previous path component, like | |
738 // www.foo.com/abc.html. | |
739 if (prevPageParamPathComponentPos + suffixLen == urlLen) { | |
740 return url.regionMatches(0, pagePattern, 0, prevPageParamPathCom ponentPos); | |
741 } | |
742 } | |
743 | |
744 // If both url and pattern have the same prefix, url must have nothing e lse. | |
745 if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) { | |
746 int acceptLen = pageParamPathComponentPos + suffixLen; | |
747 // The url doesn't have page number parameter, like www.foo.com/a/abc.html. | |
748 if (acceptLen == urlLen) return true; | |
749 if (acceptLen > urlLen) return false; | |
750 | |
751 // While we are here, the url must have page number param, so the url must have a '/' | |
752 // at the pattern's path component start position. | |
753 if (url.charAt(pageParamPathComponentPos) != '/') return false; | |
754 | |
755 return isPlainNumber(url.substring(pageParamPathComponentPos + 1, su ffixPos)); | |
756 } | |
757 | |
758 return false; | |
759 } // isPagingUrl | |
760 | |
225 /** | 761 /** |
226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in | 762 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
227 * alphabetical order. | 763 * alphabetical order. |
228 */ | 764 */ |
229 private static void initBadPageParamNames() { | 765 private static void initBadPageParamNames() { |
230 if (sBadPageParamNames != null) return; | 766 if (sBadPageParamNames != null) return; |
231 | 767 |
232 sBadPageParamNames = new HashSet<String>(); | 768 sBadPageParamNames = new HashSet<String>(); |
233 sBadPageParamNames.add("baixar-gratis"); | 769 sBadPageParamNames.add("baixar-gratis"); |
234 sBadPageParamNames.add("category"); | 770 sBadPageParamNames.add("category"); |
(...skipping 19 matching lines...) Expand all Loading... | |
254 sBadPageParamNames.add("sortby"); | 790 sBadPageParamNames.add("sortby"); |
255 sBadPageParamNames.add("subscriptions"); | 791 sBadPageParamNames.add("subscriptions"); |
256 sBadPageParamNames.add("tag"); | 792 sBadPageParamNames.add("tag"); |
257 sBadPageParamNames.add("tags"); | 793 sBadPageParamNames.add("tags"); |
258 sBadPageParamNames.add("video"); | 794 sBadPageParamNames.add("video"); |
259 sBadPageParamNames.add("videos"); | 795 sBadPageParamNames.add("videos"); |
260 sBadPageParamNames.add("w"); | 796 sBadPageParamNames.add("w"); |
261 sBadPageParamNames.add("wiki"); | 797 sBadPageParamNames.add("wiki"); |
262 } // initBadPageParamNames | 798 } // initBadPageParamNames |
263 | 799 |
800 /** | |
801 * Returns true if given string can be converted to a number >= 0. | |
802 */ | |
803 private static boolean isPlainNumber(String str) { | |
804 return StringUtil.toNumber(str) >= 0; | |
805 } // isPlainNumber | |
806 | |
807 /** | |
808 * Returns true if given character is one of '-', '_', ';', ','. | |
809 */ | |
810 public static native boolean isPageParamSeparator(Character c) /*-{ | |
811 return /[-_;,]/.test(c); | |
812 }-*/; | |
813 | |
264 } | 814 } |
OLD | NEW |