Chromium Code Reviews| Index: java/org/chromium/distiller/PageParameterDetector.java |
| diff --git a/java/org/chromium/distiller/PageParameterDetector.java b/java/org/chromium/distiller/PageParameterDetector.java |
| index 0bf6dfcb7a57c7d290bb751bcc82e8d6d8f1767c..e3fb9b69b893d9d70ad50410b0489c9e490cd283 100644 |
| --- a/java/org/chromium/distiller/PageParameterDetector.java |
| +++ b/java/org/chromium/distiller/PageParameterDetector.java |
| @@ -52,6 +52,11 @@ import java.util.Set; |
| */ |
| public class PageParameterDetector { |
| private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
| + private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(); |
| + private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
| + |
| + static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; |
| + static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; |
| /** |
| * Stores information about the link (anchor) after the page parameter is detected: |
| @@ -179,6 +184,169 @@ public class PageParameterDetector { |
| } // extractPageParamCandidatesFromPath |
| /** |
| + * Validates the page pattern according to the current document URL through a pipeline of rules: |
| + * - for query page parameter, pattern and URL must have same path components. |
| + * - for path page parameter, |
| + * - pattern must have at least as many path components as URL. |
| + * - if only 1 path component, both must have long-enough common prefix and suffix. |
| + * - else all pattern's components, except for page parameter, must be same as url's. |
| + * - lastly, pattern's components cannot be calendar digits. |
| + * |
| + * Returns true if page pattern is valid. |
| + * |
| + * @param docUrl the current document URL |
| + * @param pagePattern the page pattern to validate |
| + */ |
| + static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) { |
| + int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); |
| + if (pageParamPos == -1) return false; |
| + |
| + ParsedUrl patternUrl = ParsedUrl.create(pagePattern); |
| + |
| + // If page parameter is a query, page pattern and doc URL must have the same path. |
| + if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) { |
| + return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimmedPath()); |
| + } |
| + |
| + final String[] urlPathComponents = docUrl.getPathComponents(); |
| + final String[] patternPathComponents = patternUrl.getPathComponents(); |
| + final int urlPathComponentsLen = urlPathComponents.length; |
| + final int patternPathComponentsLen = patternPathComponents.length; |
| + |
| + // If the page param is inside of path components, both the pattern and doc URL must have |
| + // the similar path. |
| + if (urlPathComponentsLen > patternPathComponentsLen) return false; |
|
cjhopman
2015/03/27 00:16:12
why ">" and not "!="?
kuan
2015/03/31 17:17:50
because pattern can hv more path components than d
|
| + |
| + // If both doc URL and page pattern have only 1 component, their common prefix+suffix must |
| + // be at least half of the entire component in doc URL, e.g doc URL is |
| + // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads-132-[*!]". |
| + if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { |
| + final String urlComponent = urlPathComponents[0]; |
| + final String patternComponent = patternPathComponents[0]; |
| + int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, patternComponent); |
| + return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) + |
| + commonPrefixLen) * 2 >= urlComponent.length(); |
| + } |
| + |
| + // Get index of page parameter. |
| + int paramIndex = 0; |
| + for (; paramIndex < patternPathComponentsLen; paramIndex++) { |
| + if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDER)) break; |
| + } |
| + |
| + // Except for the component containing the page param, the other components of doc URL must |
|
cjhopman
2015/03/27 00:16:12
Can this be extracted to a separate function.
kuan
2015/03/31 17:17:50
Done.
|
| + // be part of pattern's path. But pattern may have more components, e.g. doc URL is |
| + // /thread/12 and pattern is /thread/12/page/[*!]. |
| + boolean passedPageParamComponent = false; |
| + for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathComponentsLen; i++, j++) { |
|
cjhopman
2015/03/27 00:16:12
I'm not really sure I follow the logic here (and a
cjhopman
2015/03/27 00:18:21
It won't reject that example actually. Still, how
kuan
2015/03/31 17:17:50
this would be invalid - pattern has extra "page" p
cjhopman
2015/04/07 00:45:48
I guess that the behavior doesn't seem to match th
kuan
2015/04/10 22:41:27
i've added ur examples, with explanations, to the
|
| + if (i == paramIndex && !passedPageParamComponent) { |
| + passedPageParamComponent = true; |
| + // Repeat current path component if doc URL has less components (as per comments |
| + // just above, doc URL may have less components). |
| + if (urlPathComponentsLen < patternPathComponentsLen) i--; |
| + continue; |
| + } |
| + |
| + if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j])) return false; |
| + } |
| + |
| + // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a |
| + // false-positive. |
| + if (paramIndex >= 2 && |
|
cjhopman
2015/03/27 00:16:12
Extract this to another function
kuan
2015/03/31 17:17:50
Done.
|
| + // Only if param is the entire path component. This handles some cases erroneously |
| + // considered false-positives e.g. first page is |
| + // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467.html, |
|
cjhopman
2015/03/27 00:16:12
why do we require that it be ordered yyyy/mm/dd fo
kuan
2015/03/31 17:17:50
i would think so. how else do we detect calendar
|
| + // and second page is |
| + // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467_Page2.html, |
| + // would be considered false-positives otherwise because of "2014" and "07". |
| + patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHOLDER_LEN) { |
| + int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1]); |
| + if (month > 0 && month <= 12) { |
| + int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]); |
| + if (year > 1970 && year < 3000) return false; |
| + } |
| + } |
| + |
| + return true; |
| + } // isPagePatternValid |
| + |
| + /** |
| + * Evaluates if the given list of LinkInfo's is a list of paging URLs. |
| + * |
| + * If there are at least MIN_LINKS_TO_JUSTIFY_LINEAR_MAP LinkInfo's: |
| + * - page numbers in list of LinkInfo's must be adjacent, |
| + * - page numbers in list of ascending numbers must be consecutive and form a page number |
| + * sequence, and |
| + * - the linear formula (page_parameter = a * page_number + b) computed from the LinkInfo's is |
| + * stored in the result as is, i.e. it is null if no formula could be determined. |
| + * |
| + * If there's only 1 LinkInfo and firstPageUrl is not empty, the first ascending number must be |
| + * page 1, first page URL must match page pattern, and the only outlink must be 2nd or 3rd page. |
| + * |
| + * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. |
| + * |
| + * @param pagePattern the URL pattern to use |
| + * @param allLinkInfo the list of LinkInfo's to evaluate |
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| + * @param firstPageUrl the URL of the PageInfo with mPageNum=1 |
| + */ |
| + private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkInfo> allLinkInfo, |
| + List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) { |
| + if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { |
| + int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendingNumbers); |
| + if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; |
| + |
| + PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormula(allLinkInfo); |
| + |
| + // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecutive and of a page |
| + // number sequence. |
| + if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MASK) return null; |
| + if (!isPageNumberSeq(ascendingNumbers)) return null; |
| + PageParamInfo pageParamInfo = new PageParamInfo(); |
| + pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
| + pageParamInfo.mFormula = linearFormula; |
| + for (LinkInfo link : allLinkInfo) { |
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.mPageNum, |
| + ascendingNumbers.get(link.mPosInAscendingList).mUrl)); |
| + } |
| + return pageParamInfo; |
| + } |
| + |
| + // Most news articles have no more than 3 pages and the first page probably doesn't have |
| + // any page parameter. If the first page url matches the page pattern, we treat it as |
| + // the first page of this pattern. |
| + if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { |
| + final LinkInfo onlyLink = allLinkInfo.get(0); |
| + boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && |
| + onlyLink.mPosInAscendingList == 1; |
| + boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && |
| + onlyLink.mPosInAscendingList == 2 && |
| + // onlyLink's pos is 2 (evaluated right before), so ascendingNumbers has >= 3 |
| + // elements; check if previous element is previous page. |
| + ascendingNumbers.get(1).mPageNum == 2; |
| + // 1 LinkInfo means ascendingNumbers has >= 1 element. |
| + if (ascendingNumbers.get(0).mPageNum == 1 && |
| + (secondPageIsOutlink || thirdPageIsOutlink) && |
| + isPagingUrl(firstPageUrl, pagePattern)) { |
| + // Has valid PageParamInfo, create and populate it. |
| + PageParamInfo pageParamInfo = new PageParamInfo(); |
| + pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
| + int coefficient; |
| + int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; |
| + if (delta == 0 || delta == 1) { |
| + coefficient = 1; |
| + } else { |
| + coefficient = onlyLink.mPageParamValue; |
| + delta = 0; |
| + } |
| + pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coefficient, delta); |
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, firstPageUrl)); |
| + pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLink.mPageNum, |
| + ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl)); |
| + return pageParamInfo; |
| + } |
| + } |
| + |
| + return null; |
| + } // getPageParamInfo |
| + |
| + /** |
| * Returns true if given name is blacklisted as a known bad page param name. |
| */ |
| private static boolean isPageParamNameBad(String name) { |
| @@ -197,8 +365,8 @@ public class PageParameterDetector { |
| * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad |
| * page param. |
| */ |
| - static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, |
| - int digitStart, int digitEnd) { |
| + static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, int digitStart, |
| + int digitEnd) { |
| if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. |
| pathStart < digitStart - 1) { // Not the first path component. |
| String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
| @@ -222,6 +390,374 @@ public class PageParameterDetector { |
| return false; |
| } // isLastNumericPathComponentBad |
| + private static int getLongestCommonPrefixLength(String str1, String str2) { |
| + if (str1.isEmpty() || str2.isEmpty()) return 0; |
| + |
| + int limit = Math.min(str1.length(), str2.length()); |
| + int i = 0; |
| + for (; i < limit; i++) { |
| + if (str1.charAt(i) != str2.charAt(i)) break; |
| + } |
| + return i; |
| + } // getLongestCommonPrefixLength |
|
cjhopman
2015/03/27 00:16:12
Let's remove all these comments marking what funct
kuan
2015/03/31 17:17:50
Done.
|
| + |
| + private static int getLongestCommonSuffixLength(String str1, String str2, int startIndex) { |
| + int commonSuffixLen = 0; |
| + for (int i = str1.length() - 1, j = str2.length() - 1; |
| + i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { |
| + if (str1.charAt(i) != str2.charAt(i)) break; |
| + } |
| + return commonSuffixLen; |
| + } // getLongestCommonSuffixLength |
| + |
| + /** |
| + * Detects if page numbers in list of LinkInfo's are adjacent, and if page numbers in list of |
| + * PageParamInfo.PageInfo's are consecutive. |
| + * |
| + * For adjacency, the page numbers in list of LinkInfo's must either be adjacent, or separated |
| + * by at most 1 plain text number which must represent the current page number in one of the |
| + * PageParamInfo.PageInfo's. |
| + * For consecutiveness, there must be at least one pair of consecutive number values in the list |
| + * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are |
| + * likely to be page size selection links (e.g. in the document "See 1-10, 11-20..."). |
| + * |
| + * Returns an int value that is a combination of bits: |
| + * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent |
| + * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive; it is only |
| + * ever set together with the bit for PAGE_NUM_ADJACENT_MASK. |
| + * |
| + * @param allLinkInfo the list of LinkInfo's to evaluate |
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| + */ |
| + static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, |
| + List<PageParamInfo.PageInfo> ascendingNumbers) { |
| + int result = 0; |
| + |
| + // Check if elements in allLinkInfo are adjacent or there's only 1 gap i.e. the gap is |
| + // current page number represented in plain text. |
| + int firstPos = -1; |
| + int lastPos = -1; |
| + int gapPos = -1; |
| + Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that page param value is unique. |
| + for (LinkInfo linkInfo : allLinkInfo) { |
| + final int currPos = linkInfo.mPosInAscendingList; |
| + if (lastPos == -1) { |
| + firstPos = currPos; |
| + } else if (currPos != lastPos + 1) { |
| + // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 |
| + // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), allLinkInfo is not |
| + // adjacent. |
| + if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1) return result; |
| + gapPos = currPos - 1; |
| + } |
| + // Make sure page param value, i.e. page number represented in plain text, is unique. |
| + if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; |
| + lastPos = currPos; |
| + } // for all LinkInfo's |
| + |
| + result |= PAGE_NUM_ADJACENT_MASK; |
| + |
| + // Now, determine if page numbers in ascendingNumbers are consecutive. |
| + |
| + // First, handle the gap. |
| + if (gapPos != -1) { |
| + if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return result; |
| + // The "gap" should represent current page number in plain text. |
| + // Check if its adjacent page numbers are consecutive. |
| + // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. |
| + // This can eliminate links affecting the number of items on a page. |
| + final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; |
| + if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && |
| + ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { |
| + return result | PAGE_NUM_CONSECUTIVE_MASK; |
| + } |
| + return result; |
| + } |
| + |
| + // There is no gap. Check if at least one of the following cases is satisfied: |
| + // Case #1: "[1] [2] ..." or "1 [2] ... ". |
| + if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && |
| + ascendingNumbers.get(1).mPageNum == 2) { |
| + return result | PAGE_NUM_CONSECUTIVE_MASK; |
| + } |
| + // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. |
| + if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && |
| + ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(0).mUrl.isEmpty()) { |
| + return result | PAGE_NUM_CONSECUTIVE_MASK; |
| + } |
| + // Case #3: "... [n-1] [n]" or "... [n - 1] n". |
| + final int numbersSize = ascendingNumbers.size(); |
| + if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && |
| + ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == |
| + ascendingNumbers.get(numbersSize - 1).mPageNum) { |
| + return result | PAGE_NUM_CONSECUTIVE_MASK; |
| + } |
| + // Case #4: "... [i-1] [i] [i+1] ...". |
| + for (int i = firstPos + 1; i < lastPos; i++) { |
| + if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get(i + 1).mPageNum) { |
| + return result | PAGE_NUM_CONSECUTIVE_MASK; |
| + } |
| + } |
| + |
| + // Otherwise, there's no pair of consecutive values. |
| + return result; |
| + } // arePageNumsAdjacentAndConsecutive |
| + |
| + /** |
| + * |
| + * Determines if the list of LinkInfo's form a linear formula: |
| + * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0). |
|
cjhopman
2015/03/27 00:16:11
Do we really need this complicated linear formula?
kuan
2015/03/31 17:17:50
it's true we don't really care about the actual va
cjhopman
2015/04/07 00:45:48
i just want you to be sure if it's necessary or un
kuan
2015/04/10 22:41:27
i'm wary of removing it now, including the non-1 c
kuan
2015/04/13 17:21:38
to clarify the example above, the pagination URLs
|
| + * |
| + * The coefficient and delta are calculated from the page parameter values and page numbers of 2 |
| + * LinkInfo's, and then validated against the remaining LinkInfo's. |
| + * The order of page numbers doesn't matter. |
| + * |
| + * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page |
| + * parameter formula could be determined. Otherwise, returns null. |
| + * |
| + * @param allLinkInfo the list of LinkInfo's to evaluate |
| + */ |
| + private static PageParamInfo.LinearFormula getPageParamLinearFormula( |
| + List<LinkInfo> allLinkInfo) { |
| + if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; |
| + |
| + final LinkInfo firstLink = allLinkInfo.get(0); |
| + final LinkInfo secondLink = allLinkInfo.get(1); |
| + |
| + if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.mPageNum) > 4) { |
| + return null; |
| + } |
| + |
| + int deltaX = secondLink.mPageNum - firstLink.mPageNum; |
| + if (deltaX == 0) return null; |
| + |
| + int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; |
| + int coefficient = deltaY / deltaX; |
| + if (coefficient == 0) return null; |
| + |
| + int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum; |
| + if (delta != 0 && delta != -coefficient) return null; |
| + |
| + // Check if the remaining elements are on the same linear map. |
| + for (int i = 2; i < allLinkInfo.size(); i++) { |
| + final LinkInfo link = allLinkInfo.get(i); |
| + if (link.mPageParamValue != coefficient * link.mPageNum + delta) return null; |
| + } |
| + |
| + return new PageParamInfo.LinearFormula(coefficient, delta); |
| + } // getPageParamLinearFormula |
| + |
| + /** |
| + * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on |
| + * a pipeline of rules: |
| + * - first PageInfo must have a URL unless it is the first page |
| + * - there's only one plain number without URL in list |
| + * - if only two pages, they must be siblings, i.e. their page numbers differ by exactly 1 |
|
cjhopman
2015/03/27 00:16:12
what's a sibling?
kuan
2015/03/31 17:17:50
Done.
|
| + * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecutive numbers must be |
| + * head/tail or have URLs. |
| + * |
| + * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| + */ |
| + private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendingNumbers) { |
|
cjhopman
2015/03/27 00:16:12
Try to avoid abbreviations in function names: s/Se
kuan
2015/03/31 17:17:50
Done.
|
| + if (ascendingNumbers.size() <= 1) return false; |
| + |
| + // The first one must have a URL unless it is the first page. |
| + final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); |
| + if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; |
| + |
| + // There's only one plain number without URL in ascending numbers group. |
| + boolean hasPlainNum = false; |
| + for (PageParamInfo.PageInfo page : ascendingNumbers) { |
| + if (page.mUrl.isEmpty()) { |
| + if (hasPlainNum) return false; |
| + hasPlainNum = true; |
| + } |
| + } |
| + |
| + // If there are only two pages, they must be siblings. |
| + if (ascendingNumbers.size() == 2) { |
| + return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; |
| + } |
| + |
| + // Check if page numbers in ascendingNumbers are adjacent and consecutive. |
| + for (int i = 1; i < ascendingNumbers.size(); i++) { |
| + // If two adjacent numbers are not consecutive, we accept them only when: |
| + // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n]. |
| + // 2) both of them have URLs. |
| + final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); |
| + final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); |
| + if (currPage.mPageNum - prevPage.mPageNum != 1) { |
| + if (i != 1 && i != ascendingNumbers.size() - 1) return false; |
| + if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return false; |
| + } |
| + } |
| + |
| + return true; |
| + } // isPageNumberSeq |
| + |
| + private static RegExp sSlashExtRegExp = null; // Match either '/' or ".htm(l)". |
|
cjhopman
2015/03/27 00:16:12
This name needs to be more descriptive.
kuan
2015/03/31 17:17:50
Done.
|
| + |
| + /** |
| + * Returns true if a URL matches the generated page pattern based on a pipeline of rules: |
| + * - suffix (part of pattern after page param placeholder) must be same, and |
| + * - for query page parameter, |
| + * - scheme, host, and path must be same, and |
| + * - query components, except that for page number, must be same in order and value, and |
| + * - query value must be a plain number. |
| + * - for path page parameter that is part of a path component, |
| + * - if the first different character in path component is suffix, it must be a page parameter |
| + * separator, followed by the page parameter in the pattern |
| + * - else if it's page parameter, it and possible following digits must be a plain number. |
| + * - for path page parameter that is the entire path component, |
| + * - if URL has no page number param and previous path component, everything else matches, or |
| + * - if prefix is the same, URL doesn't have anything else |
| + * - else url must have '/' at the same position as pattern's page parameter path component, |
| + * followed by a plain number. |
| + * |
| + * @param url the URL to evaluate |
| + * @param pagePattern the URL page pattern to match with |
| + */ |
| + static boolean isPagingUrl(String url, String pagePattern) { |
| + int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); |
| + if (pageParamPos == -1) return false; |
| + |
| + int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1); |
| + if (queryComponentStartPos == -1) { // Page number is the first query. |
| + queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1); |
| + } |
| + |
| + final int urlLen = url.length(); |
| + final int patternLen = pagePattern.length(); |
| + boolean isDynamicParam = queryComponentStartPos > 0 && |
| + pagePattern.charAt(pageParamPos - 1) == '='; |
| + |
| + // Both url and pattern must have the same suffix, if available. |
| + int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN; |
| + if (suffixLen != 0) { |
| + int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'. |
| + if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen, |
| + compareLen)) { |
| + return false; |
| + } |
| + } |
| + |
| + final int suffixPos = urlLen - suffixLen; |
| + |
| + if (isDynamicParam) { |
| + // If page parameter is dynamic, the url matches the pattern only when: |
| + // 1. has same prefix (scheme, host, path) |
| + // 2. has same query components with same value (except page number query) in the same |
| + // order. |
| + // Examples: |
| + // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3 |
| + // Returns true for: |
| + // - http://foo.com/a/b/?queryA=v1&queryC=v3 |
| + // - http://foo.com/a/b/?queryA=v1&queryB=4&queryC=v3 |
| + // Otherwise, returns false. |
| + // |
| + // If page pattern is http://foo.com/a/b?page=[*!]&query=a |
| + // Returns true for: |
| + // - http://foo.com/a/b?query=a |
| + // - http://foo.com/a/b?page=2&query=a |
| + // Otherwise, returns false. |
| + // |
| + // If page pattern is http://foo.com/a/b?page=[*!] |
| + // Returns true for: |
| + // - http://foo.com/a/b/ |
| + // - http://foo.com/a/b.html |
| + // - http://foo.com/a/b.htm |
| + // - http://foo.com/a/b?page=2 |
| + // Otherwise, returns false. |
| + |
| + // Both url and pattern must have the same prefix. |
| + if (suffixPos < queryComponentStartPos || |
| + !url.regionMatches(0, pagePattern, 0, queryComponentStartPos)) { |
| + return false; |
| + } |
| + |
| + // If the url doesn't have page number query, it is fine. |
| + if (queryComponentStartPos == suffixPos) return true; |
| + |
| + // If the only difference in the page param query component of url and pattern is "/", |
| + // ".htm" or ".html", it is fine. |
| + String diffPart = url.substring(queryComponentStartPos, suffixPos).toLowerCase(); |
| + if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/|(.html?)$", "i"); |
| + if (sSlashExtRegExp.test(diffPart)) return true; |
| + |
| + // Both url and pattern must have the same query name. |
| + if (!url.regionMatches(queryComponentStartPos, pagePattern, queryComponentStartPos, |
| + pageParamPos - queryComponentStartPos)) { |
| + return false; |
| + } |
| + |
| + return isPlainNumber(url.substring(pageParamPos, suffixPos)); |
| + } // isDynamicParam |
| + |
| + // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: |
| + // - www.foo.com/a/abc-2.html |
| + // - www.foo.com/a/abc.html. |
| + // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: |
| + // - www.foo.com/a/2/abc.html |
| + // - www.foo.com/a/abc.html |
| + // - www.foo.com/abc.html. |
| + int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPos); |
| + if (pageParamPathComponentPos == -1) return false; |
| + |
| + // Handle case where page param is part of the path component (as opposed to being the |
| + // entire path component). |
| + if (pagePattern.charAt(pageParamPos - 1) != '/') { |
| + // The page param path component of both url and pattern must have the same prefix. |
| + if (urlLen < pageParamPathComponentPos + suffixLen || |
| + !url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) { |
| + return false; |
| + } |
| + |
| + // Find the first different character in page param path component just before |
| + // placeholder or suffix, then check if it's acceptable. |
| + int firstDiffPos = pageParamPathComponentPos; |
| + int maxPos = Math.min(pageParamPos, suffixPos); |
| + for (; firstDiffPos < maxPos; firstDiffPos++) { |
| + if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos)) break; |
| + } |
| + if (firstDiffPos == suffixPos) { // First different character is the suffix. |
| + if (firstDiffPos + 1 == pageParamPos && |
| + isPageParamSeparator(pagePattern.charAt(firstDiffPos))) { |
| + return true; |
| + } |
| + } else if (firstDiffPos == pageParamPos) { // First different character is page param. |
| + if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) return true; |
| + } |
| + |
| + return false; |
| + } // page param is part of the (not entire) path component. |
| + |
| + // Handle case where page param is the entire path component. |
| + int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/', |
| + pageParamPathComponentPos - 1); |
| + if (prevPageParamPathComponentPos != -1) { |
| + // The url doesn't have page number param and previous path component, like |
| + // www.foo.com/abc.html. |
| + if (prevPageParamPathComponentPos + suffixLen == urlLen) { |
| + return url.regionMatches(0, pagePattern, 0, prevPageParamPathComponentPos); |
| + } |
| + } |
| + |
| + // If both url and pattern have the same prefix, url must have nothing else. |
| + if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) { |
| + int acceptLen = pageParamPathComponentPos + suffixLen; |
| + // The url doesn't have page number parameter, like www.foo.com/a/abc.html. |
| + if (acceptLen == urlLen) return true; |
| + if (acceptLen > urlLen) return false; |
| + |
| + // While we are here, the url must have page number param, so the url must have a '/' |
| + // at the pattern's path component start position. |
| + if (url.charAt(pageParamPathComponentPos) != '/') return false; |
| + |
| + return isPlainNumber(url.substring(pageParamPathComponentPos + 1, suffixPos)); |
| + } |
| + |
| + return false; |
| + } // isPagingUrl |
| + |
| /** |
| * If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
| * alphabetical order. |
| @@ -261,4 +797,18 @@ public class PageParameterDetector { |
| sBadPageParamNames.add("wiki"); |
| } // initBadPageParamNames |
| + /** |
| + * Returns true if given string can be converted to a number >= 0, i.e. if |
| + * StringUtil.toNumber(str) returns a non-negative value. |
| + * NOTE(review): presumably StringUtil.toNumber returns a negative value for a string that is |
| + * not a number - confirm against StringUtil. |
| + * |
| + * @param str the string to check |
| + */ |
| + private static boolean isPlainNumber(String str) { |
| + return StringUtil.toNumber(str) >= 0; |
| + } // isPlainNumber |
| + |
| + /** |
| + * Returns true if given character is one of '-', '_', ';', ','. |
| + * |
| + * This is a native method: the check is the JavaScript regular expression test in the |
| + * special comment that follows the signature. |
| + * |
| + * @param c the character to test |
| + */ |
| + public static native boolean isPageParamSeparator(Character c) /*-{ |
| + return /[-_;,]/.test(c); |
| + }-*/; |
| + |
| } |