Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
| 8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
| 9 | 9 |
| 10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
| (...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 45 * <a href=http://a/b?c=1&p=30>4</a> | 45 * <a href=http://a/b?c=1&p=30>4</a> |
| 46 * <a href=http://a/b?c=1&p=40>5</a> | 46 * <a href=http://a/b?c=1&p=40>5</a> |
| 47 * <a href=http://a/b?c=1&p=all>single page</a> | 47 * <a href=http://a/b?c=1&p=all>single page</a> |
| 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so | 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
| 49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!]. | 49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!]. |
| 50 * Then, this class extracts the single page based on page parameter info. The single page url is | 50 * Then, this class extracts the single page based on page parameter info. The single page url is |
| 51 * http://a/b?c=1&p=all. | 51 * http://a/b?c=1&p=all. |
| 52 */ | 52 */ |
| 53 public class PageParameterDetector { | 53 public class PageParameterDetector { |
// Placeholder that stands in for the numeric page parameter component inside a URL page pattern.
private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
// Cached length of PAGE_PARAM_PLACEHOLDER.
private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length();
// Minimum number of LinkInfo's required before a linear page parameter formula is derived.
private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;

// Bits of the int returned by arePageNumsAdjacentAndConsecutive().
static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;
static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;
| 55 | 60 |
| 56 /** | 61 /** |
| 57 * Stores information about the link (anchor) after the page parameter is detected: | 62 * Stores information about the link (anchor) after the page parameter is detected: |
| 58 * - the page number (as represented by the original plain text) for the link | 63 * - the page number (as represented by the original plain text) for the link |
| 59 * - the original page parameter numeric component in the URL (this component would be replaced | 64 * - the original page parameter numeric component in the URL (this component would be replaced |
| 60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | 65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
| 61 * - the position of this link in the list of ascending numbers. | 66 * - the position of this link in the list of ascending numbers. |
| 62 */ | 67 */ |
| 63 static class LinkInfo { | 68 static class LinkInfo { |
| 64 private int mPageNum; | 69 private int mPageNum; |
| (...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); | 177 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); |
| 173 if (value >= 0) { | 178 if (value >= 0) { |
| 174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + | 179 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + |
| 175 urlStr.substring(matchEnd), | 180 urlStr.substring(matchEnd), |
| 176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | 181 new LinkInfo(pageNum, value, posInAscendingNumbers)); |
| 177 } | 182 } |
| 178 } // while there're matches | 183 } // while there're matches |
| 179 } // extractPageParamCandidatesFromPath | 184 } // extractPageParamCandidatesFromPath |
| 180 | 185 |
| 181 /** | 186 /** |
| 187 * Validates the page pattern according to the current document URL through a pipeline of rules: | |
| 188 * - for query page parameter, pattern and URL must have same path component s. | |
| 189 * - for path page parameter, | |
| 190 * - pattern and URL must have same number of path components. | |
| 191 * - if only 1 path component, both must have long-enough common prefix an d suffix. | |
| 192 * - else all pattern's components, except for page parameter, must be sam e as url's. | |
| 193 * - lastly, pattern's components cannot be calendar digits. | |
| 194 * | |
| 195 * Returns true if page pattern is valid. | |
| 196 * | |
| 197 * @param docUrl the current document URL | |
| 198 * @param pagePattern the page pattern to validate | |
| 199 */ | |
| 200 static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) { | |
| 201 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); | |
| 202 if (pageParamPos == -1) return false; | |
| 203 | |
| 204 ParsedUrl patternUrl = ParsedUrl.create(pagePattern); | |
| 205 | |
| 206 // If page parameter is a query, page pattern and doc URL must have the same path. | |
| 207 if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) { | |
| 208 return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimme dPath()); | |
| 209 } | |
| 210 | |
| 211 final String[] urlPathComponents = docUrl.getPathComponents(); | |
| 212 final String[] patternPathComponents = patternUrl.getPathComponents(); | |
| 213 final int urlPathComponentsLen = urlPathComponents.length; | |
| 214 final int patternPathComponentsLen = patternPathComponents.length; | |
| 215 | |
| 216 // If the page param is inside of path components, both the pattern and doc URL must have | |
| 217 // the similar path. | |
| 218 if (urlPathComponentsLen > patternPathComponentsLen) return false; | |
|
cjhopman
2015/03/27 00:16:12
why ">" and not "!="?
kuan
2015/03/31 17:17:50
because pattern can hv more path components than d
| |
| 219 | |
| 220 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must | |
| 221 // be at least half of the entire component in doc URL, e.g doc URL is | |
| 222 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]". | |
| 223 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { | |
| 224 final String urlComponent = urlPathComponents[0]; | |
| 225 final String patternComponent = patternPathComponents[0]; | |
| 226 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent); | |
| 227 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) + | |
| 228 commonPrefixLen) * 2 >= urlComponent.length(); | |
| 229 } | |
| 230 | |
| 231 // Get index of page parameter. | |
| 232 int paramIndex = 0; | |
| 233 for (; paramIndex < patternPathComponentsLen; paramIndex++) { | |
| 234 if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDE R)) break; | |
| 235 } | |
| 236 | |
| 237 // Except for the component containing the page param, the other compone nts of doc URL must | |
|
cjhopman
2015/03/27 00:16:12
Can this be extracted to a separate function.
kuan
2015/03/31 17:17:50
Done.
| |
| 238 // be part of pattern's path. But pattern may have more components, e.g . doc URL is | |
| 239 // /thread/12 and pattern is /thread/12/page/[*!]. | |
| 240 boolean passedPageParamComponent = false; | |
| 241 for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathCompon entsLen; i++, j++) { | |
|
cjhopman
2015/03/27 00:16:12
I'm not really sure I follow the logic here (and a
cjhopman
2015/03/27 00:18:21
It won't reject that example actually. Still, how
kuan
2015/03/31 17:17:50
this would be invalid - pattern has extra "page" p
cjhopman
2015/04/07 00:45:48
I guess that the behavior doesn't seem to match th
kuan
2015/04/10 22:41:27
i've added ur examples, with explanations, to the
| |
| 242 if (i == paramIndex && !passedPageParamComponent) { | |
| 243 passedPageParamComponent = true; | |
| 244 // Repeat current path component if doc URL has less components (as per comments | |
| 245 // just above, doc URL may have less components). | |
| 246 if (urlPathComponentsLen < patternPathComponentsLen) i--; | |
| 247 continue; | |
| 248 } | |
| 249 | |
| 250 if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j]) ) return false; | |
| 251 } | |
| 252 | |
| 253 // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which wou ld be a | |
| 254 // false-positive. | |
| 255 if (paramIndex >= 2 && | |
|
cjhopman
2015/03/27 00:16:12
Extract this to another function
kuan
2015/03/31 17:17:50
Done.
| |
| 256 // Only if param is the entire path component. This handles som e cases erroneously | |
| 257 // considered false-positives e.g. first page is | |
| 258 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467.html, | |
|
cjhopman
2015/03/27 00:16:12
why do we require that it be ordered yyyy/mm/dd fo
kuan
2015/03/31 17:17:50
i would think so. how else do we detect calendar
| |
| 259 // and second page is | |
| 260 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467_Page2.html, | |
| 261 // would be considered false-positives otherwise because of "201 4" and "07". | |
| 262 patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHO LDER_LEN) { | |
| 263 int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1 ]); | |
| 264 if (month > 0 && month <= 12) { | |
| 265 int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]); | |
| 266 if (year > 1970 && year < 3000) return false; | |
| 267 } | |
| 268 } | |
| 269 | |
| 270 return true; | |
| 271 } // isPagePatternValid | |
| 272 | |
| 273 /** | |
| 274 * Evaluates if the given list of LinkInfo's is a list of paging URLs: | |
| 275 * - page numbers in list of LinkInfo's must be adjacent | |
| 276 * - page numbers in list of ascending numbers must either | |
| 277 * - be consecutive and form a page number sequence, or | |
| 278 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b | |
| 279 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must | |
| 280 * match page pattern, and the only outlink must be 2nd or 3rd page. | |
| 281 * | |
| 282 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. | |
| 283 * | |
| 284 * @param allLinkInfo the list of LinkInfo's to evaluate | |
| 285 * @param pagePattern the URL pattern to use | |
| 286 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
| 287 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 | |
| 288 */ | |
| 289 private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkI nfo> allLinkInfo, | |
| 290 List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) { | |
| 291 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { | |
| 292 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers); | |
| 293 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; | |
| 294 | |
| 295 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo); | |
| 296 | |
| 297 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page | |
| 298 // number sequence. | |
| 299 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null; | |
| 300 if (!isPageNumberSeq(ascendingNumbers)) return null; | |
| 301 PageParamInfo pageParamInfo = new PageParamInfo(); | |
| 302 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
| 303 pageParamInfo.mFormula = linearFormula; | |
| 304 for (LinkInfo link : allLinkInfo) { | |
| 305 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum, | |
| 306 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); | |
| 307 } | |
| 308 return pageParamInfo; | |
| 309 } | |
| 310 | |
| 311 // Most of news article have no more than 3 pages and the first page pro bably doesn't have | |
| 312 // any page parameter. If the first page url matches the the page patte rn, we treat it as | |
| 313 // the first page of this pattern. | |
| 314 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { | |
| 315 final LinkInfo onlyLink = allLinkInfo.get(0); | |
| 316 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && | |
| 317 onlyLink.mPosInAscendingList == 1; | |
| 318 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && | |
| 319 onlyLink.mPosInAscendingList == 2 && | |
| 320 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3 | |
| 321 // elements; check if previous element is previous page. | |
| 322 ascendingNumbers.get(1).mPageNum == 2; | |
| 323 // 1 LinkInfo means ascendingNumbers has >= 1 element. | |
| 324 if (ascendingNumbers.get(0).mPageNum == 1 && | |
| 325 (secondPageIsOutlink || thirdPageIsOutlink) && | |
| 326 isPagingUrl(firstPageUrl, pagePattern)) { | |
| 327 // Has valid PageParamInfo, create and populate it. | |
| 328 PageParamInfo pageParamInfo = new PageParamInfo(); | |
| 329 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
| 330 int coefficient; | |
| 331 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; | |
| 332 if (delta == 0 || delta == 1) { | |
| 333 coefficient = 1; | |
| 334 } else { | |
| 335 coefficient = onlyLink.mPageParamValue; | |
| 336 delta = 0; | |
| 337 } | |
| 338 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta); | |
| 339 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl)); | |
| 340 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum, | |
| 341 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) ); | |
| 342 return pageParamInfo; | |
| 343 } | |
| 344 } | |
| 345 | |
| 346 return null; | |
| 347 } // getPageParamInfo | |
| 348 | |
| 349 /** | |
| 182 * Returns true if given name is backlisted as a known bad page param name. | 350 * Returns true if given name is backlisted as a known bad page param name. |
| 183 */ | 351 */ |
| 184 private static boolean isPageParamNameBad(String name) { | 352 private static boolean isPageParamNameBad(String name) { |
| 185 initBadPageParamNames(); | 353 initBadPageParamNames(); |
| 186 return sBadPageParamNames.contains(name.toLowerCase()); | 354 return sBadPageParamNames.contains(name.toLowerCase()); |
| 187 } // isPageParamNameBad | 355 } // isPageParamNameBad |
| 188 | 356 |
| 189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | 357 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
| 190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | 358 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. |
| 191 | 359 |
| 192 /** | 360 /** |
| 193 * Returns true if: | 361 * Returns true if: |
| 194 * - the digitStart to digitEnd of urlStr is the last path component, and | 362 * - the digitStart to digitEnd of urlStr is the last path component, and |
| 195 * - the entire path component is numeric, and | 363 * - the entire path component is numeric, and |
| 196 * - the previous path component is a bad page param name. | 364 * - the previous path component is a bad page param name. |
| 197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad | 365 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad |
| 198 * page param. | 366 * page param. |
| 199 */ | 367 */ |
| 200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 368 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart, |
| 201 int digitStart, int digitEnd) { | 369 int digitEnd) { |
| 202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | 370 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. |
| 203 pathStart < digitStart - 1) { // Not the first path component. | 371 pathStart < digitStart - 1) { // Not the first path component. |
| 204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | 372 String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
| 205 // Checks that this is the last path component, and trailing charact ers, if available, | 373 // Checks that this is the last path component, and trailing charact ers, if available, |
| 206 // are (s)htm(l) extensions. | 374 // are (s)htm(l) extensions. |
| 207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); | 375 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); |
| 208 if (sExtRegExp.test(postMatch)) { | 376 if (sExtRegExp.test(postMatch)) { |
| 209 // Entire component is numeric, get previous path component. | 377 // Entire component is numeric, get previous path component. |
| 210 if (sLastPathComponentRegExp == null) { | 378 if (sLastPathComponentRegExp == null) { |
| 211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; | 379 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; |
| 212 } | 380 } |
| 213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | 381 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
| 214 urlStr.substring(pathStart + 1, digitStart)); | 382 urlStr.substring(pathStart + 1, digitStart)); |
| 215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && | 383 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && |
| 216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | 384 isPageParamNameBad(prevPathComponent.getGroup(1))) { |
| 217 return true; | 385 return true; |
| 218 } | 386 } |
| 219 } // last numeric path component | 387 } // last numeric path component |
| 220 } | 388 } |
| 221 | 389 |
| 222 return false; | 390 return false; |
| 223 } // isLastNumericPathComponentBad | 391 } // isLastNumericPathComponentBad |
| 224 | 392 |
| 393 private static int getLongestCommonPrefixLength(String str1, String str2) { | |
| 394 if (str1.isEmpty() || str2.isEmpty()) return 0; | |
| 395 | |
| 396 int limit = Math.min(str1.length(), str2.length()); | |
| 397 int i = 0; | |
| 398 for (; i < limit; i++) { | |
| 399 if (str1.charAt(i) != str2.charAt(i)) break; | |
| 400 } | |
| 401 return i; | |
| 402 } // getLongestCommonPrefixLength | |
|
cjhopman
2015/03/27 00:16:12
Let's remove all these comments marking what funct
kuan
2015/03/31 17:17:50
Done.
| |
| 403 | |
| 404 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) { | |
| 405 int commonSuffixLen = 0; | |
| 406 for (int i = str1.length() - 1, j = str2.length() - 1; | |
| 407 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { | |
| 408 if (str1.charAt(i) != str2.charAt(i)) break; | |
| 409 } | |
| 410 return commonSuffixLen; | |
| 411 } // getLongestCommonSuffixLength | |
| 412 | |
| 413 /** | |
| 414 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of | |
| 415 * PageParamInfo.PageInfo's are consecutive. | |
| 416 * | |
| 417 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated | |
| 418 * by at most 1 plain text number which must represent the current page numb er in one of the | |
| 419 * PageParamInfo.PageInfo's. | |
| 420 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list | |
| 421 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are | |
| 422 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20..."). | |
| 423 * | |
| 424 * Returns a int value that is a combination of bits: | |
| 425 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent | |
| 426 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive. | |
| 427 * | |
| 428 * @param allLinkInfo the list of LinkInfo's to evaluate | |
| 429 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
| 430 */ | |
| 431 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, | |
| 432 List<PageParamInfo.PageInfo> ascendingNumbers) { | |
| 433 int result = 0; | |
| 434 | |
| 435 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is | |
| 436 // current page number respresented in plain text. | |
| 437 int firstPos = -1; | |
| 438 int lastPos = -1; | |
| 439 int gapPos = -1; | |
| 440 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique. | |
| 441 for (LinkInfo linkInfo : allLinkInfo) { | |
| 442 final int currPos = linkInfo.mPosInAscendingList; | |
| 443 if (lastPos == -1) { | |
| 444 firstPos = currPos; | |
| 445 } else if (currPos != lastPos + 1) { | |
| 446 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 | |
| 447 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not | |
| 448 // adjacent. | |
| 449 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result; | |
| 450 gapPos = currPos - 1; | |
| 451 } | |
| 452 // Make sure page param value, i.e. page number represented in plain text, is unique. | |
| 453 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; | |
| 454 lastPos = currPos; | |
| 455 } // for all LinkInfo's | |
| 456 | |
| 457 result |= PAGE_NUM_ADJACENT_MASK; | |
| 458 | |
| 459 // Now, determine if page numbers in ascendingNumbers are consecutive. | |
| 460 | |
| 461 // First, handle the gap. | |
| 462 if (gapPos != -1) { | |
| 463 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt; | |
| 464 // The "gap" should represent current page number in plain text. | |
| 465 // Check if its adjacent page numbers are consecutive. | |
| 466 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. | |
| 467 // This can eliminate links affecting the number of items on a page. | |
| 468 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; | |
| 469 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && | |
| 470 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { | |
| 471 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 472 } | |
| 473 return result; | |
| 474 } | |
| 475 | |
| 476 // There is no gap. Check if at least one of the following cases is sat isfied: | |
| 477 // Case #1: "[1] [2] ..." or "1 [2] ... ". | |
| 478 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && | |
| 479 ascendingNumbers.get(1).mPageNum == 2) { | |
| 480 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 481 } | |
| 482 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. | |
| 483 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && | |
| 484 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) { | |
| 485 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 486 } | |
| 487 // Case #3: "... [n-1] [n]" or "... [n - 1] n". | |
| 488 final int numbersSize = ascendingNumbers.size(); | |
| 489 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && | |
| 490 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == | |
| 491 ascendingNumbers.get(numbersSize - 1).mPageNum) { | |
| 492 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 493 } | |
| 494 // Case #4: "... [i-1] [i] [i+1] ...". | |
| 495 for (int i = firstPos + 1; i < lastPos; i++) { | |
| 496 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) { | |
| 497 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 498 } | |
| 499 } | |
| 500 | |
| 501 // Otherwise, there's no pair of consecutive values. | |
| 502 return result; | |
| 503 } // arePageNumsAdjacentAndConsecutive | |
| 504 | |
| 505 /** | |
| 506 * | |
| 507 * Determines if the list of LinkInfo's form a linear formula: | |
| 508 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0). | |
|
cjhopman
2015/03/27 00:16:11
Do we really need this complicated linear formula?
kuan
2015/03/31 17:17:50
it's true we don't really care about the actual va
cjhopman
2015/04/07 00:45:48
i just want you to be sure if it's necessary or un
kuan
2015/04/10 22:41:27
i'm wary of removing it now, including the non-1 c
kuan
2015/04/13 17:21:38
to clarify the example above, the pagination URLs
| |
| 509 * | |
| 510 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2 | |
| 511 * LinkInfo's, and then validated against the remaining LinkInfo's. | |
| 512 * The order of page numbers doesn't matter. | |
| 513 * | |
| 514 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta , if the page | |
| 515 * parameter forumla could be determined. Otherwise, returns null. | |
| 516 * | |
| 517 * @param allLinkInfo the list of LinkInfo's to evaluate | |
| 518 */ | |
| 519 private static PageParamInfo.LinearFormula getPageParamLinearFormula( | |
| 520 List<LinkInfo> allLinkInfo) { | |
| 521 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; | |
| 522 | |
| 523 final LinkInfo firstLink = allLinkInfo.get(0); | |
| 524 final LinkInfo secondLink = allLinkInfo.get(1); | |
| 525 | |
| 526 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) { | |
| 527 return null; | |
| 528 } | |
| 529 | |
| 530 int deltaX = secondLink.mPageNum - firstLink.mPageNum; | |
| 531 if (deltaX == 0) return null; | |
| 532 | |
| 533 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; | |
| 534 int coefficient = deltaY / deltaX; | |
| 535 if (coefficient == 0) return null; | |
| 536 | |
| 537 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ; | |
| 538 if (delta != 0 && delta != -coefficient) return null; | |
| 539 | |
| 540 // Check if the remaining elements are on the same linear map. | |
| 541 for (int i = 2; i < allLinkInfo.size(); i++) { | |
| 542 final LinkInfo link = allLinkInfo.get(i); | |
| 543 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null; | |
| 544 } | |
| 545 | |
| 546 return new PageParamInfo.LinearFormula(coefficient, delta); | |
| 547 } // getPageParamLinearFormula | |
| 548 | |
| 549 /** | |
| 550 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on | |
| 551 * a pipeline of rules: | |
| 552 * - first PageInfo must have a URL unless it is the first page | |
| 553 * - there's only one plain number without URL in list | |
| 554 * - if only two pages, they must be siblings | |
|
cjhopman
2015/03/27 00:16:12
what's a sibling?
kuan
2015/03/31 17:17:50
Done.
| |
| 555 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be | |
| 556 * head/tail or have URLs. | |
| 557 * | |
| 558 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
| 559 */ | |
| 560 private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendin gNumbers) { | |
|
cjhopman
2015/03/27 00:16:12
Try to avoid abbreviations in function names: s/Se
kuan
2015/03/31 17:17:50
Done.
| |
| 561 if (ascendingNumbers.size() <= 1) return false; | |
| 562 | |
| 563 // The first one must have a URL unless it is the first page. | |
| 564 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); | |
| 565 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; | |
| 566 | |
| 567 // There's only one plain number without URL in ascending numbers group. | |
| 568 boolean hasPlainNum = false; | |
| 569 for (PageParamInfo.PageInfo page : ascendingNumbers) { | |
| 570 if (page.mUrl.isEmpty()) { | |
| 571 if (hasPlainNum) return false; | |
| 572 hasPlainNum = true; | |
| 573 } | |
| 574 } | |
| 575 | |
| 576 // If there are only two pages, they must be siblings. | |
| 577 if (ascendingNumbers.size() == 2) { | |
| 578 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; | |
| 579 } | |
| 580 | |
| 581 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e. | |
| 582 for (int i = 1; i < ascendingNumbers.size(); i++) { | |
| 583 // If two adjacent numbers are not consecutive, we accept them only when: | |
| 584 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n]. | |
| 585 // 2) both of them have URLs. | |
| 586 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); | |
| 587 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); | |
| 588 if (currPage.mPageNum - prevPage.mPageNum != 1) { | |
| 589 if (i != 1 && i != ascendingNumbers.size() - 1) return false; | |
| 590 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse; | |
| 591 } | |
| 592 } | |
| 593 | |
| 594 return true; | |
| 595 } // isPageNumberSeq | |
| 596 | |
| 597 private static RegExp sSlashExtRegExp = null; // Match either '/' or ".htm( l)". | |
|
cjhopman
2015/03/27 00:16:12
This name needs to be more descriptive.
kuan
2015/03/31 17:17:50
Done.
| |
| 598 | |
| 599 /** | |
| 600 * Returns true if a URL matches the generated page pattern based on a pipel ine of rules: | |
| 601 * - suffix (part of pattern after page param placeholder) must be same, and | |
| 602 * - for query page parameter, | |
| 603 * - scheme, host, and path must be same, and | |
| 604 * - query components, except that for page number, must be same in order and value, and | |
| 605 * - query value must be a plain number. | |
| 606 * - for path page parameter that is part of a path component, | |
| 607 * - if the first different character in path component is suffix, it must be a page parameter | |
| 608 * separator, followed by the page parameter in the pattern | |
| 609 * - else if it's page parameter, it and possible following digits must be a plain number. | |
| 610 * - for path page parameter that is the entire path component, | |
| 611 * - if URL has no page number param and previous path component, everythi ng else matches, or | |
| 612 * - if prefix is the same, URL doesn't have anything else | |
| 613 * - else url must have '/' at the same position as pattern's page paramet er path component, | |
| 614 * followed by a plain number. | |
| 615 * | |
| 616 * @param url the URL to evaluate | |
| 617 * @param pagePattern the URL page pattern to match with | |
| 618 */ | |
| 619 static boolean isPagingUrl(String url, String pagePattern) { | |
| 620 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER); | |
| 621 if (pageParamPos == -1) return false; | |
| 622 | |
| 623 int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1); | |
| 624 if (queryComponentStartPos == -1) { // Page number is the first query. | |
| 625 queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1); | |
| 626 } | |
| 627 | |
| 628 final int urlLen = url.length(); | |
| 629 final int patternLen = pagePattern.length(); | |
| 630 boolean isDynamicParam = queryComponentStartPos > 0 && | |
| 631 pagePattern.charAt(pageParamPos - 1) == '='; | |
| 632 | |
| 633 // Both url and patterm must have the same suffix, if available. | |
| 634 int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN; | |
| 635 if (suffixLen != 0) { | |
| 636 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'. | |
| 637 if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen, | |
| 638 compareLen)) { | |
| 639 return false; | |
| 640 } | |
| 641 } | |
| 642 | |
| 643 final int suffixPos = urlLen - suffixLen; | |
| 644 | |
| 645 if (isDynamicParam) { | |
| 646 // If page parameter is dynamic, the url matches the pattern only wh en: | |
| 647 // 1. has same prefix (scheme, host, path) | |
| 648 // 2. has same query components with same value (except page numbe r query) in the same | |
| 649 // order. | |
| 650 // Examples: | |
| 651 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&query C=v3 | |
| 652 // Returns true for: | |
| 653 // - http://foo.com/a/b/?queryA=v1&queryC=v3 | |
| 654 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3 | |
| 655 // Otherwise, returns false. | |
| 656 // | |
| 657 // If page pattern is http://foo.com/a/b?page=[*!]&query=a | |
| 658 // Returns true for: | |
| 659 // - http://foo.com/a/b?query=a | |
| 660 // - http://foo.com/a/b?page=2&query=a | |
| 661 // Otherwise, returns false. | |
| 662 // | |
| 663 // If page pattern is http://foo.com/a/b?page=[*!] | |
| 664 // Returns true for: | |
| 665 // - http://foo.com/a/b/ | |
| 666 // - http://foo.com/a/b.html | |
| 667 // - http://foo.com/a/b.htm | |
| 668 // - http://foo.com/a/b?page=2 | |
| 669 // Otherwise, returns false. | |
| 670 | |
| 671 // Both url and pattern must have the same prefix. | |
| 672 if (suffixPos < queryComponentStartPos || | |
| 673 !url.regionMatches(0, pagePattern, 0, queryComponentStartPos )) { | |
| 674 return false; | |
| 675 } | |
| 676 | |
| 677 // If the url doesn't have page number query, it is fine. | |
| 678 if (queryComponentStartPos == suffixPos) return true; | |
| 679 | |
| 680 // If the only difference in the page param query component of url a nd pattern is "/", | |
| 681 // ".html" or ".html", it is fine. | |
| 682 String diffPart = url.substring(queryComponentStartPos, suffixPos).t oLowerCase(); | |
| 683 if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/| (.html?)$", "i"); | |
| 684 if (sSlashExtRegExp.test(diffPart)) return true; | |
| 685 | |
| 686 // Both url and pattern must have the same query name. | |
| 687 if (!url.regionMatches(queryComponentStartPos, pagePattern, queryCom ponentStartPos, | |
| 688 pageParamPos - queryComponentStartPos)) { | |
| 689 return false; | |
| 690 } | |
| 691 | |
| 692 return isPlainNumber(url.substring(pageParamPos, suffixPos)); | |
| 693 } // isDynamicParam | |
| 694 | |
| 695 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: | |
| 696 // - www.foo.com/a/abc-2.html | |
| 697 // - www.foo.com/a/abc.html. | |
| 698 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: | |
| 699 // - www.foo.com/a/2/abc.html | |
| 700 // - www.foo.com/a/abc.html | |
| 701 // - www.foo.com/abc.html. | |
| 702 int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPo s); | |
| 703 if (pageParamPathComponentPos == -1) return false; | |
| 704 | |
| 705 // Handle case where page param is part of the path component (as oppose d to being the | |
| 706 // entire path component). | |
| 707 if (pagePattern.charAt(pageParamPos - 1) != '/') { | |
| 708 // The page param path component of both url and pattern must have t he same prefix. | |
| 709 if (urlLen < pageParamPathComponentPos + suffixLen || | |
| 710 !url.regionMatches(0, pagePattern, 0, pageParamPathComponent Pos)) { | |
| 711 return false; | |
| 712 } | |
| 713 | |
| 714 // Find the first different character in page param path component j ust before | |
| 715 // placeholder or suffix, then check if it's acceptable. | |
| 716 int firstDiffPos = pageParamPathComponentPos; | |
| 717 int maxPos = Math.min(pageParamPos, suffixPos); | |
| 718 for (; firstDiffPos < maxPos; firstDiffPos++) { | |
| 719 if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos) ) break; | |
| 720 } | |
| 721 if (firstDiffPos == suffixPos) { // First different character is th e suffix. | |
| 722 if (firstDiffPos + 1 == pageParamPos && | |
| 723 isPageParamSeparator(pagePattern.charAt(firstDiffPos))) { | |
| 724 return true; | |
| 725 } | |
| 726 } else if (firstDiffPos == pageParamPos) { // First different chara cter is page param. | |
| 727 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true; | |
| 728 } | |
| 729 | |
| 730 return false; | |
| 731 } // page param is part of the (not entire) path component. | |
| 732 | |
| 733 // Handle case where page param is the entire path component. | |
| 734 int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/', | |
| 735 pageParamPathComponentPos - 1); | |
| 736 if (prevPageParamPathComponentPos != -1) { | |
| 737 // The url doesn't have page number param and previous path componen t, like | |
| 738 // www.foo.com/abc.html. | |
| 739 if (prevPageParamPathComponentPos + suffixLen == urlLen) { | |
| 740 return url.regionMatches(0, pagePattern, 0, prevPageParamPathCom ponentPos); | |
| 741 } | |
| 742 } | |
| 743 | |
| 744 // If both url and pattern have the same prefix, url must have nothing e lse. | |
| 745 if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) { | |
| 746 int acceptLen = pageParamPathComponentPos + suffixLen; | |
| 747 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html. | |
| 748 if (acceptLen == urlLen) return true; | |
| 749 if (acceptLen > urlLen) return false; | |
| 750 | |
| 751 // While we are here, the url must have page number param, so the ur l must have a '/' | |
| 752 // at the pattern's path component start position. | |
| 753 if (url.charAt(pageParamPathComponentPos) != '/') return false; | |
| 754 | |
| 755 return isPlainNumber(url.substring(pageParamPathComponentPos + 1, su ffixPos)); | |
| 756 } | |
| 757 | |
| 758 return false; | |
| 759 } // isPagingUrl | |
| 760 | |
| 225 /** | 761 /** |
| 226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in | 762 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
| 227 * alphabetical order. | 763 * alphabetical order. |
| 228 */ | 764 */ |
| 229 private static void initBadPageParamNames() { | 765 private static void initBadPageParamNames() { |
| 230 if (sBadPageParamNames != null) return; | 766 if (sBadPageParamNames != null) return; |
| 231 | 767 |
| 232 sBadPageParamNames = new HashSet<String>(); | 768 sBadPageParamNames = new HashSet<String>(); |
| 233 sBadPageParamNames.add("baixar-gratis"); | 769 sBadPageParamNames.add("baixar-gratis"); |
| 234 sBadPageParamNames.add("category"); | 770 sBadPageParamNames.add("category"); |
| (...skipping 19 matching lines...) Expand all Loading... | |
| 254 sBadPageParamNames.add("sortby"); | 790 sBadPageParamNames.add("sortby"); |
| 255 sBadPageParamNames.add("subscriptions"); | 791 sBadPageParamNames.add("subscriptions"); |
| 256 sBadPageParamNames.add("tag"); | 792 sBadPageParamNames.add("tag"); |
| 257 sBadPageParamNames.add("tags"); | 793 sBadPageParamNames.add("tags"); |
| 258 sBadPageParamNames.add("video"); | 794 sBadPageParamNames.add("video"); |
| 259 sBadPageParamNames.add("videos"); | 795 sBadPageParamNames.add("videos"); |
| 260 sBadPageParamNames.add("w"); | 796 sBadPageParamNames.add("w"); |
| 261 sBadPageParamNames.add("wiki"); | 797 sBadPageParamNames.add("wiki"); |
| 262 } // initBadPageParamNames | 798 } // initBadPageParamNames |
| 263 | 799 |
| 800 /** | |
| 801 * Returns true if given string can be converted to a number >= 0. | |
| 802 */ | |
| 803 private static boolean isPlainNumber(String str) { | |
| 804 return StringUtil.toNumber(str) >= 0; | |
| 805 } // isPlainNumber | |
| 806 | |
| 807 /** | |
| 808 * Returns true if given character is one of '-', '_', ';', ','. | |
| 809 */ | |
| 810 public static native boolean isPageParamSeparator(Character c) /*-{ | |
| 811 return /[-_;,]/.test(c); | |
| 812 }-*/; | |
| 813 | |
| 264 } | 814 } |
| OLD | NEW |