Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(39)

Side by Side Diff: java/org/chromium/distiller/PageParameterDetector.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master
Patch Set: rename test Created 5 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/PageParameterDetectorTest.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 package org.chromium.distiller; 5 package org.chromium.distiller;
6 6
7 import com.google.gwt.regexp.shared.MatchResult; 7 import com.google.gwt.regexp.shared.MatchResult;
8 import com.google.gwt.regexp.shared.RegExp; 8 import com.google.gwt.regexp.shared.RegExp;
9 9
10 import java.util.ArrayList; 10 import java.util.ArrayList;
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
45 * <a href=http: *a/b?c=1&p=30>4</a> 45 * <a href=http: *a/b?c=1&p=30>4</a>
46 * <a href=http: *a/b?c=1&p=40>5</a> 46 * <a href=http: *a/b?c=1&p=40>5</a>
47 * <a href=http: *a/b?c=1&p=all>single page</a> 47 * <a href=http: *a/b?c=1&p=all>single page</a>
48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so
49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!]. 49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!].
50 * Then, this class extracts the single page based on page parameter info. The single page url is 50 * Then, this class extracts the single page based on page parameter info. The single page url is
51 * http: *a/b?c=1&p=all. 51 * http: *a/b?c=1&p=all.
52 */ 52 */
53 public class PageParameterDetector { 53 public class PageParameterDetector {
54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; 54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";
55 private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER .length();
56 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;
57
58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;
59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;
55 60
56 /** 61 /**
57 * Stores information about the link (anchor) after the page parameter is de tected: 62 * Stores information about the link (anchor) after the page parameter is de tected:
58 * - the page number (as represented by the original plain text) for the lin k 63 * - the page number (as represented by the original plain text) for the lin k
59 * - the original page parameter numeric component in the URL (this componen t would be replaced 64 * - the original page parameter numeric component in the URL (this componen t would be replaced
60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) 65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern)
61 * - the position of this link in the list of ascending numbers. 66 * - the position of this link in the list of ascending numbers.
62 */ 67 */
63 static class LinkInfo { 68 static class LinkInfo {
64 private int mPageNum; 69 private int mPageNum;
(...skipping 107 matching lines...) Expand 10 before | Expand all | Expand 10 after
172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); 177 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d));
173 if (value >= 0) { 178 if (value >= 0) {
174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + 179 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER +
175 urlStr.substring(matchEnd), 180 urlStr.substring(matchEnd),
176 new LinkInfo(pageNum, value, posInAscendingNumbers)); 181 new LinkInfo(pageNum, value, posInAscendingNumbers));
177 } 182 }
178 } // while there're matches 183 } // while there're matches
179 } // extractPageParamCandidatesFromPath 184 } // extractPageParamCandidatesFromPath
180 185
181 /** 186 /**
187 * Validates the page pattern according to the current document URL through a pipeline of rules:
188 * - for query page parameter, pattern and URL must have same path component s.
189 * - for path page parameter,
190 * - pattern and URL must have same number of path components.
191 * - if only 1 path component, both must have long-enough common prefix an d suffix.
192 * - else all pattern's components, except for page parameter, must be sam e as url's.
193 * - lastly, pattern's components cannot be calendar digits.
194 *
195 * Returns true if page pattern is valid.
196 *
197 * @param docUrl the current document URL
198 * @param pagePattern the page pattern to validate
199 */
200 static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) {
201 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);
202 if (pageParamPos == -1) return false;
203
204 ParsedUrl patternUrl = ParsedUrl.create(pagePattern);
205
206 // If page parameter is a query, page pattern and doc URL must have the same path.
207 if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) {
208 return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimme dPath());
209 }
210
211 final String[] urlPathComponents = docUrl.getPathComponents();
212 final String[] patternPathComponents = patternUrl.getPathComponents();
213 final int urlPathComponentsLen = urlPathComponents.length;
214 final int patternPathComponentsLen = patternPathComponents.length;
215
216 // If the page param is inside of path components, both the pattern and doc URL must have
217 // the similar path.
218 if (urlPathComponentsLen > patternPathComponentsLen) return false;
cjhopman 2015/03/27 00:16:12 why ">" and not "!="?
kuan 2015/03/31 17:17:50 because pattern can hv more path components than d
219
220 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must
221 // be at least half of the entire component in doc URL, e.g doc URL is
222 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".
223 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {
224 final String urlComponent = urlPathComponents[0];
225 final String patternComponent = patternPathComponents[0];
226 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);
227 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) +
228 commonPrefixLen) * 2 >= urlComponent.length();
229 }
230
231 // Get index of page parameter.
232 int paramIndex = 0;
233 for (; paramIndex < patternPathComponentsLen; paramIndex++) {
234 if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDE R)) break;
235 }
236
237 // Except for the component containing the page param, the other compone nts of doc URL must
cjhopman 2015/03/27 00:16:12 Can this be extracted to a separate function.
kuan 2015/03/31 17:17:50 Done.
238 // be part of pattern's path. But pattern may have more components, e.g . doc URL is
239 // /thread/12 and pattern is /thread/12/page/[*!].
240 boolean passedPageParamComponent = false;
241 for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathCompon entsLen; i++, j++) {
cjhopman 2015/03/27 00:16:12 I'm not really sure I follow the logic here (and a
cjhopman 2015/03/27 00:18:21 It won't reject that example actually. Still, how
kuan 2015/03/31 17:17:50 this would be invalid - pattern has extra "page" p
cjhopman 2015/04/07 00:45:48 I guess that the behavior doesn't seem to match th
kuan 2015/04/10 22:41:27 i've added ur examples, with explanations, to the
242 if (i == paramIndex && !passedPageParamComponent) {
243 passedPageParamComponent = true;
244 // Repeat current path component if doc URL has less components (as per comments
245 // just above, doc URL may have less components).
246 if (urlPathComponentsLen < patternPathComponentsLen) i--;
247 continue;
248 }
249
250 if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j]) ) return false;
251 }
252
253 // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which wou ld be a
254 // false-positive.
255 if (paramIndex >= 2 &&
cjhopman 2015/03/27 00:16:12 Extract this to another function
kuan 2015/03/31 17:17:50 Done.
256 // Only if param is the entire path component. This handles som e cases erroneously
257 // considered false-positives e.g. first page is
258 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467.html,
cjhopman 2015/03/27 00:16:12 why do we require that it be ordered yyyy/mm/dd fo
kuan 2015/03/31 17:17:50 i would think so. how else do we detect calendar
259 // and second page is
260 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467_Page2.html,
261 // would be considered false-positives otherwise because of "201 4" and "07".
262 patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHO LDER_LEN) {
263 int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1 ]);
264 if (month > 0 && month <= 12) {
265 int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]);
266 if (year > 1970 && year < 3000) return false;
267 }
268 }
269
270 return true;
271 } // isPagePatternValid
272
273 /**
274 * Evaluates if the given list of LinkInfo's is a list of paging URLs:
275 * - page numbers in list of LinkInfo's must be adjacent
276 * - page numbers in list of ascending numbers must either
277 * - be consecutive and form a page number sequence, or
278 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b
279 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must
280 * match page pattern, and the only outlink must be 2nd or 3rd page.
281 *
282 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.
283 *
284 * @param allLinkInfo the list of LinkInfo's to evaluate
285 * @param pagePattern the URL pattern to use
286 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
287 * @param firstPageUrl the URL of the PageInfo with mPageNum=1
288 */
289 private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkI nfo> allLinkInfo,
290 List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) {
291 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {
292 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers);
293 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;
294
295 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo);
296
297 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page
298 // number sequence.
299 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null;
300 if (!isPageNumberSeq(ascendingNumbers)) return null;
301 PageParamInfo pageParamInfo = new PageParamInfo();
302 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
303 pageParamInfo.mFormula = linearFormula;
304 for (LinkInfo link : allLinkInfo) {
305 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum,
306 ascendingNumbers.get(link.mPosInAscendingList).mUrl));
307 }
308 return pageParamInfo;
309 }
310
311 // Most of news article have no more than 3 pages and the first page pro bably doesn't have
312 // any page parameter. If the first page url matches the the page patte rn, we treat it as
313 // the first page of this pattern.
314 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {
315 final LinkInfo onlyLink = allLinkInfo.get(0);
316 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&
317 onlyLink.mPosInAscendingList == 1;
318 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&
319 onlyLink.mPosInAscendingList == 2 &&
320 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3
321 // elements; check if previous element is previous page.
322 ascendingNumbers.get(1).mPageNum == 2;
323 // 1 LinkInfo means ascendingNumbers has >= 1 element.
324 if (ascendingNumbers.get(0).mPageNum == 1 &&
325 (secondPageIsOutlink || thirdPageIsOutlink) &&
326 isPagingUrl(firstPageUrl, pagePattern)) {
327 // Has valid PageParamInfo, create and populate it.
328 PageParamInfo pageParamInfo = new PageParamInfo();
329 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;
330 int coefficient;
331 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;
332 if (delta == 0 || delta == 1) {
333 coefficient = 1;
334 } else {
335 coefficient = onlyLink.mPageParamValue;
336 delta = 0;
337 }
338 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta);
339 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl));
340 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum,
341 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) );
342 return pageParamInfo;
343 }
344 }
345
346 return null;
347 } // getPageParamInfo
348
349 /**
182 * Returns true if given name is backlisted as a known bad page param name. 350 * Returns true if given name is backlisted as a known bad page param name.
183 */ 351 */
184 private static boolean isPageParamNameBad(String name) { 352 private static boolean isPageParamNameBad(String name) {
185 initBadPageParamNames(); 353 initBadPageParamNames();
186 return sBadPageParamNames.contains(name.toLowerCase()); 354 return sBadPageParamNames.contains(name.toLowerCase());
187 } // isPageParamNameBad 355 } // isPageParamNameBad
188 356
189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). 357 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).
190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. 358 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.
191 359
192 /** 360 /**
193 * Returns true if: 361 * Returns true if:
194 * - the digitStart to digitEnd of urlStr is the last path component, and 362 * - the digitStart to digitEnd of urlStr is the last path component, and
195 * - the entire path component is numeric, and 363 * - the entire path component is numeric, and
196 * - the previous path component is a bad page param name. 364 * - the previous path component is a bad page param name.
197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad 365 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad
198 * page param. 366 * page param.
199 */ 367 */
200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, 368 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,
201 int digitStart, int digitEnd) { 369 int digitEnd) {
202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. 370 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.
203 pathStart < digitStart - 1) { // Not the first path component. 371 pathStart < digitStart - 1) { // Not the first path component.
204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); 372 String postMatch = urlStr.substring(digitEnd).toLowerCase();
205 // Checks that this is the last path component, and trailing charact ers, if available, 373 // Checks that this is the last path component, and trailing charact ers, if available,
206 // are (s)htm(l) extensions. 374 // are (s)htm(l) extensions.
207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); 375 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");
208 if (sExtRegExp.test(postMatch)) { 376 if (sExtRegExp.test(postMatch)) {
209 // Entire component is numeric, get previous path component. 377 // Entire component is numeric, get previous path component.
210 if (sLastPathComponentRegExp == null) { 378 if (sLastPathComponentRegExp == null) {
211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; 379 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;
212 } 380 }
213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( 381 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(
214 urlStr.substring(pathStart + 1, digitStart)); 382 urlStr.substring(pathStart + 1, digitStart));
215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && 383 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&
216 isPageParamNameBad(prevPathComponent.getGroup(1))) { 384 isPageParamNameBad(prevPathComponent.getGroup(1))) {
217 return true; 385 return true;
218 } 386 }
219 } // last numeric path component 387 } // last numeric path component
220 } 388 }
221 389
222 return false; 390 return false;
223 } // isLastNumericPathComponentBad 391 } // isLastNumericPathComponentBad
224 392
393 private static int getLongestCommonPrefixLength(String str1, String str2) {
394 if (str1.isEmpty() || str2.isEmpty()) return 0;
395
396 int limit = Math.min(str1.length(), str2.length());
397 int i = 0;
398 for (; i < limit; i++) {
399 if (str1.charAt(i) != str2.charAt(i)) break;
400 }
401 return i;
402 } // getLongestCommonPrefixLength
cjhopman 2015/03/27 00:16:12 Let's remove all these comments marking what funct
kuan 2015/03/31 17:17:50 Done.
403
404 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {
405 int commonSuffixLen = 0;
406 for (int i = str1.length() - 1, j = str2.length() - 1;
407 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {
408 if (str1.charAt(i) != str2.charAt(i)) break;
409 }
410 return commonSuffixLen;
411 } // getLongestCommonSuffixLength
412
413 /**
414 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of
415 * PageParamInfo.PageInfo's are consecutive.
416 *
417 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated
418 * by at most 1 plain text number which must represent the current page numb er in one of the
419 * PageParamInfo.PageInfo's.
420 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list
421 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are
422 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20...").
423 *
424 * Returns a int value that is a combination of bits:
425 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent
426 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive.
427 *
428 * @param allLinkInfo the list of LinkInfo's to evaluate
429 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
430 */
431 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,
432 List<PageParamInfo.PageInfo> ascendingNumbers) {
433 int result = 0;
434
435 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is
436 // current page number respresented in plain text.
437 int firstPos = -1;
438 int lastPos = -1;
439 int gapPos = -1;
440 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique.
441 for (LinkInfo linkInfo : allLinkInfo) {
442 final int currPos = linkInfo.mPosInAscendingList;
443 if (lastPos == -1) {
444 firstPos = currPos;
445 } else if (currPos != lastPos + 1) {
446 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6
447 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not
448 // adjacent.
449 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result;
450 gapPos = currPos - 1;
451 }
452 // Make sure page param value, i.e. page number represented in plain text, is unique.
453 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;
454 lastPos = currPos;
455 } // for all LinkInfo's
456
457 result |= PAGE_NUM_ADJACENT_MASK;
458
459 // Now, determine if page numbers in ascendingNumbers are consecutive.
460
461 // First, handle the gap.
462 if (gapPos != -1) {
463 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt;
464 // The "gap" should represent current page number in plain text.
465 // Check if its adjacent page numbers are consecutive.
466 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.
467 // This can eliminate links affecting the number of items on a page.
468 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;
469 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&
470 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {
471 return result | PAGE_NUM_CONSECUTIVE_MASK;
472 }
473 return result;
474 }
475
476 // There is no gap. Check if at least one of the following cases is sat isfied:
477 // Case #1: "[1] [2] ..." or "1 [2] ... ".
478 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&
479 ascendingNumbers.get(1).mPageNum == 2) {
480 return result | PAGE_NUM_CONSECUTIVE_MASK;
481 }
482 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.
483 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&
484 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) {
485 return result | PAGE_NUM_CONSECUTIVE_MASK;
486 }
487 // Case #3: "... [n-1] [n]" or "... [n - 1] n".
488 final int numbersSize = ascendingNumbers.size();
489 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) &&
490 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==
491 ascendingNumbers.get(numbersSize - 1).mPageNum) {
492 return result | PAGE_NUM_CONSECUTIVE_MASK;
493 }
494 // Case #4: "... [i-1] [i] [i+1] ...".
495 for (int i = firstPos + 1; i < lastPos; i++) {
496 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) {
497 return result | PAGE_NUM_CONSECUTIVE_MASK;
498 }
499 }
500
501 // Otherwise, there's no pair of consecutive values.
502 return result;
503 } // arePageNumsAdjacentAndConsecutive
504
505 /**
506 *
507 * Determines if the list of LinkInfo's form a linear formula:
508 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0).
cjhopman 2015/03/27 00:16:11 Do we really need this complicated linear formula?
kuan 2015/03/31 17:17:50 it's true we don't really care about the actual va
cjhopman 2015/04/07 00:45:48 i just want you to be sure if it's necessary or un
kuan 2015/04/10 22:41:27 i'm wary of removing it now, including the non-1 c
kuan 2015/04/13 17:21:38 to clarify the example above, the pagination URLs
509 *
510 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2
511 * LinkInfo's, and then validated against the remaining LinkInfo's.
512 * The order of page numbers doesn't matter.
513 *
514 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta , if the page
515 * parameter forumla could be determined. Otherwise, returns null.
516 *
517 * @param allLinkInfo the list of LinkInfo's to evaluate
518 */
519 private static PageParamInfo.LinearFormula getPageParamLinearFormula(
520 List<LinkInfo> allLinkInfo) {
521 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;
522
523 final LinkInfo firstLink = allLinkInfo.get(0);
524 final LinkInfo secondLink = allLinkInfo.get(1);
525
526 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) {
527 return null;
528 }
529
530 int deltaX = secondLink.mPageNum - firstLink.mPageNum;
531 if (deltaX == 0) return null;
532
533 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;
534 int coefficient = deltaY / deltaX;
535 if (coefficient == 0) return null;
536
537 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ;
538 if (delta != 0 && delta != -coefficient) return null;
539
540 // Check if the remaining elements are on the same linear map.
541 for (int i = 2; i < allLinkInfo.size(); i++) {
542 final LinkInfo link = allLinkInfo.get(i);
543 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null;
544 }
545
546 return new PageParamInfo.LinearFormula(coefficient, delta);
547 } // getPageParamLinearFormula
548
549 /**
550 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on
551 * a pipeline of rules:
552 * - first PageInfo must have a URL unless it is the first page
553 * - there's only one plain number without URL in list
554 * - if only two pages, they must be siblings
cjhopman 2015/03/27 00:16:12 what's a sibling?
kuan 2015/03/31 17:17:50 Done.
555 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be
556 * head/tail or have URLs.
557 *
558 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's
559 */
560 private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendin gNumbers) {
cjhopman 2015/03/27 00:16:12 Try to avoid abbreviations in function names: s/Se
kuan 2015/03/31 17:17:50 Done.
561 if (ascendingNumbers.size() <= 1) return false;
562
563 // The first one must have a URL unless it is the first page.
564 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);
565 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;
566
567 // There's only one plain number without URL in ascending numbers group.
568 boolean hasPlainNum = false;
569 for (PageParamInfo.PageInfo page : ascendingNumbers) {
570 if (page.mUrl.isEmpty()) {
571 if (hasPlainNum) return false;
572 hasPlainNum = true;
573 }
574 }
575
576 // If there are only two pages, they must be siblings.
577 if (ascendingNumbers.size() == 2) {
578 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;
579 }
580
581 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e.
582 for (int i = 1; i < ascendingNumbers.size(); i++) {
583 // If two adjacent numbers are not consecutive, we accept them only when:
584 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n].
585 // 2) both of them have URLs.
586 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);
587 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);
588 if (currPage.mPageNum - prevPage.mPageNum != 1) {
589 if (i != 1 && i != ascendingNumbers.size() - 1) return false;
590 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse;
591 }
592 }
593
594 return true;
595 } // isPageNumberSeq
596
597 private static RegExp sSlashExtRegExp = null; // Match either '/' or ".htm( l)".
cjhopman 2015/03/27 00:16:12 This name needs to be more descriptive.
kuan 2015/03/31 17:17:50 Done.
598
599 /**
600 * Returns true if a URL matches the generated page pattern based on a pipel ine of rules:
601 * - suffix (part of pattern after page param placeholder) must be same, and
602 * - for query page parameter,
603 * - scheme, host, and path must be same, and
604 * - query components, except that for page number, must be same in order and value, and
605 * - query value must be a plain number.
606 * - for path page parameter that is part of a path component,
607 * - if the first different character in path component is suffix, it must be a page parameter
608 * separator, followed by the page parameter in the pattern
609 * - else if it's page parameter, it and possible following digits must be a plain number.
610 * - for path page parameter that is the entire path component,
611 * - if URL has no page number param and previous path component, everythi ng else matches, or
612 * - if prefix is the same, URL doesn't have anything else
613 * - else url must have '/' at the same position as pattern's page paramet er path component,
614 * followed by a plain number.
615 *
616 * @param url the URL to evaluate
617 * @param pagePattern the URL page pattern to match with
618 */
619 static boolean isPagingUrl(String url, String pagePattern) {
620 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);
621 if (pageParamPos == -1) return false;
622
623 int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1);
624 if (queryComponentStartPos == -1) { // Page number is the first query.
625 queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1);
626 }
627
628 final int urlLen = url.length();
629 final int patternLen = pagePattern.length();
630 boolean isDynamicParam = queryComponentStartPos > 0 &&
631 pagePattern.charAt(pageParamPos - 1) == '=';
632
633 // Both url and patterm must have the same suffix, if available.
634 int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN;
635 if (suffixLen != 0) {
636 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'.
637 if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen,
638 compareLen)) {
639 return false;
640 }
641 }
642
643 final int suffixPos = urlLen - suffixLen;
644
645 if (isDynamicParam) {
646 // If page parameter is dynamic, the url matches the pattern only wh en:
647 // 1. has same prefix (scheme, host, path)
648 // 2. has same query components with same value (except page numbe r query) in the same
649 // order.
650 // Examples:
651 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&query C=v3
652 // Returns true for:
653 // - http://foo.com/a/b/?queryA=v1&queryC=v3
654 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3
655 // Otherwise, returns false.
656 //
657 // If page pattern is http://foo.com/a/b?page=[*!]&query=a
658 // Returns true for:
659 // - http://foo.com/a/b?query=a
660 // - http://foo.com/a/b?page=2&query=a
661 // Otherwise, returns false.
662 //
663 // If page pattern is http://foo.com/a/b?page=[*!]
664 // Returns true for:
665 // - http://foo.com/a/b/
666 // - http://foo.com/a/b.html
667 // - http://foo.com/a/b.htm
668 // - http://foo.com/a/b?page=2
669 // Otherwise, returns false.
670
671 // Both url and pattern must have the same prefix.
672 if (suffixPos < queryComponentStartPos ||
673 !url.regionMatches(0, pagePattern, 0, queryComponentStartPos )) {
674 return false;
675 }
676
677 // If the url doesn't have page number query, it is fine.
678 if (queryComponentStartPos == suffixPos) return true;
679
680 // If the only difference in the page param query component of url a nd pattern is "/",
681 // ".html" or ".html", it is fine.
682 String diffPart = url.substring(queryComponentStartPos, suffixPos).t oLowerCase();
683 if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/| (.html?)$", "i");
684 if (sSlashExtRegExp.test(diffPart)) return true;
685
686 // Both url and pattern must have the same query name.
687 if (!url.regionMatches(queryComponentStartPos, pagePattern, queryCom ponentStartPos,
688 pageParamPos - queryComponentStartPos)) {
689 return false;
690 }
691
692 return isPlainNumber(url.substring(pageParamPos, suffixPos));
693 } // isDynamicParam
694
695 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:
696 // - www.foo.com/a/abc-2.html
697 // - www.foo.com/a/abc.html.
698 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:
699 // - www.foo.com/a/2/abc.html
700 // - www.foo.com/a/abc.html
701 // - www.foo.com/abc.html.
702 int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPo s);
703 if (pageParamPathComponentPos == -1) return false;
704
705 // Handle case where page param is part of the path component (as oppose d to being the
706 // entire path component).
707 if (pagePattern.charAt(pageParamPos - 1) != '/') {
708 // The page param path component of both url and pattern must have t he same prefix.
709 if (urlLen < pageParamPathComponentPos + suffixLen ||
710 !url.regionMatches(0, pagePattern, 0, pageParamPathComponent Pos)) {
711 return false;
712 }
713
714 // Find the first different character in page param path component j ust before
715 // placeholder or suffix, then check if it's acceptable.
716 int firstDiffPos = pageParamPathComponentPos;
717 int maxPos = Math.min(pageParamPos, suffixPos);
718 for (; firstDiffPos < maxPos; firstDiffPos++) {
719 if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos) ) break;
720 }
721 if (firstDiffPos == suffixPos) { // First different character is th e suffix.
722 if (firstDiffPos + 1 == pageParamPos &&
723 isPageParamSeparator(pagePattern.charAt(firstDiffPos))) {
724 return true;
725 }
726 } else if (firstDiffPos == pageParamPos) { // First different chara cter is page param.
727 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true;
728 }
729
730 return false;
731 } // page param is part of the (not entire) path component.
732
733 // Handle case where page param is the entire path component.
734 int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/',
735 pageParamPathComponentPos - 1);
736 if (prevPageParamPathComponentPos != -1) {
737 // The url doesn't have page number param and previous path componen t, like
738 // www.foo.com/abc.html.
739 if (prevPageParamPathComponentPos + suffixLen == urlLen) {
740 return url.regionMatches(0, pagePattern, 0, prevPageParamPathCom ponentPos);
741 }
742 }
743
744 // If both url and pattern have the same prefix, url must have nothing e lse.
745 if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) {
746 int acceptLen = pageParamPathComponentPos + suffixLen;
747 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.
748 if (acceptLen == urlLen) return true;
749 if (acceptLen > urlLen) return false;
750
751 // While we are here, the url must have page number param, so the ur l must have a '/'
752 // at the pattern's path component start position.
753 if (url.charAt(pageParamPathComponentPos) != '/') return false;
754
755 return isPlainNumber(url.substring(pageParamPathComponentPos + 1, su ffixPos));
756 }
757
758 return false;
759 } // isPagingUrl
760
225 /** 761 /**
226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in 762 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in
227 * alphabetical order. 763 * alphabetical order.
228 */ 764 */
229 private static void initBadPageParamNames() { 765 private static void initBadPageParamNames() {
230 if (sBadPageParamNames != null) return; 766 if (sBadPageParamNames != null) return;
231 767
232 sBadPageParamNames = new HashSet<String>(); 768 sBadPageParamNames = new HashSet<String>();
233 sBadPageParamNames.add("baixar-gratis"); 769 sBadPageParamNames.add("baixar-gratis");
234 sBadPageParamNames.add("category"); 770 sBadPageParamNames.add("category");
(...skipping 19 matching lines...) Expand all
254 sBadPageParamNames.add("sortby"); 790 sBadPageParamNames.add("sortby");
255 sBadPageParamNames.add("subscriptions"); 791 sBadPageParamNames.add("subscriptions");
256 sBadPageParamNames.add("tag"); 792 sBadPageParamNames.add("tag");
257 sBadPageParamNames.add("tags"); 793 sBadPageParamNames.add("tags");
258 sBadPageParamNames.add("video"); 794 sBadPageParamNames.add("video");
259 sBadPageParamNames.add("videos"); 795 sBadPageParamNames.add("videos");
260 sBadPageParamNames.add("w"); 796 sBadPageParamNames.add("w");
261 sBadPageParamNames.add("wiki"); 797 sBadPageParamNames.add("wiki");
262 } // initBadPageParamNames 798 } // initBadPageParamNames
263 799
800 /**
801 * Returns true if given string can be converted to a number >= 0.
802 */
803 private static boolean isPlainNumber(String str) {
804 return StringUtil.toNumber(str) >= 0;
805 } // isPlainNumber
806
807 /**
808 * Returns true if given character is one of '-', '_', ';', ','.
809 */
810 public static native boolean isPageParamSeparator(Character c) /*-{
811 return /[-_;,]/.test(c);
812 }-*/;
813
264 } 814 }
OLDNEW
« no previous file with comments | « no previous file | javatests/org/chromium/distiller/PageParameterDetectorTest.java » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698