java/org/chromium/distiller/PageParameterDetector.java - Issue 1029593003: implement validations of pagination URLs

Side by Side Diff: java/org/chromium/distiller/PageParameterDetector.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: rename test Created 5 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import com.google.gwt.regexp.shared.MatchResult;	7 import com.google.gwt.regexp.shared.MatchResult;

8 import com.google.gwt.regexp.shared.RegExp;	8 import com.google.gwt.regexp.shared.RegExp;

9	9

10 import java.util.ArrayList;	10 import java.util.ArrayList;

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
45 * <a href=http://a/b?c=1&p=30>4</a>	45 * <a href=http://a/b?c=1&p=30>4</a>

46 * <a href=http://a/b?c=1&p=40>5</a>	46 * <a href=http://a/b?c=1&p=40>5</a>

47 * <a href=http://a/b?c=1&p=all>single page</a>	47 * <a href=http://a/b?c=1&p=all>single page</a>

48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so	48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so

49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].	49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].

50 * Then, this class extracts the single page based on page parameter info. The single page url is	50 * Then, this class extracts the single page based on page parameter info. The single page url is

51 * http://a/b?c=1&p=all.	51 * http://a/b?c=1&p=all.

52 */	52 */

53 public class PageParameterDetector {	53 public class PageParameterDetector {

54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";	54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";

	55 private static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER .length();

	56 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;

	57

	58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;

	59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;

55	60

56 /**	61 /**

57 * Stores information about the link (anchor) after the page parameter is de tected:	62 * Stores information about the link (anchor) after the page parameter is de tected:

58 * - the page number (as represented by the original plain text) for the lin k	63 * - the page number (as represented by the original plain text) for the lin k

59 * - the original page parameter numeric component in the URL (this componen t would be replaced	64 * - the original page parameter numeric component in the URL (this componen t would be replaced

60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern)	65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern)

61 * - the position of this link in the list of ascending numbers.	66 * - the position of this link in the list of ascending numbers.

62 */	67 */

63 static class LinkInfo {	68 static class LinkInfo {

64 private int mPageNum;	69 private int mPageNum;

(...skipping 107 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d));	177 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d));

173 if (value >= 0) {	178 if (value >= 0) {

174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER +	179 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER +

175 urlStr.substring(matchEnd),	180 urlStr.substring(matchEnd),

176 new LinkInfo(pageNum, value, posInAscendingNumbers));	181 new LinkInfo(pageNum, value, posInAscendingNumbers));

177 }	182 }

178 } // while there're matches	183 } // while there're matches

179 } // extractPageParamCandidatesFromPath	184 } // extractPageParamCandidatesFromPath

180	185

181 /**	186 /**

	187 * Validates the page pattern according to the current document URL through a pipeline of rules:

	188 * - for query page parameter, pattern and URL must have same path component s.

	189 * - for path page parameter,

	190 * - pattern and URL must have same number of path components.

	191 * - if only 1 path component, both must have long-enough common prefix an d suffix.

	192 * - else all pattern's components, except for page parameter, must be sam e as url's.

	193 * - lastly, pattern's components cannot be calendar digits.

	194 *

	195 * Returns true if page pattern is valid.

	196 *

	197 * @param docUrl the current document URL

	198 * @param pagePattern the page pattern to validate

	199 */

	200 static boolean isPagePatternValid(ParsedUrl docUrl, String pagePattern) {

	201 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);

	202 if (pageParamPos == -1) return false;

	203

	204 ParsedUrl patternUrl = ParsedUrl.create(pagePattern);

	205

	206 // If page parameter is a query, page pattern and doc URL must have the same path.

	207 if (pagePattern.lastIndexOf('?', pageParamPos - 1) != -1) {

	208 return docUrl.getTrimmedPath().equalsIgnoreCase(patternUrl.getTrimme dPath());

	209 }

	210

	211 final String[] urlPathComponents = docUrl.getPathComponents();

	212 final String[] patternPathComponents = patternUrl.getPathComponents();

	213 final int urlPathComponentsLen = urlPathComponents.length;

	214 final int patternPathComponentsLen = patternPathComponents.length;

	215

	216 // If the page param is inside of path components, both the pattern and doc URL must have

	217 // the similar path.

	218 if (urlPathComponentsLen > patternPathComponentsLen) return false;
	cjhopman 2015/03/27 00:16:12 why ">" and not "!="? why ">" and not "!="? kuan 2015/03/31 17:17:50 because pattern can hv more path components than d Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > why ">" and not "!="? because pattern can hv more path components than doc url e.g. the example in PagePattern.hasSamePathComponentsAs().
	219

	220 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must

	221 // be at least half of the entire component in doc URL, e.g doc URL is

	222 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".

	223 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {

	224 final String urlComponent = urlPathComponents[0];

	225 final String patternComponent = patternPathComponents[0];

	226 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);

	227 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) +

	228 commonPrefixLen) * 2 >= urlComponent.length();

	229 }

	230

	231 // Get index of page parameter.

	232 int paramIndex = 0;

	233 for (; paramIndex < patternPathComponentsLen; paramIndex++) {

	234 if (patternPathComponents[paramIndex].contains(PAGE_PARAM_PLACEHOLDE R)) break;

	235 }

	236

	237 // Except for the component containing the page param, the other compone nts of doc URL must
	cjhopman 2015/03/27 00:16:12 Can this be extracted to a separate function. Can this be extracted to a separate function. kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > Can this be extracted to a separate function. Done.
	238 // be part of pattern's path. But pattern may have more components, e.g . doc URL is

	239 // /thread/12 and pattern is /thread/12/page/[*!].

	240 boolean passedPageParamComponent = false;

	241 for (int i = 0, j = 0; i < urlPathComponentsLen && j < patternPathCompon entsLen; i++, j++) {
	cjhopman 2015/03/27 00:16:12 I'm not really sure I follow the logic here (and a I'm not really sure I follow the logic here (and afaict, this will reject the example in the comment). cjhopman 2015/03/27 00:18:21 It won't reject that example actually. Still, how Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > I'm not really sure I follow the logic here (and afaict, this will reject the > example in the comment). It won't reject that example actually. Still, how does it handle: /thread/12/foo /thread/12/page/[!]/foo and /thread/12/foo /thread/12/[!]/foo and does what it does in those two cases make sense? kuan 2015/03/31 17:17:50 this would be invalid - pattern has extra "page" p Show quoted text On 2015/03/27 00:18:21, cjhopman wrote: > On 2015/03/27 00:16:12, cjhopman wrote: > > I'm not really sure I follow the logic here (and afaict, this will reject the > > example in the comment). > > It won't reject that example actually. Still, how does it handle: > > /thread/12/foo /thread/12/page/[!]/foo this would be invalid - pattern has extra "page" path component before page param path component, which won't match with url's "foo" path component. Show quoted text > and > /thread/12/foo /thread/12/[!]/foo this would be valid - pattern's "foo" path component would match with url's "foo" path component. Show quoted text > and does what it does in those two cases make sense? i don't know if this is too strict, but i worry about false-positives. in any case, i added ur 2 examples to the tests. cjhopman 2015/04/07 00:45:48 I guess that the behavior doesn't seem to match th Show quoted text On 2015/03/31 17:17:50, kuan wrote: > On 2015/03/27 00:18:21, cjhopman wrote: > > On 2015/03/27 00:16:12, cjhopman wrote: > > > I'm not really sure I follow the logic here (and afaict, this will reject > the > > > example in the comment). > > > > It won't reject that example actually. 
Still, how does it handle: > > > > /thread/12/foo /thread/12/page/[!]/foo > > this would be invalid - pattern has extra "page" path component before page > param path component, which won't match with url's "foo" path component. > > > and > > /thread/12/foo /thread/12/[!]/foo > > this would be valid - pattern's "foo" path component would match with url's > "foo" path component. > > > and does what it does in those two cases make sense? > > i don't know if this is too strict, but i worry about false-positives. in any > case, i added ur 2 examples to the tests. I guess that the behavior doesn't seem to match the comment. The comment says "the other components of doc URL must be part of pattern's path. But pattern may have more components", which seems to imply that the following should be valid: /thread/12/foo /thread/12/page/[!]/foo kuan* 2015/04/10 22:41:27 i've added ur examples, with explanations, to the Show quoted text On 2015/04/07 00:45:48, cjhopman wrote: > On 2015/03/31 17:17:50, kuan wrote: > > On 2015/03/27 00:18:21, cjhopman wrote: > > > On 2015/03/27 00:16:12, cjhopman wrote: > > > > I'm not really sure I follow the logic here (and afaict, this will reject > > the > > > > example in the comment). > > > > > > It won't reject that example actually. Still, how does it handle: > > > > > > /thread/12/foo /thread/12/page/[!]/foo > > > > this would be invalid - pattern has extra "page" path component before page > > param path component, which won't match with url's "foo" path component. > > > > > and > > > /thread/12/foo /thread/12/[!]/foo > > > > this would be valid - pattern's "foo" path component would match with url's > > "foo" path component. > > > > > and does what it does in those two cases make sense? > > > > i don't know if this is too strict, but i worry about false-positives. in any > > case, i added ur 2 examples to the tests. > > I guess that the behavior doesn't seem to match the comment. 
The comment says > "the other components of doc URL must be part of pattern's path. But pattern > may have more components", which seems to imply that the following should be > valid: > > /thread/12/foo /thread/12/page/[*!]/foo i've added ur examples, with explanations, to the comments to illustrate in more details.
	242 if (i == paramIndex && !passedPageParamComponent) {

	243 passedPageParamComponent = true;

	244 // Repeat current path component if doc URL has less components (as per comments

	245 // just above, doc URL may have less components).

	246 if (urlPathComponentsLen < patternPathComponentsLen) i--;

	247 continue;

	248 }

	249

	250 if (!urlPathComponents[i].equalsIgnoreCase(patternPathComponents[j]) ) return false;

	251 }

	252

	253 // Check if pattern is for a calendar page, e.g. 2012/01/[*!], which wou ld be a

	254 // false-positive.

	255 if (paramIndex >= 2 &&
	cjhopman 2015/03/27 00:16:12 Extract this to another function Extract this to another function kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > Extract this to another function Done.
	256 // Only if param is the entire path component. This handles som e cases erroneously

	257 // considered false-positives e.g. first page is

	258 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467.html,
	cjhopman 2015/03/27 00:16:12 why do we require that it be ordered yyyy/mm/dd fo why do we require that it be ordered yyyy/mm/dd for us to reject it? Is that just particularly common? kuan 2015/03/31 17:17:50 i would think so. how else do we detect calendar Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > why do we require that it be ordered yyyy/mm/dd for us to reject it? Is that > just particularly common? i would think so. how else do we detect calendar links?
	259 // and second page is

	260 // http://www.politico.com/story/2014/07/barack-obama-immigratio n-legal-questions-109467_Page2.html,

	261 // would be considered false-positives otherwise because of "201 4" and "07".

	262 patternPathComponents[paramIndex].length() == PAGE_PARAM_PLACEHO LDER_LEN) {

	263 int month = StringUtil.toNumber(patternPathComponents[paramIndex - 1 ]);

	264 if (month > 0 && month <= 12) {

	265 int year = StringUtil.toNumber(patternPathComponents[paramIndex - 2]);

	266 if (year > 1970 && year < 3000) return false;

	267 }

	268 }

	269

	270 return true;

	271 } // isPagePatternValid

	272

	273 /**

	274 * Evaluates if the given list of LinkInfo's is a list of paging URLs:

	275 * - page numbers in list of LinkInfo's must be adjacent

	276 * - page numbers in list of ascending numbers must either

	277 * - be consecutive and form a page number sequence, or

	278 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b

	279 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must

	280 * match page pattern, and the only outlink must be 2nd or 3rd page.

	281 *

	282 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.

	283 *

	284 * @param allLinkInfo the list of LinkInfo's to evaluate

	285 * @param pagePattern the URL pattern to use

	286 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	287 * @param firstPageUrl the URL of the PageInfo with mPageNum=1

	288 */

	289 private static PageParamInfo getPageParamInfo(String pagePattern, List<LinkI nfo> allLinkInfo,

	290 List<PageParamInfo.PageInfo> ascendingNumbers, String firstPageUrl) {

	291 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {

	292 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers);

	293 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;

	294

	295 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo);

	296

	297 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page

	298 // number sequence.

	299 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null;

	300 if (!isPageNumberSeq(ascendingNumbers)) return null;

	301 PageParamInfo pageParamInfo = new PageParamInfo();

	302 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

	303 pageParamInfo.mFormula = linearFormula;

	304 for (LinkInfo link : allLinkInfo) {

	305 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum,

	306 ascendingNumbers.get(link.mPosInAscendingList).mUrl));

	307 }

	308 return pageParamInfo;

	309 }

	310

	311 // Most of news article have no more than 3 pages and the first page pro bably doesn't have

	312 // any page parameter. If the first page url matches the the page patte rn, we treat it as

	313 // the first page of this pattern.

	314 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {

	315 final LinkInfo onlyLink = allLinkInfo.get(0);

	316 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&

	317 onlyLink.mPosInAscendingList == 1;

	318 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&

	319 onlyLink.mPosInAscendingList == 2 &&

	320 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3

	321 // elements; check if previous element is previous page.

	322 ascendingNumbers.get(1).mPageNum == 2;

	323 // 1 LinkInfo means ascendingNumbers has >= 1 element.

	324 if (ascendingNumbers.get(0).mPageNum == 1 &&

	325 (secondPageIsOutlink \|\| thirdPageIsOutlink) &&

	326 isPagingUrl(firstPageUrl, pagePattern)) {

	327 // Has valid PageParamInfo, create and populate it.

	328 PageParamInfo pageParamInfo = new PageParamInfo();

	329 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

	330 int coefficient;

	331 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;

	332 if (delta == 0 \|\| delta == 1) {

	333 coefficient = 1;

	334 } else {

	335 coefficient = onlyLink.mPageParamValue;

	336 delta = 0;

	337 }

	338 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta);

	339 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl));

	340 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum,

	341 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) );

	342 return pageParamInfo;

	343 }

	344 }

	345

	346 return null;

	347 } // getPageParamInfo

	348

	349 /**

182 * Returns true if given name is backlisted as a known bad page param name.	350 * Returns true if given name is backlisted as a known bad page param name.

183 */	351 */

184 private static boolean isPageParamNameBad(String name) {	352 private static boolean isPageParamNameBad(String name) {

185 initBadPageParamNames();	353 initBadPageParamNames();

186 return sBadPageParamNames.contains(name.toLowerCase());	354 return sBadPageParamNames.contains(name.toLowerCase());

187 } // isPageParamNameBad	355 } // isPageParamNameBad

188	356

189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).	357 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).

190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.	358 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.

191	359

192 /**	360 /**

193 * Returns true if:	361 * Returns true if:

194 * - the digitStart to digitEnd of urlStr is the last path component, and	362 * - the digitStart to digitEnd of urlStr is the last path component, and

195 * - the entire path component is numeric, and	363 * - the entire path component is numeric, and

196 * - the previous path component is a bad page param name.	364 * - the previous path component is a bad page param name.

197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad	365 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad

198 * page param.	366 * page param.

199 */	367 */

200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,	368 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,

201 int digitStart, int digitEnd) {	369 int digitEnd) {

202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.	370 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.

203 pathStart < digitStart - 1) { // Not the first path component.	371 pathStart < digitStart - 1) { // Not the first path component.

204 String postMatch = urlStr.substring(digitEnd).toLowerCase();	372 String postMatch = urlStr.substring(digitEnd).toLowerCase();

205 // Checks that this is the last path component, and trailing charact ers, if available,	373 // Checks that this is the last path component, and trailing charact ers, if available,

206 // are (s)htm(l) extensions.	374 // are (s)htm(l) extensions.

207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");	375 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");

208 if (sExtRegExp.test(postMatch)) {	376 if (sExtRegExp.test(postMatch)) {

209 // Entire component is numeric, get previous path component.	377 // Entire component is numeric, get previous path component.

210 if (sLastPathComponentRegExp == null) {	378 if (sLastPathComponentRegExp == null) {

211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;	379 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;

212 }	380 }

213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(	381 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(

214 urlStr.substring(pathStart + 1, digitStart));	382 urlStr.substring(pathStart + 1, digitStart));

215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&	383 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&

216 isPageParamNameBad(prevPathComponent.getGroup(1))) {	384 isPageParamNameBad(prevPathComponent.getGroup(1))) {

217 return true;	385 return true;

218 }	386 }

219 } // last numeric path component	387 } // last numeric path component

220 }	388 }

221	389

222 return false;	390 return false;

223 } // isLastNumericPathComponentBad	391 } // isLastNumericPathComponentBad

224	392

	393 private static int getLongestCommonPrefixLength(String str1, String str2) {

	394 if (str1.isEmpty() \|\| str2.isEmpty()) return 0;

	395

	396 int limit = Math.min(str1.length(), str2.length());

	397 int i = 0;

	398 for (; i < limit; i++) {

	399 if (str1.charAt(i) != str2.charAt(i)) break;

	400 }

	401 return i;

	402 } // getLongestCommonPrefixLength
	cjhopman 2015/03/27 00:16:12 Let's remove all these comments marking what funct Let's remove all these comments marking what function is ending. kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > Let's remove all these comments marking what function is ending. Done.
	403

	404 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {

	405 int commonSuffixLen = 0;

	406 for (int i = str1.length() - 1, j = str2.length() - 1;

	407 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {

	408 if (str1.charAt(i) != str2.charAt(i)) break;

	409 }

	410 return commonSuffixLen;

	411 } // getLongestCommonSuffixLength

	412

	413 /**

	414 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of

	415 * PageParamInfo.PageInfo's are consecutive.

	416 *

	417 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated

	418 * by at most 1 plain text number which must represent the current page numb er in one of the

	419 * PageParamInfo.PageInfo's.

	420 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list

	421 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are

	422 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20...").

	423 *

	424 * Returns a int value that is a combination of bits:

	425 * - bit for PAGE_PARAM_ADJACENT_MASK is set if allLinkInfo are adjacent

	426 * - bit for PAGE_PARAM_CONSECUTIVE_MASK is set if ascendingNumbers are cons ecutive.

	427 *

	428 * @param allLinkInfo the list of LinkInfo's to evaluate

	429 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	430 */

	431 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,

	432 List<PageParamInfo.PageInfo> ascendingNumbers) {

	433 int result = 0;

	434

	435 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is

	436 // current page number respresented in plain text.

	437 int firstPos = -1;

	438 int lastPos = -1;

	439 int gapPos = -1;

	440 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique.

	441 for (LinkInfo linkInfo : allLinkInfo) {

	442 final int currPos = linkInfo.mPosInAscendingList;

	443 if (lastPos == -1) {

	444 firstPos = currPos;

	445 } else if (currPos != lastPos + 1) {

	446 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6

	447 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not

	448 // adjacent.

	449 if (currPos <= lastPos \|\| currPos != lastPos + 2 \|\| gapPos != -1 ) return result;

	450 gapPos = currPos - 1;

	451 }

	452 // Make sure page param value, i.e. page number represented in plain text, is unique.

	453 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;

	454 lastPos = currPos;

	455 } // for all LinkInfo's

	456

	457 result \|= PAGE_NUM_ADJACENT_MASK;

	458

	459 // Now, determine if page numbers in ascendingNumbers are consecutive.

	460

	461 // First, handle the gap.

	462 if (gapPos != -1) {

	463 if (gapPos <= 0 \|\| gapPos >= ascendingNumbers.size() - 1) return resu lt;

	464 // The "gap" should represent current page number in plain text.

	465 // Check if its adjacent page numbers are consecutive.

	466 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.

	467 // This can eliminate links affecting the number of items on a page.

	468 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;

	469 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&

	470 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {

	471 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	472 }

	473 return result;

	474 }

	475

	476 // There is no gap. Check if at least one of the following cases is sat isfied:

	477 // Case #1: "[1] [2] ..." or "1 [2] ... ".

	478 if ((firstPos == 0 \|\| firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&

	479 ascendingNumbers.get(1).mPageNum == 2) {

	480 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	481 }

	482 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.

	483 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&

	484 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) {

	485 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	486 }

	487 // Case #3: "... [n-1] [n]" or "... [n - 1] n".

	488 final int numbersSize = ascendingNumbers.size();

	489 if ((lastPos == numbersSize - 1 \|\| lastPos == numbersSize - 2) &&

	490 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==

	491 ascendingNumbers.get(numbersSize - 1).mPageNum) {

	492 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	493 }

	494 // Case #4: "... [i-1] [i] [i+1] ...".

	495 for (int i = firstPos + 1; i < lastPos; i++) {

	496 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) {

	497 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	498 }

	499 }

	500

	501 // Otherwise, there's no pair of consecutive values.

	502 return result;

	503 } // arePageNumsAdjacentAndConsecutive

	504

	505 /**

	506 *

	507 * Determines if the list of LinkInfo's form a linear formula:

	508 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0).
	cjhopman 2015/03/27 00:16:11 Do we really need this complicated linear formula? Do we really need this complicated linear formula? It feels like the coefficient is useful for pages like search results or forums or something but not for things that we actually expect to work on. kuan 2015/03/31 17:17:50 it's true we don't really care about the actual va Show quoted text On 2015/03/27 00:16:11, cjhopman wrote: > Do we really need this complicated linear formula? It feels like the coefficient > is useful for pages like search results or forums or something but not for > things that we actually expect to work on. it's true we don't really care about the actual values of coefficient and delta, but without them, how do we know if the page parameter is an actual one that maps to the page number? won't this help weed out false-positives with arbitrary numbers in the page parameter? i could remove it if u're sure it's unnecessary. cjhopman 2015/04/07 00:45:48 i just want you to be sure if it's necessary or un Show quoted text On 2015/03/31 17:17:50, kuan wrote: > On 2015/03/27 00:16:11, cjhopman wrote: > > Do we really need this complicated linear formula? It feels like the > coefficient > > is useful for pages like search results or forums or something but not for > > things that we actually expect to work on. > > it's true we don't really care about the actual values of coefficient and delta, > but without them, how do we know if the page parameter is an actual one that > maps to the page number? won't this help weed out false-positives with > arbitrary numbers in the page parameter? > > i could remove it if u're sure it's unnecessary. i just want you to be sure if it's necessary or unnecessary. I think that you want something like it, but it might not need to support non-1 coefficient. 
kuan 2015/04/10 22:41:27 i'm wary of removing it now, including the non-1 c Show quoted text On 2015/04/07 00:45:48, cjhopman wrote: > On 2015/03/31 17:17:50, kuan wrote: > > On 2015/03/27 00:16:11, cjhopman wrote: > > > Do we really need this complicated linear formula? It feels like the > > coefficient > > > is useful for pages like search results or forums or something but not for > > > things that we actually expect to work on. > > > > it's true we don't really care about the actual values of coefficient and > delta, > > but without them, how do we know if the page parameter is an actual one that > > maps to the page number? won't this help weed out false-positives with > > arbitrary numbers in the page parameter? > > > > i could remove it if u're sure it's unnecessary. > > i just want you to be sure if it's necessary or unnecessary. I think that you > want something like it, but it might not need to support non-1 coefficient. i'm wary of removing it now, including the non-1 coefficient support, without enough knowledge on how websites in the wild organize their pages. i do think we need to verify the linear formula exists. if we only support 1 coefficient, the following pagination URLs (in the tests) won't be detected, cos coefficient is 10: - page 1: http://www.google.com/test - page 2: http://www.google.com/test?page=10 - page 3: http://www.google.com/test?page=20. i'd prefer to keep this as is, for now anyway, so i've added a TODO to reassess its necessity. kuan 2015/04/13 17:21:38 to clarify the example above, the pagination URLs Show quoted text On 2015/04/10 22:41:27, kuan wrote: > On 2015/04/07 00:45:48, cjhopman wrote: > > On 2015/03/31 17:17:50, kuan wrote: > > > On 2015/03/27 00:16:11, cjhopman wrote: > > > > Do we really need this complicated linear formula? It feels like the > > > coefficient > > > > is useful for pages like search results or forums or something but not for > > > > things that we actually expect to work on. 
> > > > > > it's true we don't really care about the actual values of coefficient and > > delta, > > > but without them, how do we know if the page parameter is an actual one that > > > maps to the page number? won't this help weed out false-positives with > > > arbitrary numbers in the page parameter? > > > > > > i could remove it if u're sure it's unnecessary. > > > > i just want you to be sure if it's necessary or unnecessary. I think that you > > want something like it, but it might not need to support non-1 coefficient. > > i'm wary of removing it now, including the non-1 coefficient support, without > enough knowledge on how websites in the wild organize their pages. i do think > we need to verify the linear formula exists. > > if we only support 1 coefficient, the following pagination URLs (in the tests) > won't be detected, cos coefficient is 10: > - page 1: http://www.google.com/test > - page 2: http://www.google.com/test?page=10 > - page 3: http://www.google.com/test?page=20. > > i'd prefer to keep this as is, for now anyway, so i've added a TODO to reassess > its necessity. to clarify the example above, the pagination URLs would be detected, initially. however, if there also exist other pagination URLs with page=arbitraryNumber, we wouldn't know these do not form a linear formula. in the end, we'd end up with a non-decisive situation, and conclude that there's no pagination.
	509 *

	510 * The coefficient and delta are calculated from the page parameter values and page numbers of 2

	511 * LinkInfo's, and then validated against the remaining LinkInfo's.

	512 * The order of page numbers doesn't matter.

	513 *

	514 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page

	515 * parameter formula could be determined. Otherwise, returns null.

	516 *

	517 * @param allLinkInfo the list of LinkInfo's to evaluate

	518 */

	519 private static PageParamInfo.LinearFormula getPageParamLinearFormula(

	520 List<LinkInfo> allLinkInfo) {

	521 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;

	522

	523 final LinkInfo firstLink = allLinkInfo.get(0);

	524 final LinkInfo secondLink = allLinkInfo.get(1);

	525

	526 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) {

	527 return null;

	528 }

	529

	530 int deltaX = secondLink.mPageNum - firstLink.mPageNum;

	531 if (deltaX == 0) return null;

	532

	533 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;

	534 int coefficient = deltaY / deltaX;

	535 if (coefficient == 0) return null;

	536

	537 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ;

	538 if (delta != 0 && delta != -coefficient) return null;

	539

	540 // Check if the remaining elements are on the same linear map.

	541 for (int i = 2; i < allLinkInfo.size(); i++) {

	542 final LinkInfo link = allLinkInfo.get(i);

	543 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null;

	544 }

	545

	546 return new PageParamInfo.LinearFormula(coefficient, delta);

	547 } // getPageParamLinearFormula

	548

	549 /**

	550 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a sequence, based on

	551 * a pipeline of rules:

	552 * - first PageInfo must have a URL unless it is the first page

	553 * - there's only one plain number without URL in list

	554 * - if only two pages, they must be siblings
	cjhopman 2015/03/27 00:16:12 what's a sibling? what's a sibling? kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > what's a sibling? Done.
	555 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecutive numbers must be

	556 * head/tail or have URLs.

	557 *

	558 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	559 */

	560 private static boolean isPageNumberSeq(List<PageParamInfo.PageInfo> ascendin gNumbers) {
	cjhopman 2015/03/27 00:16:12 Try to avoid abbreviations in function names: s/Se Try to avoid abbreviations in function names: s/Seq/Sequence kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > Try to avoid abbreviations in function names: s/Seq/Sequence Done.
	561 if (ascendingNumbers.size() <= 1) return false;

	562

	563 // The first one must have a URL unless it is the first page.

	564 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);

	565 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;

	566

	567 // There's only one plain number without URL in ascending numbers group.

	568 boolean hasPlainNum = false;

	569 for (PageParamInfo.PageInfo page : ascendingNumbers) {

	570 if (page.mUrl.isEmpty()) {

	571 if (hasPlainNum) return false;

	572 hasPlainNum = true;

	573 }

	574 }

	575

	576 // If there are only two pages, they must be siblings.

	577 if (ascendingNumbers.size() == 2) {

	578 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;

	579 }

	580

	581 // Check if page numbers in ascendingNumbers are adjacent and consecutive.

	582 for (int i = 1; i < ascendingNumbers.size(); i++) {

	583 // If two adjacent numbers are not consecutive, we accept them only when:

	584 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2], [3]...[i], [n].

	585 // 2) both of them have URLs.

	586 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);

	587 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);

	588 if (currPage.mPageNum - prevPage.mPageNum != 1) {

	589 if (i != 1 && i != ascendingNumbers.size() - 1) return false;

	590 if (currPage.mUrl.isEmpty() \|\| prevPage.mUrl.isEmpty()) return f alse;

	591 }

	592 }

	593

	594 return true;

	595 } // isPageNumberSeq

	596

	597 private static RegExp sSlashExtRegExp = null; // Match either '/' or ".htm( l)".
	cjhopman 2015/03/27 00:16:12 This name needs to be more descriptive. This name needs to be more descriptive. kuan 2015/03/31 17:17:50 Done. Show quoted text On 2015/03/27 00:16:12, cjhopman wrote: > This name needs to be more descriptive. Done.
	598

	599 /**

	600 * Returns true if a URL matches the generated page pattern based on a pipeline of rules:

	601 * - suffix (part of pattern after page param placeholder) must be same, and

	602 * - for query page parameter,

	603 * - scheme, host, and path must be same, and

	604 * - query components, except that for page number, must be same in order and value, and

	605 * - query value must be a plain number.

	606 * - for path page parameter that is part of a path component,

	607 * - if the first different character in path component is suffix, it must be a page parameter

	608 * separator, followed by the page parameter in the pattern

	609 * - else if it's page parameter, it and possible following digits must be a plain number.

	610 * - for path page parameter that is the entire path component,

	611 * - if URL has no page number param and previous path component, everything else matches, or

	612 * - if prefix is the same, URL doesn't have anything else

	613 * - else url must have '/' at the same position as pattern's page parameter path component,

	614 * followed by a plain number.

	615 *

	616 * @param url the URL to evaluate

	617 * @param pagePattern the URL page pattern to match with

	618 */

	619 static boolean isPagingUrl(String url, String pagePattern) {

	620 int pageParamPos = pagePattern.indexOf(PAGE_PARAM_PLACEHOLDER);

	621 if (pageParamPos == -1) return false;

	622

	623 int queryComponentStartPos = pagePattern.lastIndexOf('&', pageParamPos - 1);

	624 if (queryComponentStartPos == -1) { // Page number is the first query.

	625 queryComponentStartPos = pagePattern.lastIndexOf('?', pageParamPos - 1);

	626 }

	627

	628 final int urlLen = url.length();

	629 final int patternLen = pagePattern.length();

	630 boolean isDynamicParam = queryComponentStartPos > 0 &&

	631 pagePattern.charAt(pageParamPos - 1) == '=';

	632

	633 // Both url and pattern must have the same suffix, if available.

	634 int suffixLen = patternLen - pageParamPos - PAGE_PARAM_PLACEHOLDER_LEN;

	635 if (suffixLen != 0) {

	636 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'.

	637 if (!url.regionMatches(urlLen - compareLen, pagePattern, patternLen - compareLen,

	638 compareLen)) {

	639 return false;

	640 }

	641 }

	642

	643 final int suffixPos = urlLen - suffixLen;

	644

	645 if (isDynamicParam) {

	646 // If page parameter is dynamic, the url matches the pattern only when:

	647 // 1. has same prefix (scheme, host, path)

	648 // 2. has same query components with same value (except page number query) in the same

	649 // order.

	650 // Examples:

	651 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&queryC=v3

	652 // Returns true for:

	653 // - http://foo.com/a/b/?queryA=v1&queryC=v3

	654 // - http://foo.com/a/b/?queryA=v1&queryB=4&queryC=v3

	655 // Otherwise, returns false.

	656 //

	657 // If page pattern is http://foo.com/a/b?page=[*!]&query=a

	658 // Returns true for:

	659 // - http://foo.com/a/b?query=a

	660 // - http://foo.com/a/b?page=2&query=a

	661 // Otherwise, returns false.

	662 //

	663 // If page pattern is http://foo.com/a/b?page=[*!]

	664 // Returns true for:

	665 // - http://foo.com/a/b/

	666 // - http://foo.com/a/b.html

	667 // - http://foo.com/a/b.htm

	668 // - http://foo.com/a/b?page=2

	669 // Otherwise, returns false.

	670

	671 // Both url and pattern must have the same prefix.

	672 if (suffixPos < queryComponentStartPos \|\|

	673 !url.regionMatches(0, pagePattern, 0, queryComponentStartPos )) {

	674 return false;

	675 }

	676

	677 // If the url doesn't have page number query, it is fine.

	678 if (queryComponentStartPos == suffixPos) return true;

	679

	680 // If the only difference in the page param query component of url and pattern is "/",

	681 // ".htm" or ".html", it is fine.

	682 String diffPart = url.substring(queryComponentStartPos, suffixPos).t oLowerCase();

	683 if (sSlashExtRegExp == null) sSlashExtRegExp = RegExp.compile("^\\/\| (.html?)$", "i");

	684 if (sSlashExtRegExp.test(diffPart)) return true;

	685

	686 // Both url and pattern must have the same query name.

	687 if (!url.regionMatches(queryComponentStartPos, pagePattern, queryCom ponentStartPos,

	688 pageParamPos - queryComponentStartPos)) {

	689 return false;

	690 }

	691

	692 return isPlainNumber(url.substring(pageParamPos, suffixPos));

	693 } // isDynamicParam

	694

	695 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:

	696 // - www.foo.com/a/abc-2.html

	697 // - www.foo.com/a/abc.html.

	698 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:

	699 // - www.foo.com/a/2/abc.html

	700 // - www.foo.com/a/abc.html

	701 // - www.foo.com/abc.html.

	702 int pageParamPathComponentPos = pagePattern.lastIndexOf('/', pageParamPo s);

	703 if (pageParamPathComponentPos == -1) return false;

	704

	705 // Handle case where page param is part of the path component (as opposed to being the

	706 // entire path component).

	707 if (pagePattern.charAt(pageParamPos - 1) != '/') {

	708 // The page param path component of both url and pattern must have the same prefix.

	709 if (urlLen < pageParamPathComponentPos + suffixLen \|\|

	710 !url.regionMatches(0, pagePattern, 0, pageParamPathComponent Pos)) {

	711 return false;

	712 }

	713

	714 // Find the first different character in page param path component just before

	715 // placeholder or suffix, then check if it's acceptable.

	716 int firstDiffPos = pageParamPathComponentPos;

	717 int maxPos = Math.min(pageParamPos, suffixPos);

	718 for (; firstDiffPos < maxPos; firstDiffPos++) {

	719 if (url.charAt(firstDiffPos) != pagePattern.charAt(firstDiffPos) ) break;

	720 }

	721 if (firstDiffPos == suffixPos) { // First different character is th e suffix.

	722 if (firstDiffPos + 1 == pageParamPos &&

	723 isPageParamSeparator(pagePattern.charAt(firstDiffPos))) {

	724 return true;

	725 }

	726 } else if (firstDiffPos == pageParamPos) { // First different chara cter is page param.

	727 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true;

	728 }

	729

	730 return false;

	731 } // page param is part of the (not entire) path component.

	732

	733 // Handle case where page param is the entire path component.

	734 int prevPageParamPathComponentPos = pagePattern.lastIndexOf('/',

	735 pageParamPathComponentPos - 1);

	736 if (prevPageParamPathComponentPos != -1) {

	737 // The url doesn't have page number param and previous path component, like

	738 // www.foo.com/abc.html.

	739 if (prevPageParamPathComponentPos + suffixLen == urlLen) {

	740 return url.regionMatches(0, pagePattern, 0, prevPageParamPathCom ponentPos);

	741 }

	742 }

	743

	744 // If both url and pattern have the same prefix, url must have nothing else.

	745 if (url.regionMatches(0, pagePattern, 0, pageParamPathComponentPos)) {

	746 int acceptLen = pageParamPathComponentPos + suffixLen;

	747 // The url doesn't have page number parameter, like www.foo.com/a/abc.html.

	748 if (acceptLen == urlLen) return true;

	749 if (acceptLen > urlLen) return false;

	750

	751 // While we are here, the url must have page number param, so the url must have a '/'

	752 // at the pattern's path component start position.

	753 if (url.charAt(pageParamPathComponentPos) != '/') return false;

	754

	755 return isPlainNumber(url.substring(pageParamPathComponentPos + 1, su ffixPos));

	756 }

	757

	758 return false;

	759 } // isPagingUrl

	760

225 /**	761 /**

226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in	762 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in

227 * alphabetical order.	763 * alphabetical order.

228 */	764 */

229 private static void initBadPageParamNames() {	765 private static void initBadPageParamNames() {

230 if (sBadPageParamNames != null) return;	766 if (sBadPageParamNames != null) return;

231	767

232 sBadPageParamNames = new HashSet<String>();	768 sBadPageParamNames = new HashSet<String>();

233 sBadPageParamNames.add("baixar-gratis");	769 sBadPageParamNames.add("baixar-gratis");

234 sBadPageParamNames.add("category");	770 sBadPageParamNames.add("category");

(...skipping 19 matching lines...) Expand all Loading...
254 sBadPageParamNames.add("sortby");	790 sBadPageParamNames.add("sortby");

255 sBadPageParamNames.add("subscriptions");	791 sBadPageParamNames.add("subscriptions");

256 sBadPageParamNames.add("tag");	792 sBadPageParamNames.add("tag");

257 sBadPageParamNames.add("tags");	793 sBadPageParamNames.add("tags");

258 sBadPageParamNames.add("video");	794 sBadPageParamNames.add("video");

259 sBadPageParamNames.add("videos");	795 sBadPageParamNames.add("videos");

260 sBadPageParamNames.add("w");	796 sBadPageParamNames.add("w");

261 sBadPageParamNames.add("wiki");	797 sBadPageParamNames.add("wiki");

262 } // initBadPageParamNames	798 } // initBadPageParamNames

263	799

	800 /**

	801 * Returns true if given string can be converted to a number >= 0.

	802 */

	803 private static boolean isPlainNumber(String str) {

	804 return StringUtil.toNumber(str) >= 0;

	805 } // isPlainNumber

	806

	807 /**

	808 * Returns true if given character is one of '-', '_', ';', ','.

	809 */

	810 public static native boolean isPageParamSeparator(Character c) /*-{

	811 return /[-_;,]/.test(c);

	812 }-*/;

	813

264 }	814 }

OLD	NEW

« no previous file with comments | « no previous file | javatests/org/chromium/distiller/PageParameterDetectorTest.java » ('j') | no next file with comments »