java/org/chromium/distiller/PageParameterDetector.java - Issue 1029593003: implement validations of pagination URLs

Side by Side Diff: java/org/chromium/distiller/PageParameterDetector.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: nit Created 5 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 package org.chromium.distiller;	5 package org.chromium.distiller;

6	6

7 import com.google.gwt.regexp.shared.MatchResult;	7 import com.google.gwt.regexp.shared.MatchResult;

8 import com.google.gwt.regexp.shared.RegExp;	8 import com.google.gwt.regexp.shared.RegExp;

9	9

10 import java.util.ArrayList;	10 import java.util.ArrayList;

(...skipping 11 matching lines...) Expand all Loading...
22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The	22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The

23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the	23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the

24 * whole content, called "single page".	24 * whole content, called "single page".

25 *	25 *

26 * Definitions:	26 * Definitions:

27 * A single page document is a document that contains the whole content.	27 * A single page document is a document that contains the whole content.

28 * A paging document is one of the partial pages.	28 * A paging document is one of the partial pages.

29 * "digital" means the text contains only digits.	29 * "digital" means the text contains only digits.

30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder	30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder

31 * (PAGE_PARAM_PLACEHOLDER).	31 * (PAGE_PARAM_PLACEHOLDER).

32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat tern is	32 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is

33 * "http: www.foo.com/a/b-[!].html".	33 * "http://www.foo.com/a/b-[*!].html".

34 *	34 *

35 * This class extracts the page parameter from a document's outlinks.	35 * This class extracts the page parameter from a document's outlinks.

36 * The basic idea:	36 * The basic idea:

37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text.	37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text.

38 * #2. For each group, determine the relationship between digital anchor texts and digital parts	38 * #2. For each group, determine the relationship between digital anchor texts and digital parts

39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear	39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear

40 * map from its digital anchor text, we guess the part is the page parame ter of the URL.	40 * map from its digital anchor text, we guess the part is the page parame ter of the URL.

41 *	41 *

42 * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital	42 * As an example, consider a document http://a/b?c=1&p=10, which contains the following digital

43 * outlinks:	43 * outlinks:

44 * <a href=http://a/b?c=1&p=20>3</a>	44 * <a href=http://a/b?c=1&p=20>3</a>

45 * <a href=http://a/b?c=1&p=30>4</a>	45 * <a href=http://a/b?c=1&p=30>4</a>

46 * <a href=http://a/b?c=1&p=40>5</a>	46 * <a href=http://a/b?c=1&p=40>5</a>

47 * <a href=http://a/b?c=1&p=all>single page</a>	47 * <a href=http://a/b?c=1&p=all>single page</a>

48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so	48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so

49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].	49 * guesses it is the page parameter. The associated page pattern is http://a/b?c=1&p=[*!].

50 * Then, this class extracts the single page based on page parameter info. The single page url is	50 * Then, this class extracts the single page based on page parameter info. The single page url is

51 * http://a/b?c=1&p=all.	51 * http://a/b?c=1&p=all.

52 */	52 */

53 public class PageParameterDetector {	53 public class PageParameterDetector {

54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]";	54 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2;

	55

	56 static final String PAGE_PARAM_PLACEHOLDER = "[*!]";

	57 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length( );

	58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0;

	59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1;

55	60

56 /**	61 /**

57 * Stores information about the link (anchor) after the page parameter is de tected:	62 * Stores information about the link (anchor) after the page parameter is de tected:

58 * - the page number (as represented by the original plain text) for the lin k	63 * - the page number (as represented by the original plain text) for the lin k

59 * - the original page parameter numeric component in the URL (this componen t would be replaced	64 * - the original page parameter numeric component in the URL (this componen t would be replaced

60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern)	65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern)

61 * - the position of this link in the list of ascending numbers.	66 * - the position of this link in the list of ascending numbers.

62 */	67 */

63 static class LinkInfo {	68 static class LinkInfo {

64 private int mPageNum;	69 private int mPageNum;

65 private int mPageParamValue;	70 private int mPageParamValue;

66 private int mPosInAscendingList;	71 private int mPosInAscendingList;

67	72

68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) {	73 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) {

69 mPageNum = pageNum;	74 mPageNum = pageNum;

70 mPageParamValue = pageParamValue;	75 mPageParamValue = pageParamValue;

71 mPosInAscendingList = posInAscendingList;	76 mPosInAscendingList = posInAscendingList;

72 }	77 }

73 } // LinkInfo	78 }

74	79

75 /**	80 /**

76 * Stores a map of URL pattern to its associated list of LinkInfo's.	81 * Stores a map of URL pattern to its associated list of LinkInfo's.

77 */	82 */

78 private static class PageCandidatesMap {	83 private static class PageCandidatesMap {

79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List <LinkInfo>>();	84 private static class Info {

	85 private final PagePattern mPattern;

	86 private final List<LinkInfo> mLinks;

	87

	88 Info(PagePattern pattern, LinkInfo link) {

	89 mPattern = pattern;

	90 mLinks = new ArrayList<LinkInfo>();

	91 mLinks.add(link);

	92 }

	93 }

	94

	95 private final Map<String, Info> map = new HashMap<String, Info>();

80	96

81 /**	97 /**

82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds	98 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds

83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry.	99 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry.

	100 * Returns true if addition is successful.

84 */	101 */

85 private void add(String urlPattern, LinkInfo link) {	102 private boolean add(String urlPattern, LinkInfo link) {

86 if (map.containsKey(urlPattern)) {	103 if (map.containsKey(urlPattern)) {

87 map.get(urlPattern).add(link);	104 map.get(urlPattern).mLinks.add(link);

88 } else {	105 return true;

89 List<LinkInfo> links = new ArrayList<LinkInfo>();

90 links.add(link);

91 map.put(urlPattern, links);

92 }	106 }

	107 PagePattern pat = PagePattern.create(urlPattern);

	108 if (pat == null) return false;

	109 map.put(urlPattern, new Info(pat, link));

	110 return true;

93 }	111 }

94	112 }

95 } // PageCandidatesMap

96	113

97 // All the known bad page param names.	114 // All the known bad page param names.

98 private static Set<String> sBadPageParamNames = null;	115 private static Set<String> sBadPageParamNames = null;

99	116

100 /**	117 /**

101 * Extracts page parameter candidates from the query part of given URL and a dds the associated	118 * Extracts page parameter candidates from the query part of given URL and a dds the associated

102 * links into pageCandidates which is keyed by page pattern.	119 * links into pageCandidates which is keyed by page pattern.

103 *	120 *

104 * A page parameter candidate is one where:	121 * A page parameter candidate is one where:

105 * - the name of a query name-value component is not one of sBadPageParamNam es, and	122 * - the name of a query name-value component is not one of sBadPageParamNam es, and

(...skipping 17 matching lines...) Expand all Loading...
123 if (!queryName.isEmpty() && !queryValue.isEmpty() &&	140 if (!queryName.isEmpty() && !queryValue.isEmpty() &&

124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName Bad(queryName)) {	141 StringUtil.isStringAllDigits(queryValue) && !isPageParamName Bad(queryName)) {

125 int value = StringUtil.toNumber(queryValue);	142 int value = StringUtil.toNumber(queryValue);

126 if (value >= 0) {	143 if (value >= 0) {

127 pageCandidates.add(	144 pageCandidates.add(

128 url.replaceQueryValue(queryName, queryValue, PAGE_PA RAM_PLACEHOLDER),	145 url.replaceQueryValue(queryName, queryValue, PAGE_PA RAM_PLACEHOLDER),

129 new LinkInfo(pageNum, value, posInAscendingNumbers)) ;	146 new LinkInfo(pageNum, value, posInAscendingNumbers)) ;

130 }	147 }

131 }	148 }

132 }	149 }

133 } // extractPageParamCandidatesFromQuery	150 }

134	151

135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit.	152 private static RegExp sDigitsRegExp = null; // Match at least 1 digit.

136	153

137 /**	154 /**

138 * Extracts page parameter candidates from the path part of given URL (witho ut query components)	155 * Extracts page parameter candidates from the path part of given URL (witho ut query components)

139 * and adds the associated links into pageCandidates which is keyed by page pattern.	156 * and adds the associated links into pageCandidates which is keyed by page pattern.

140 *	157 *

141 * A page parameter candidate is one where a path component contains consecu tive digits which	158 * A page parameter candidate is one where a path component contains consecu tive digits which

142 * can be converted to a plain number (>= 0).	159 * can be converted to a plain number (>= 0).

143 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page	160 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page

(...skipping 25 matching lines...) Expand all Loading...
169	186

170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat chEnd)) continue;	187 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat chEnd)) continue;

171	188

172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d));	189 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d));

173 if (value >= 0) {	190 if (value >= 0) {

174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER +	191 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER +

175 urlStr.substring(matchEnd),	192 urlStr.substring(matchEnd),

176 new LinkInfo(pageNum, value, posInAscendingNumbers));	193 new LinkInfo(pageNum, value, posInAscendingNumbers));

177 }	194 }

178 } // while there're matches	195 } // while there're matches

179 } // extractPageParamCandidatesFromPath	196 }

	197

	198 /**

	199 * Evaluates if the given list of LinkInfo's is a list of paging URLs:

	200 * - page numbers in list of LinkInfo's must be adjacent

	201 * - page numbers in list of ascending numbers must either

	202 * - be consecutive and form a page number sequence, or

	203 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b

	204 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must

	205 * match page pattern, and the only outlink must be 2nd or 3rd page.

	206 *

	207 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null.

	208 *

	209 * @param allLinkInfo the list of LinkInfo's to evaluate

	210 * @param pagePattern the URL pattern to use

	211 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	212 * @param firstPageUrl the URL of the PageInfo with mPageNum=1

	213 */

	214 private static PageParamInfo getPageParamInfo(PagePattern pagePattern,

	215 List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNu mbers,

	216 String firstPageUrl) {

	217 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) {

	218 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers);

	219 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null;

	220

	221 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo);

	222

	223 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page

	224 // number sequence.

	225 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null;

	226 if (!isPageNumberSequence(ascendingNumbers)) return null;

	227 PageParamInfo pageParamInfo = new PageParamInfo();

	228 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

	229 pageParamInfo.mFormula = linearFormula;

	230 for (LinkInfo link : allLinkInfo) {

	231 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum,

	232 ascendingNumbers.get(link.mPosInAscendingList).mUrl));

	233 }

	234 return pageParamInfo;

	235 }

	236

	237 // Most news articles have no more than 3 pages and the first page probably doesn't have

	238 // any page parameter. If the first page url matches the page pattern, we treat it as

	239 // the first page of this pattern.

	240 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) {

	241 final LinkInfo onlyLink = allLinkInfo.get(0);

	242 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 &&

	243 onlyLink.mPosInAscendingList == 1;

	244 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 &&

	245 onlyLink.mPosInAscendingList == 2 &&

	246 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3

	247 // elements; check if previous element is previous page.

	248 ascendingNumbers.get(1).mPageNum == 2;

	249 // 1 LinkInfo means ascendingNumbers has >= 1 element.

	250 if (ascendingNumbers.get(0).mPageNum == 1 &&

	251 (secondPageIsOutlink \|\| thirdPageIsOutlink) &&

	252 pagePattern.isPagingUrl(firstPageUrl)) {

	253 // Has valid PageParamInfo, create and populate it.

	254 PageParamInfo pageParamInfo = new PageParamInfo();

	255 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER;

	256 int coefficient;

	257 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum;

	258 if (delta == 0 \|\| delta == 1) {

	259 coefficient = 1;

	260 } else {

	261 coefficient = onlyLink.mPageParamValue;

	262 delta = 0;

	263 }

	264 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta);

	265 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl));

	266 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum,

	267 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) );

	268 return pageParamInfo;

	269 }

	270 }

	271

	272 return null;

	273 }

180	274

181 /**	275 /**

182 * Returns true if given name is blacklisted as a known bad page param name.	276 * Returns true if given name is blacklisted as a known bad page param name.

183 */	277 */

184 private static boolean isPageParamNameBad(String name) {	278 private static boolean isPageParamNameBad(String name) {

185 initBadPageParamNames();	279 initBadPageParamNames();

186 return sBadPageParamNames.contains(name.toLowerCase());	280 return sBadPageParamNames.contains(name.toLowerCase());

187 } // isPageParamNameBad	281 }

188	282

189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).	283 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).

190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.	284 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.

191	285

192 /**	286 /**

193 * Returns true if:	287 * Returns true if:

194 * - the digitStart to digitEnd of urlStr is the last path component, and	288 * - the digitStart to digitEnd of urlStr is the last path component, and

195 * - the entire path component is numeric, and	289 * - the entire path component is numeric, and

196 * - the previous path component is a bad page param name.	290 * - the previous path component is a bad page param name.

197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad	291 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad

198 * page param.	292 * page param.

199 */	293 */

200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart,	294 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,

201 int digitStart, int digitEnd) {	295 int digitEnd) {

202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.	296 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.

203 pathStart < digitStart - 1) { // Not the first path component.	297 pathStart < digitStart - 1) { // Not the first path component.

204 String postMatch = urlStr.substring(digitEnd).toLowerCase();	298 String postMatch = urlStr.substring(digitEnd).toLowerCase();

205 // Checks that this is the last path component, and trailing charact ers, if available,	299 // Checks that this is the last path component, and trailing charact ers, if available,

206 // are (s)htm(l) extensions.	300 // are (s)htm(l) extensions.

207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");	301 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");

208 if (sExtRegExp.test(postMatch)) {	302 if (sExtRegExp.test(postMatch)) {

209 // Entire component is numeric, get previous path component.	303 // Entire component is numeric, get previous path component.

210 if (sLastPathComponentRegExp == null) {	304 if (sLastPathComponentRegExp == null) {

211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;	305 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;

212 }	306 }

213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(	307 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(

214 urlStr.substring(pathStart + 1, digitStart));	308 urlStr.substring(pathStart + 1, digitStart));

215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&	309 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&

216 isPageParamNameBad(prevPathComponent.getGroup(1))) {	310 isPageParamNameBad(prevPathComponent.getGroup(1))) {

217 return true;	311 return true;

218 }	312 }

219 } // last numeric path component	313 } // last numeric path component

220 }	314 }

221	315

222 return false;	316 return false;

223 } // isLastNumericPathComponentBad	317 }

	318

	319 /**

	320 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of

	321 * PageParamInfo.PageInfo's are consecutive.

	322 *

	323 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated

	324 * by at most 1 plain text number which must represent the current page numb er in one of the

	325 * PageParamInfo.PageInfo's.

	326 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list

	327 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are

	328 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20...").

	329 *

	330 * Returns an int value that is a combination of bits:

	331 * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent

	332 * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive.

	333 *

	334 * @param allLinkInfo the list of LinkInfo's to evaluate

	335 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	336 */

	337 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo,

	338 List<PageParamInfo.PageInfo> ascendingNumbers) {

	339 int result = 0;

	340

	341 // Check if elements in allLinkInfo are adjacent or there's only 1 gap, i.e. the gap is

	342 // current page number represented in plain text.

	343 int firstPos = -1;

	344 int lastPos = -1;

	345 int gapPos = -1;

	346 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique.

	347 for (LinkInfo linkInfo : allLinkInfo) {

	348 final int currPos = linkInfo.mPosInAscendingList;

	349 if (lastPos == -1) {

	350 firstPos = currPos;

	351 } else if (currPos != lastPos + 1) {

	352 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6

	353 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not

	354 // adjacent.

	355 if (currPos <= lastPos \|\| currPos != lastPos + 2 \|\| gapPos != -1 ) return result;

	356 gapPos = currPos - 1;

	357 }

	358 // Make sure page param value, i.e. the numeric page parameter component in the URL, is unique.

	359 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result;

	360 lastPos = currPos;

	361 } // for all LinkInfo's

	362

	363 result \|= PAGE_NUM_ADJACENT_MASK;

	364

	365 // Now, determine if page numbers in ascendingNumbers are consecutive.

	366

	367 // First, handle the gap.

	368 if (gapPos != -1) {

	369 if (gapPos <= 0 \|\| gapPos >= ascendingNumbers.size() - 1) return resu lt;

	370 // The "gap" should represent current page number in plain text.

	371 // Check if its adjacent page numbers are consecutive.

	372 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected.

	373 // This can eliminate links affecting the number of items on a page.

	374 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum;

	375 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 &&

	376 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) {

	377 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	378 }

	379 return result;

	380 }

	381

	382 // There is no gap. Check if at least one of the following cases is sat isfied:

	383 // Case #1: "[1] [2] ..." or "1 [2] ... ".

	384 if ((firstPos == 0 \|\| firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 &&

	385 ascendingNumbers.get(1).mPageNum == 2) {

	386 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	387 }

	388 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern.

	389 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 &&

	390 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) {

	391 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	392 }

	393 // Case #3: "... [n-1] [n]" or "... [n - 1] n".

	394 final int numbersSize = ascendingNumbers.size();

	395 if ((lastPos == numbersSize - 1 \|\| lastPos == numbersSize - 2) &&

	396 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 ==

	397 ascendingNumbers.get(numbersSize - 1).mPageNum) {

	398 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	399 }

	400 // Case #4: "... [i-1] [i] [i+1] ...".

	401 for (int i = firstPos + 1; i < lastPos; i++) {

	402 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) {

	403 return result \| PAGE_NUM_CONSECUTIVE_MASK;

	404 }

	405 }

	406

	407 // Otherwise, there's no pair of consecutive values.

	408 return result;

	409 }

	410

	411 /**

	412 *

	413 * Determines if the list of LinkInfo's form a linear formula:

	414 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient or delta == 0).

	415 *

	416 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2

	417 * LinkInfo's, and then validated against the remaining LinkInfo's.

	418 * The order of page numbers doesn't matter.

	419 *

	420 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta, if the page

	421 * parameter formula could be determined. Otherwise, returns null.

	422 *

	423 * @param allLinkInfo the list of LinkInfo's to evaluate

	424 */

	425 private static PageParamInfo.LinearFormula getPageParamLinearFormula(

	426 List<LinkInfo> allLinkInfo) {

	427 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null;

	428

	429 final LinkInfo firstLink = allLinkInfo.get(0);

	430 final LinkInfo secondLink = allLinkInfo.get(1);

	431

	432 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) {

	433 return null;

	434 }

	435

	436 int deltaX = secondLink.mPageNum - firstLink.mPageNum;

	437 if (deltaX == 0) return null;

	438

	439 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue;

	440 int coefficient = deltaY / deltaX;

	441 if (coefficient == 0) return null;

	442

	443 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ;

	444 if (delta != 0 && delta != -coefficient) return null;

	445

	446 // Check if the remaining elements are on the same linear map.

	447 for (int i = 2; i < allLinkInfo.size(); i++) {

	448 final LinkInfo link = allLinkInfo.get(i);

	449 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null;

	450 }

	451

	452 return new PageParamInfo.LinearFormula(coefficient, delta);

	453 }

	454

	455 /**

	456 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on

	457 * a pipeline of rules:

	458 * - first PageInfo must have a URL unless it is the first page

	459 * - there's only one plain number without URL in list

	460 * - if only two pages, they must be siblings - 2nd page number must follow 1st

	461 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be

	462 * head/tail or have URLs.

	463 *

	464 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's

	465 */

	466 private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> asc endingNumbers) {

	467 if (ascendingNumbers.size() <= 1) return false;

	468

	469 // The first one must have a URL unless it is the first page.

	470 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0);

	471 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false;

	472

	473 // There's only one plain number without URL in ascending numbers group.

	474 boolean hasPlainNum = false;

	475 for (PageParamInfo.PageInfo page : ascendingNumbers) {

	476 if (page.mUrl.isEmpty()) {

	477 if (hasPlainNum) return false;

	478 hasPlainNum = true;

	479 }

	480 }

	481

	482 // If there are only two pages, they must be siblings.

	483 if (ascendingNumbers.size() == 2) {

	484 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum;

	485 }

	486

	487 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e.

	488 for (int i = 1; i < ascendingNumbers.size(); i++) {

	489 // If two adjacent numbers are not consecutive, we accept them only when:

	490 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n].

	491 // 2) both of them have URLs.

	492 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i);

	493 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1);

	494 if (currPage.mPageNum - prevPage.mPageNum != 1) {

	495 if (i != 1 && i != ascendingNumbers.size() - 1) return false;

	496 if (currPage.mUrl.isEmpty() \|\| prevPage.mUrl.isEmpty()) return f alse;

	497 }

	498 }

	499

	500 return true;

	501 }

224	502

225 /**	503 /**

226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in	504 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in

227 * alphabetical order.	505 * alphabetical order.

228 */	506 */

229 private static void initBadPageParamNames() {	507 private static void initBadPageParamNames() {

230 if (sBadPageParamNames != null) return;	508 if (sBadPageParamNames != null) return;

231	509

232 sBadPageParamNames = new HashSet<String>();	510 sBadPageParamNames = new HashSet<String>();

233 sBadPageParamNames.add("baixar-gratis");	511 sBadPageParamNames.add("baixar-gratis");

(...skipping 18 matching lines...) Expand all Loading...
252 sBadPageParamNames.add("search_keyword");	530 sBadPageParamNames.add("search_keyword");

253 sBadPageParamNames.add("search_query");	531 sBadPageParamNames.add("search_query");

254 sBadPageParamNames.add("sortby");	532 sBadPageParamNames.add("sortby");

255 sBadPageParamNames.add("subscriptions");	533 sBadPageParamNames.add("subscriptions");

256 sBadPageParamNames.add("tag");	534 sBadPageParamNames.add("tag");

257 sBadPageParamNames.add("tags");	535 sBadPageParamNames.add("tags");

258 sBadPageParamNames.add("video");	536 sBadPageParamNames.add("video");

259 sBadPageParamNames.add("videos");	537 sBadPageParamNames.add("videos");

260 sBadPageParamNames.add("w");	538 sBadPageParamNames.add("w");

261 sBadPageParamNames.add("wiki");	539 sBadPageParamNames.add("wiki");

262 } // initBadPageParamNames	540 }

263	541

264 }	542 }

OLD	NEW

« no previous file with comments | « no previous file | java/org/chromium/distiller/PagePattern.java » ('j') | java/org/chromium/distiller/PagePattern.java » ('J')