OLD | NEW |
---|---|
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
9 | 9 |
10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
11 import java.util.Arrays; | 11 import java.util.Arrays; |
(...skipping 10 matching lines...) Expand all Loading... | |
22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The | 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The |
23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the | 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the |
24 * whole content, called "single page". | 24 * whole content, called "single page". |
25 * | 25 * |
26 * Definitions: | 26 * Definitions: |
27 * A single page document is a document that contains the whole content. | 27 * A single page document is a document that contains the whole content. |
28 * A paging document is one of the partial pages. | 28 * A paging document is one of the partial pages. |
29 * "digital" means the text contains only digits. | 29 * "digital" means the text contains only digits. |
30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder | 30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder |
31 * (PAGE_PARAM_PLACEHOLDER). | 31 * (PAGE_PARAM_PLACEHOLDER). |
32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat tern is | 32 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is |
33 * "http: *www.foo.com/a/b-[*!].html". | 33 * "http://www.foo.com/a/b-[*!].html". |
34 * | 34 * |
35 * This class extracts the page parameter from a document's outlinks. | 35 * This class extracts the page parameter from a document's outlinks. |
36 * The basic idea: | 36 * The basic idea: |
37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text. | 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text. |
38 * #2. For each group, determine the relationship between digital anchor texts and digital parts | 38 * #2. For each group, determine the relationship between digital anchor texts and digital parts |
39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear | 39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear |
40 * map from its digital anchor text, we guess the part is the page parame ter of the URL. | 40 * map from its digital anchor text, we guess the part is the page parame ter of the URL. |
41 * | 41 * |
42 * As an example, consider a document http: *a/b?c=1&p=10, which contains the fo llowing digital | 42 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo llowing digital |
43 * outlinks: | 43 * outlinks: |
44 * <a href=http: *a/b?c=1&p=20>3</a> | 44 * <a href=http://a/b?c=1&p=20>3</a> |
45 * <a href=http: *a/b?c=1&p=30>4</a> | 45 * <a href=http://a/b?c=1&p=30>4</a> |
46 * <a href=http: *a/b?c=1&p=40>5</a> | 46 * <a href=http://a/b?c=1&p=40>5</a> |
47 * <a href=http: *a/b?c=1&p=all>single page</a> | 47 * <a href=http://a/b?c=1&p=all>single page</a> |
48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so | 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!]. | 49 * guesses it is the page parameter. The associated page pattern is http://a/b? c=1&p=[*!]. |
50 * Then, this class extracts the single page based on page parameter info. The single page url is | 50 * Then, this class extracts the single page based on page parameter info. The single page url is |
51 * http: *a/b?c=1&p=all. | 51 * http://a/b?c=1&p=all. |
52 */ | 52 */ |
53 public class PageParameterDetector { | 53 public class PageParameterDetector { |
54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 54 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
55 | |
56 static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | |
57 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length( ); | |
58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; | |
59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; | |
60 | |
61 /** | |
62 * The interface that page pattern handlers must implement to detect page pa rameter from | |
63 * potential pagination URLs. | |
64 */ | |
65 interface PagePattern { | |
66 /** | |
67 * Returns the string of the URL page pattern. | |
68 */ | |
69 String toString(); | |
70 | |
71 /** | |
72 * Returns the page number extracted from the URL during creation of obje ct that implements | |
73 * this interface. | |
74 */ | |
75 int getPageNumber(); | |
76 | |
77 /** | |
78 * Validates this page pattern according to the current document URL thr ough a pipeline of | |
79 * rules. | |
80 * | |
81 * Returns true if page pattern is valid. | |
82 * | |
83 * @param docUrl the current document URL | |
84 */ | |
85 boolean isValidFor(ParsedUrl docUrl); | |
86 | |
87 /** | |
88 * Returns true if a URL matches this page pattern based on a pipeline o f rules. | |
89 * | |
90 * @param url the URL to evaluate | |
91 */ | |
92 boolean isPagingUrl(String url); | |
93 } | |
55 | 94 |
56 /** | 95 /** |
57 * Stores information about the link (anchor) after the page parameter is de tected: | 96 * Stores information about the link (anchor) after the page parameter is de tected: |
58 * - the page number (as represented by the original plain text) for the lin k | 97 * - the page number (as represented by the original plain text) for the lin k |
59 * - the original page parameter numeric component in the URL (this componen t would be replaced | 98 * - the original page parameter numeric component in the URL (this componen t would be replaced |
60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | 99 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
61 * - the position of this link in the list of ascending numbers. | 100 * - the position of this link in the list of ascending numbers. |
62 */ | 101 */ |
63 static class LinkInfo { | 102 static class LinkInfo { |
64 private int mPageNum; | 103 private int mPageNum; |
65 private int mPageParamValue; | 104 private int mPageParamValue; |
66 private int mPosInAscendingList; | 105 private int mPosInAscendingList; |
67 | 106 |
68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { | 107 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { |
69 mPageNum = pageNum; | 108 mPageNum = pageNum; |
70 mPageParamValue = pageParamValue; | 109 mPageParamValue = pageParamValue; |
71 mPosInAscendingList = posInAscendingList; | 110 mPosInAscendingList = posInAscendingList; |
72 } | 111 } |
73 } // LinkInfo | 112 } |
74 | 113 |
75 /** | 114 /** |
76 * Stores a map of URL pattern to its associated list of LinkInfo's. | 115 * Stores a map of URL pattern to its associated list of LinkInfo's. |
77 */ | 116 */ |
78 private static class PageCandidatesMap { | 117 private static class PageCandidatesMap { |
79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List <LinkInfo>>(); | 118 private static class Info { |
119 private final PagePattern mPattern; | |
120 private final List<LinkInfo> mLinks; | |
121 | |
122 Info(PagePattern pattern, LinkInfo link) { | |
123 mPattern = pattern; | |
124 mLinks = new ArrayList<LinkInfo>(); | |
125 mLinks.add(link); | |
126 } | |
127 } | |
128 | |
129 private final Map<String, Info> map = new HashMap<String, Info>(); | |
80 | 130 |
81 /** | 131 /** |
82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds | 132 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds |
83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry. | 133 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry. |
84 */ | 134 */ |
85 private void add(String urlPattern, LinkInfo link) { | 135 private void add(PagePattern pattern, LinkInfo link) { |
86 if (map.containsKey(urlPattern)) { | 136 final String patternStr = pattern.toString(); |
87 map.get(urlPattern).add(link); | 137 if (map.containsKey(patternStr)) { |
138 map.get(patternStr).mLinks.add(link); | |
88 } else { | 139 } else { |
89 List<LinkInfo> links = new ArrayList<LinkInfo>(); | 140 map.put(patternStr, new Info(pattern, link)); |
90 links.add(link); | |
91 map.put(urlPattern, links); | |
92 } | 141 } |
93 } | 142 } |
94 | 143 } |
95 } // PageCandidatesMap | |
96 | 144 |
97 // All the known bad page param names. | 145 // All the known bad page param names. |
98 private static Set<String> sBadPageParamNames = null; | 146 private static Set<String> sBadPageParamNames = null; |
99 | 147 |
100 /** | 148 /** |
101 * Extracts page parameter candidates from the query part of given URL and a dds the associated | 149 * Extracts page parameter candidates from the query part of given URL and a dds the associated |
102 * links into pageCandidates which is keyed by page pattern. | 150 * links into pageCandidates which is keyed by page pattern. |
103 * | 151 * |
104 * A page parameter candidate is one where: | 152 * A page parameter candidate is one where: |
105 * - the name of a query name-value component is not one of sBadPageParamNam es, and | 153 * - the name of a query name-value component is not one of sBadPageParamNam es, and |
106 * - the value of the query component is a plain number (>= 0). | 154 * - the value of the query component is a plain number (>= 0). |
107 * E.g. a URL query with 3 plain number query values will generate 3 URL pag e patterns with 3 | 155 * E.g. a URL query with 3 plain number query values will generate 3 URL pag e patterns with 3 |
108 * LinkInfo's, and hence 3 page parameter candidates. | 156 * LinkInfo's, and hence 3 page parameter candidates. |
109 * | 157 * |
110 * @param url ParsedUrl of the URL to process | 158 * @param url ParsedUrl of the URL to process |
111 * @param pageNum the page number as represented in original plain text | 159 * @param pageNum the page number as represented in original plain text |
112 * @param posInAscendingNumbers position of this page number in the list of ascending numbers | 160 * @param posInAscendingNumbers position of this page number in the list of ascending numbers |
113 * @param pageCandidates the map of URL pattern to its associated list of Li nkInfo's | 161 * @param pageCandidates the map of URL pattern to its associated list of Li nkInfo's |
114 */ | 162 */ |
115 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p ageNum, | 163 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p ageNum, |
116 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 164 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
117 String[][] queryParams = url.getQueryParams(); | 165 String[][] queryParams = url.getQueryParams(); |
118 if (queryParams.length == 0) return; // No query. | 166 if (queryParams.length == 0) return; // No query. |
119 | 167 |
120 for (String[] nameValue : queryParams) { | 168 for (String[] nameValue : queryParams) { |
121 final String queryName = nameValue[0]; | 169 PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0] , nameValue[1]); |
122 final String queryValue = nameValue[1]; | 170 if (pattern != null) { |
123 if (!queryName.isEmpty() && !queryValue.isEmpty() && | 171 pageCandidates.add(pattern, |
124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName Bad(queryName)) { | 172 new LinkInfo(pageNum, pattern.getPageNumber(), posInAsce ndingNumbers)); |
125 int value = StringUtil.toNumber(queryValue); | |
126 if (value >= 0) { | |
127 pageCandidates.add( | |
128 url.replaceQueryValue(queryName, queryValue, PAGE_PA RAM_PLACEHOLDER), | |
129 new LinkInfo(pageNum, value, posInAscendingNumbers)) ; | |
130 } | |
131 } | 173 } |
132 } | 174 } |
133 } // extractPageParamCandidatesFromQuery | 175 } |
134 | 176 |
135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. | 177 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
136 | 178 |
137 /** | 179 /** |
138 * Extracts page parameter candidates from the path part of given URL (witho ut query components) | 180 * Extracts page parameter candidates from the path part of given URL (witho ut query components) |
139 * and adds the associated links into pageCandidates which is keyed by page pattern. | 181 * and adds the associated links into pageCandidates which is keyed by page pattern. |
140 * | 182 * |
141 * A page parameter candidate is one where a path component contains consecu tive digits which | 183 * A page parameter candidate is one where a path component contains consecu tive digits which |
142 * can be converted to a plain number (>= 0). | 184 * can be converted to a plain number (>= 0). |
143 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page | 185 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page |
(...skipping 15 matching lines...) Expand all Loading... | |
159 final String urlStr = url.toString(); | 201 final String urlStr = url.toString(); |
160 final int pathStart = url.getOrigin().length(); | 202 final int pathStart = url.getOrigin().length(); |
161 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi" ); | 203 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi" ); |
162 sDigitsRegExp.setLastIndex(pathStart); | 204 sDigitsRegExp.setLastIndex(pathStart); |
163 while (true) { | 205 while (true) { |
164 MatchResult match = sDigitsRegExp.exec(urlStr); | 206 MatchResult match = sDigitsRegExp.exec(urlStr); |
165 if (match == null) break; | 207 if (match == null) break; |
166 | 208 |
167 final int matchEnd = sDigitsRegExp.getLastIndex(); | 209 final int matchEnd = sDigitsRegExp.getLastIndex(); |
168 final int matchStart = matchEnd - match.getGroup(1).length(); | 210 final int matchStart = matchEnd - match.getGroup(1).length(); |
169 | 211 PagePattern pattern = PathComponentPagePattern.create(url, pathStart , matchStart, |
170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat chEnd)) continue; | 212 matchEnd); |
171 | 213 if (pattern != null) { |
172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); | 214 pageCandidates.add(pattern, |
173 if (value >= 0) { | 215 new LinkInfo(pageNum, pattern.getPageNumber(), posInAsce ndingNumbers)); |
174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + | |
175 urlStr.substring(matchEnd), | |
176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | |
177 } | 216 } |
178 } // while there're matches | 217 } // while there're matches |
179 } // extractPageParamCandidatesFromPath | 218 } |
219 | |
220 /** | |
221 * Evaluates if the given list of LinkInfo's is a list of paging URLs: | |
222 * - page numbers in list of LinkInfo's must be adjacent | |
223 * - page numbers in list of ascending numbers must either | |
224 * - be consecutive and form a page number sequence, or | |
225 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b | |
226 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must | |
227 * match page pattern, and the only outlink must be 2nd or 3rd page. | |
228 * | |
229 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. | |
230 * | |
231 * @param allLinkInfo the list of LinkInfo's to evaluate | |
232 * @param pagePattern the URL pattern to use | |
233 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
234 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 | |
235 */ | |
236 private static PageParamInfo getPageParamInfo(PagePattern pagePattern, | |
cjhopman
2015/04/16 21:58:32
I feel like we have to many SomethingInfo types he
cjhopman
2015/04/16 21:58:32
Would it make sense for this function to be in the
kuan
2015/04/20 23:11:13
i'm bad at names, and ran out of them :(
kuan
2015/04/20 23:11:13
Done. to avoid cross-access between PageParameter
| |
237 List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNu mbers, | |
238 String firstPageUrl) { | |
239 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { | |
240 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers); | |
241 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; | |
242 | |
243 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo); | |
244 | |
245 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page | |
246 // number sequence. | |
247 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null; | |
248 if (!isPageNumberSequence(ascendingNumbers)) return null; | |
249 PageParamInfo pageParamInfo = new PageParamInfo(); | |
cjhopman
2015/04/16 21:58:32
Could we move some of the logic of creating the pa
kuan
2015/04/20 23:11:13
Done.
| |
250 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
251 pageParamInfo.mFormula = linearFormula; | |
252 for (LinkInfo link : allLinkInfo) { | |
253 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum, | |
254 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); | |
255 } | |
256 return pageParamInfo; | |
257 } | |
258 | |
259 // Most news articles have no more than 3 pages and the first page probably doesn't have | |
260 // any page parameter. If the first page url matches the page pattern, we treat it as | |
261 // the first page of this pattern. | |
262 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { | |
263 final LinkInfo onlyLink = allLinkInfo.get(0); | |
264 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && | |
265 onlyLink.mPosInAscendingList == 1; | |
266 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && | |
267 onlyLink.mPosInAscendingList == 2 && | |
268 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3 | |
269 // elements; check if previous element is previous page. | |
270 ascendingNumbers.get(1).mPageNum == 2; | |
271 // 1 LinkInfo means ascendingNumbers has >= 1 element. | |
272 if (ascendingNumbers.get(0).mPageNum == 1 && | |
273 (secondPageIsOutlink || thirdPageIsOutlink) && | |
274 pagePattern.isPagingUrl(firstPageUrl)) { | |
275 // Has valid PageParamInfo, create and populate it. | |
276 PageParamInfo pageParamInfo = new PageParamInfo(); | |
277 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
278 int coefficient; | |
279 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; | |
280 if (delta == 0 || delta == 1) { | |
281 coefficient = 1; | |
282 } else { | |
283 coefficient = onlyLink.mPageParamValue; | |
284 delta = 0; | |
285 } | |
286 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta); | |
287 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl)); | |
288 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum, | |
289 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) ); | |
290 return pageParamInfo; | |
291 } | |
292 } | |
293 | |
294 return null; | |
295 } | |
180 | 296 |
181 /** | 297 /** |
182 * Returns true if given name is blacklisted as a known bad page param name. | 298 * Returns true if given name is blacklisted as a known bad page param name. |
183 */ | 299 */ |
184 private static boolean isPageParamNameBad(String name) { | 300 static boolean isPageParamNameBad(String name) { |
185 initBadPageParamNames(); | 301 initBadPageParamNames(); |
186 return sBadPageParamNames.contains(name.toLowerCase()); | 302 return sBadPageParamNames.contains(name.toLowerCase()); |
187 } // isPageParamNameBad | 303 } |
188 | 304 |
189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | 305 /** |
190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | 306 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of |
191 | 307 * PageParamInfo.PageInfo's are consecutive. |
192 /** | 308 * |
193 * Returns true if: | 309 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated |
194 * - the digitStart to digitEnd of urlStr is the last path component, and | 310 * by at most 1 plain text number which must represent the current page numb er in one of the |
195 * - the entire path component is numeric, and | 311 * PageParamInfo.PageInfo's. |
196 * - the previous path component is a bad page param name. | 312 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list |
197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad | 313 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are |
198 * page param. | 314 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20..."). |
199 */ | 315 * |
200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 316 * Returns an int value that is a combination of bits: |
201 int digitStart, int digitEnd) { | 317 * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent |
202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | 318 * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive. |
203 pathStart < digitStart - 1) { // Not the first path component. | 319 * |
204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | 320 * @param allLinkInfo the list of LinkInfo's to evaluate |
205 // Checks that this is the last path component, and trailing charact ers, if available, | 321 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
206 // are (s)htm(l) extensions. | 322 */ |
207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); | 323 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, |
208 if (sExtRegExp.test(postMatch)) { | 324 List<PageParamInfo.PageInfo> ascendingNumbers) { |
209 // Entire component is numeric, get previous path component. | 325 int result = 0; |
210 if (sLastPathComponentRegExp == null) { | 326 |
211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; | 327 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is |
212 } | 328 // current page number represented in plain text. |
213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | 329 int firstPos = -1; |
214 urlStr.substring(pathStart + 1, digitStart)); | 330 int lastPos = -1; |
215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && | 331 int gapPos = -1; |
216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | 332 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique. |
217 return true; | 333 for (LinkInfo linkInfo : allLinkInfo) { |
218 } | 334 final int currPos = linkInfo.mPosInAscendingList; |
219 } // last numeric path component | 335 if (lastPos == -1) { |
220 } | 336 firstPos = currPos; |
221 | 337 } else if (currPos != lastPos + 1) { |
222 return false; | 338 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 |
223 } // isLastNumericPathComponentBad | 339 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not |
340 // adjacent. | |
341 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result; | |
342 gapPos = currPos - 1; | |
343 } | |
344 // Make sure page param value, i.e. page number represented in plain text, is unique. | |
345 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; | |
346 lastPos = currPos; | |
347 } // for all LinkInfo's | |
348 | |
349 result |= PAGE_NUM_ADJACENT_MASK; | |
350 | |
351 // Now, determine if page numbers in ascendingNumbers are consecutive. | |
352 | |
353 // First, handle the gap. | |
354 if (gapPos != -1) { | |
355 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt; | |
356 // The "gap" should represent current page number in plain text. | |
357 // Check if its adjacent page numbers are consecutive. | |
358 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. | |
359 // This can eliminate links affecting the number of items on a page. | |
360 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; | |
361 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && | |
362 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { | |
363 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
364 } | |
365 return result; | |
366 } | |
367 | |
368 // There is no gap. Check if at least one of the following cases is sat isfied: | |
369 // Case #1: "[1] [2] ..." or "1 [2] ... ". | |
370 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && | |
371 ascendingNumbers.get(1).mPageNum == 2) { | |
372 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
373 } | |
374 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. | |
375 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && | |
376 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) { | |
377 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
378 } | |
379 // Case #3: "... [n-1] [n]" or "... [n - 1] n". | |
380 final int numbersSize = ascendingNumbers.size(); | |
381 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && | |
382 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == | |
383 ascendingNumbers.get(numbersSize - 1).mPageNum) { | |
384 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
385 } | |
386 // Case #4: "... [i-1] [i] [i+1] ...". | |
387 for (int i = firstPos + 1; i < lastPos; i++) { | |
388 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) { | |
389 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
390 } | |
391 } | |
392 | |
393 // Otherwise, there's no pair of consecutive values. | |
394 return result; | |
395 } | |
396 | |
397 /** | |
398 * | |
399 * Determines if the list of LinkInfo's form a linear formula: | |
400 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0). | |
401 * | |
402 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2 | |
403 * LinkInfo's, and then validated against the remaining LinkInfo's. | |
404 * The order of page numbers doesn't matter. | |
405 * | |
406 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta , if the page | |
407 * parameter formula could be determined. Otherwise, returns null. | |
408 * | |
409 * @param allLinkInfo the list of LinkInfo's to evaluate | |
410 */ | |
411 // TODO(kuan): As this gets rolled out, reassess the necessity of non-1 coefficient support. | |
412 private static PageParamInfo.LinearFormula getPageParamLinearFormula( | |
413 List<LinkInfo> allLinkInfo) { | |
414 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; | |
415 | |
416 final LinkInfo firstLink = allLinkInfo.get(0); | |
417 final LinkInfo secondLink = allLinkInfo.get(1); | |
418 | |
419 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) { | |
420 return null; | |
421 } | |
422 | |
423 int deltaX = secondLink.mPageNum - firstLink.mPageNum; | |
424 if (deltaX == 0) return null; | |
425 | |
426 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; | |
427 int coefficient = deltaY / deltaX; | |
428 if (coefficient == 0) return null; | |
429 | |
430 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ; | |
431 if (delta != 0 && delta != -coefficient) return null; | |
432 | |
433 // Check if the remaining elements are on the same linear map. | |
434 for (int i = 2; i < allLinkInfo.size(); i++) { | |
435 final LinkInfo link = allLinkInfo.get(i); | |
436 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null; | |
437 } | |
438 | |
439 return new PageParamInfo.LinearFormula(coefficient, delta); | |
440 } | |
441 | |
442 /** | |
443 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on | |
444 * a pipeline of rules: | |
445 * - first PageInfo must have a URL unless it is the first page | |
446 * - there's only one plain number without URL in list | |
447 * - if only two pages, they must be siblings - 2nd page number must follow 1st | |
448 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be | |
449 * head/tail or have URLs. | |
450 * | |
451 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
452 */ | |
453 private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> asc endingNumbers) { | |
454 if (ascendingNumbers.size() <= 1) return false; | |
455 | |
456 // The first one must have a URL unless it is the first page. | |
457 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); | |
458 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; | |
459 | |
460 // There's only one plain number without URL in ascending numbers group. | |
461 boolean hasPlainNum = false; | |
462 for (PageParamInfo.PageInfo page : ascendingNumbers) { | |
463 if (page.mUrl.isEmpty()) { | |
464 if (hasPlainNum) return false; | |
465 hasPlainNum = true; | |
466 } | |
467 } | |
468 | |
469 // If there are only two pages, they must be siblings. | |
470 if (ascendingNumbers.size() == 2) { | |
471 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; | |
472 } | |
473 | |
474 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e. | |
475 for (int i = 1; i < ascendingNumbers.size(); i++) { | |
476 // If two adjacent numbers are not consecutive, we accept them only when: | |
477 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n]. | |
478 // 2) both of them have URLs. | |
479 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); | |
480 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); | |
481 if (currPage.mPageNum - prevPage.mPageNum != 1) { | |
482 if (i != 1 && i != ascendingNumbers.size() - 1) return false; | |
483 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse; | |
484 } | |
485 } | |
486 | |
487 return true; | |
488 } | |
489 | |
490 /** | |
491 * Returns true if given string can be converted to a number >= 0. | |
492 */ | |
493 static boolean isPlainNumber(String str) { | |
494 return StringUtil.toNumber(str) >= 0; | |
495 } | |
224 | 496 |
225 /** | 497 /** |
226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in | 498 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
227 * alphabetical order. | 499 * alphabetical order. |
228 */ | 500 */ |
229 private static void initBadPageParamNames() { | 501 private static void initBadPageParamNames() { |
230 if (sBadPageParamNames != null) return; | 502 if (sBadPageParamNames != null) return; |
231 | 503 |
232 sBadPageParamNames = new HashSet<String>(); | 504 sBadPageParamNames = new HashSet<String>(); |
233 sBadPageParamNames.add("baixar-gratis"); | 505 sBadPageParamNames.add("baixar-gratis"); |
(...skipping 18 matching lines...) Expand all Loading... | |
252 sBadPageParamNames.add("search_keyword"); | 524 sBadPageParamNames.add("search_keyword"); |
253 sBadPageParamNames.add("search_query"); | 525 sBadPageParamNames.add("search_query"); |
254 sBadPageParamNames.add("sortby"); | 526 sBadPageParamNames.add("sortby"); |
255 sBadPageParamNames.add("subscriptions"); | 527 sBadPageParamNames.add("subscriptions"); |
256 sBadPageParamNames.add("tag"); | 528 sBadPageParamNames.add("tag"); |
257 sBadPageParamNames.add("tags"); | 529 sBadPageParamNames.add("tags"); |
258 sBadPageParamNames.add("video"); | 530 sBadPageParamNames.add("video"); |
259 sBadPageParamNames.add("videos"); | 531 sBadPageParamNames.add("videos"); |
260 sBadPageParamNames.add("w"); | 532 sBadPageParamNames.add("w"); |
261 sBadPageParamNames.add("wiki"); | 533 sBadPageParamNames.add("wiki"); |
262 } // initBadPageParamNames | 534 } |
263 | 535 |
264 } | 536 } |
OLD | NEW |