| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
| 8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
| 9 | 9 |
| 10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
| (...skipping 11 matching lines...) Expand all Loading... |
| 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The | 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The |
| 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w
hich contains the | 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w
hich contains the |
| 24 * whole content, called "single page". | 24 * whole content, called "single page". |
| 25 * | 25 * |
| 26 * Definitions: | 26 * Definitions: |
| 27 * A single page document is a document that contains the whole content. | 27 * A single page document is a document that contains the whole content. |
| 28 * A paging document is one of the partial pages. | 28 * A paging document is one of the partial pages. |
| 29 * "digital" means the text contains only digits. | 29 * "digital" means the text contains only digits. |
| 30 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder | 30 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder |
| 31 * (PAGE_PARAM_PLACEHOLDER). | 31 * (PAGE_PARAM_PLACEHOLDER). |
| 32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat
tern is | 32 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat
tern is |
| 33 * "http: *www.foo.com/a/b-[*!].html". | 33 * "http://www.foo.com/a/b-[*!].html". |
| 34 * | 34 * |
| 35 * This class extracts the page parameter from a document's outlinks. | 35 * This class extracts the page parameter from a document's outlinks. |
| 36 * The basic idea: | 36 * The basic idea: |
| 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. | 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. |
| 38 * #2. For each group, determine the relationship between digital anchor texts
and digital parts | 38 * #2. For each group, determine the relationship between digital anchor texts
and digital parts |
| 39 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear | 39 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear |
| 40 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. | 40 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. |
| 41 * | 41 * |
| 42 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo
llowing digital | 42 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo
llowing digital |
| 43 * outlinks: | 43 * outlinks: |
| 44 * <a href=http://a/b?c=1&p=20>3</a> | 44 * <a href=http://a/b?c=1&p=20>3</a> |
| 45 * <a href=http://a/b?c=1&p=30>4</a> | 45 * <a href=http://a/b?c=1&p=30>4</a> |
| 46 * <a href=http://a/b?c=1&p=40>5</a> | 46 * <a href=http://a/b?c=1&p=40>5</a> |
| 47 * <a href=http://a/b?c=1&p=all>single page</a> | 47 * <a href=http://a/b?c=1&p=all>single page</a> |
| 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so | 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so |
| 49 * guesses it is the page parameter. The associated page pattern is http://a/b?
c=1&p=[*!]. | 49 * guesses it is the page parameter. The associated page pattern is http://a/b?
c=1&p=[*!]. |
| 50 * Then, this class extracts the single page based on page parameter info. The
single page url is | 50 * Then, this class extracts the single page based on page parameter info. The
single page url is |
| 51 * http://a/b?c=1&p=all. | 51 * http://a/b?c=1&p=all. |
| 52 */ | 52 */ |
| 53 public class PageParameterDetector { | 53 public class PageParameterDetector { |
| 54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 54 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
| 55 |
| 56 static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
| 57 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(
); |
| 58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; |
| 59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; |
| 55 | 60 |
| 56 /** | 61 /** |
| 57 * Stores information about the link (anchor) after the page parameter is de
tected: | 62 * Stores information about the link (anchor) after the page parameter is de
tected: |
| 58 * - the page number (as represented by the original plain text) for the lin
k | 63 * - the page number (as represented by the original plain text) for the lin
k |
| 59 * - the original page parameter numeric component in the URL (this componen
t would be replaced | 64 * - the original page parameter numeric component in the URL (this componen
t would be replaced |
| 60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | 65 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
| 61 * - the position of this link in the list of ascending numbers. | 66 * - the position of this link in the list of ascending numbers. |
| 62 */ | 67 */ |
| 63 static class LinkInfo { | 68 static class LinkInfo { |
| 64 private int mPageNum; | 69 private int mPageNum; |
| 65 private int mPageParamValue; | 70 private int mPageParamValue; |
| 66 private int mPosInAscendingList; | 71 private int mPosInAscendingList; |
| 67 | 72 |
| 68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { | 73 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { |
| 69 mPageNum = pageNum; | 74 mPageNum = pageNum; |
| 70 mPageParamValue = pageParamValue; | 75 mPageParamValue = pageParamValue; |
| 71 mPosInAscendingList = posInAscendingList; | 76 mPosInAscendingList = posInAscendingList; |
| 72 } | 77 } |
| 73 } // LinkInfo | 78 } |
| 74 | 79 |
| 75 /** | 80 /** |
| 76 * Stores a map of URL pattern to its associated list of LinkInfo's. | 81 * Stores a map of URL pattern to its associated list of LinkInfo's. |
| 77 */ | 82 */ |
| 78 private static class PageCandidatesMap { | 83 private static class PageCandidatesMap { |
| 79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List
<LinkInfo>>(); | 84 private static class Info { |
| 85 private final PagePattern mPattern; |
| 86 private final List<LinkInfo> mLinks; |
| 87 |
| 88 Info(PagePattern pattern, LinkInfo link) { |
| 89 mPattern = pattern; |
| 90 mLinks = new ArrayList<LinkInfo>(); |
| 91 mLinks.add(link); |
| 92 } |
| 93 } |
| 94 |
| 95 private final Map<String, Info> map = new HashMap<String, Info>(); |
| 80 | 96 |
| 81 /** | 97 /** |
| 82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al
ready exists, adds | 98 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al
ready exists, adds |
| 83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent
ry. | 99 * the link to the list of LinkInfo's. Otherwise, creates a new map ent
ry. |
| 100 * Returns true if addition is successful. |
| 84 */ | 101 */ |
| 85 private void add(String urlPattern, LinkInfo link) { | 102 private boolean add(String urlPattern, LinkInfo link) { |
| 86 if (map.containsKey(urlPattern)) { | 103 if (map.containsKey(urlPattern)) { |
| 87 map.get(urlPattern).add(link); | 104 map.get(urlPattern).mLinks.add(link); |
| 88 } else { | 105 return true; |
| 89 List<LinkInfo> links = new ArrayList<LinkInfo>(); | |
| 90 links.add(link); | |
| 91 map.put(urlPattern, links); | |
| 92 } | 106 } |
| 107 PagePattern pat = PagePattern.create(urlPattern); |
| 108 if (pat == null) return false; |
| 109 map.put(urlPattern, new Info(pat, link)); |
| 110 return true; |
| 93 } | 111 } |
| 94 | 112 } |
| 95 } // PageCandidatesMap | |
| 96 | 113 |
| 97 // All the known bad page param names. | 114 // All the known bad page param names. |
| 98 private static Set<String> sBadPageParamNames = null; | 115 private static Set<String> sBadPageParamNames = null; |
| 99 | 116 |
| 100 /** | 117 /** |
| 101 * Extracts page parameter candidates from the query part of given URL and a
dds the associated | 118 * Extracts page parameter candidates from the query part of given URL and a
dds the associated |
| 102 * links into pageCandidates which is keyed by page pattern. | 119 * links into pageCandidates which is keyed by page pattern. |
| 103 * | 120 * |
| 104 * A page parameter candidate is one where: | 121 * A page parameter candidate is one where: |
| 105 * - the name of a query name-value component is not one of sBadPageParamNam
es, and | 122 * - the name of a query name-value component is not one of sBadPageParamNam
es, and |
| (...skipping 17 matching lines...) Expand all Loading... |
| 123 if (!queryName.isEmpty() && !queryValue.isEmpty() && | 140 if (!queryName.isEmpty() && !queryValue.isEmpty() && |
| 124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName
Bad(queryName)) { | 141 StringUtil.isStringAllDigits(queryValue) && !isPageParamName
Bad(queryName)) { |
| 125 int value = StringUtil.toNumber(queryValue); | 142 int value = StringUtil.toNumber(queryValue); |
| 126 if (value >= 0) { | 143 if (value >= 0) { |
| 127 pageCandidates.add( | 144 pageCandidates.add( |
| 128 url.replaceQueryValue(queryName, queryValue, PAGE_PA
RAM_PLACEHOLDER), | 145 url.replaceQueryValue(queryName, queryValue, PAGE_PA
RAM_PLACEHOLDER), |
| 129 new LinkInfo(pageNum, value, posInAscendingNumbers))
; | 146 new LinkInfo(pageNum, value, posInAscendingNumbers))
; |
| 130 } | 147 } |
| 131 } | 148 } |
| 132 } | 149 } |
| 133 } // extractPageParamCandidatesFromQuery | 150 } |
| 134 | 151 |
| 135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. | 152 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
| 136 | 153 |
| 137 /** | 154 /** |
| 138 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) | 155 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) |
| 139 * and adds the associated links into pageCandidates which is keyed by page
pattern. | 156 * and adds the associated links into pageCandidates which is keyed by page
pattern. |
| 140 * | 157 * |
| 141 * A page parameter candidate is one where a path component contains consecu
tive digits which | 158 * A page parameter candidate is one where a path component contains consecu
tive digits which |
| 142 * can be converted to a plain number (>= 0). | 159 * can be converted to a plain number (>= 0). |
| 143 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page | 160 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page |
| (...skipping 25 matching lines...) Expand all Loading... |
| 169 | 186 |
| 170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat
chEnd)) continue; | 187 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat
chEnd)) continue; |
| 171 | 188 |
| 172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn
d)); | 189 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn
d)); |
| 173 if (value >= 0) { | 190 if (value >= 0) { |
| 174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_
PLACEHOLDER + | 191 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_
PLACEHOLDER + |
| 175 urlStr.substring(matchEnd), | 192 urlStr.substring(matchEnd), |
| 176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | 193 new LinkInfo(pageNum, value, posInAscendingNumbers)); |
| 177 } | 194 } |
| 178 } // while there're matches | 195 } // while there're matches |
| 179 } // extractPageParamCandidatesFromPath | 196 } |
| 197 |
| 198 /** |
| 199 * Evaluates if the given list of LinkInfo's is a list of paging URLs: |
| 200 * - page numbers in list of LinkInfo's must be adjacent |
| 201 * - page numbers in list of ascending numbers must either |
| 202 * - be consecutive and form a page number sequence, or |
| 203 * - must construct a linear map with a linear formula: page_parameter = a
* page_number + b |
| 204 * - if there's only 1 LinkInfo, the first ascending number must be page 1,
first page URL must |
| 205 * match page pattern, and the only outlink must be 2nd or 3rd page. |
| 206 * |
| 207 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns
null. |
| 208 * |
| 209 * @param allLinkInfo the list of LinkInfo's to evaluate |
| 210 * @param pagePattern the URL pattern to use |
| 211 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 212 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 |
| 213 */ |
| 214 private static PageParamInfo getPageParamInfo(PagePattern pagePattern, |
| 215 List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNu
mbers, |
| 216 String firstPageUrl) { |
| 217 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { |
| 218 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin
gNumbers); |
| 219 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; |
| 220 |
| 221 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul
a(allLinkInfo); |
| 222 |
| 223 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu
tive and of a page |
| 224 // number sequence. |
| 225 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS
K) return null; |
| 226 if (!isPageNumberSequence(ascendingNumbers)) return null; |
| 227 PageParamInfo pageParamInfo = new PageParamInfo(); |
| 228 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
| 229 pageParamInfo.mFormula = linearFormula; |
| 230 for (LinkInfo link : allLinkInfo) { |
| 231 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m
PageNum, |
| 232 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); |
| 233 } |
| 234 return pageParamInfo; |
| 235 } |
| 236 |
| 237 // Most news articles have no more than 3 pages and the first page pro
bably doesn't have |
| 238 // any page parameter. If the first page url matches the page patte
rn, we treat it as |
| 239 // the first page of this pattern. |
| 240 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { |
| 241 final LinkInfo onlyLink = allLinkInfo.get(0); |
| 242 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && |
| 243 onlyLink.mPosInAscendingList == 1; |
| 244 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && |
| 245 onlyLink.mPosInAscendingList == 2 && |
| 246 // onlyLink's pos is 2 (evaluated right before), so ascendin
gNumbers has >= 3 |
| 247 // elements; check if previous element is previous page. |
| 248 ascendingNumbers.get(1).mPageNum == 2; |
| 249 // 1 LinkInfo means ascendingNumbers has >= 1 element. |
| 250 if (ascendingNumbers.get(0).mPageNum == 1 && |
| 251 (secondPageIsOutlink || thirdPageIsOutlink) && |
| 252 pagePattern.isPagingUrl(firstPageUrl)) { |
| 253 // Has valid PageParamInfo, create and populate it. |
| 254 PageParamInfo pageParamInfo = new PageParamInfo(); |
| 255 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; |
| 256 int coefficient; |
| 257 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; |
| 258 if (delta == 0 || delta == 1) { |
| 259 coefficient = 1; |
| 260 } else { |
| 261 coefficient = onlyLink.mPageParamValue; |
| 262 delta = 0; |
| 263 } |
| 264 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic
ient, delta); |
| 265 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir
stPageUrl)); |
| 266 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi
nk.mPageNum, |
| 267 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl)
); |
| 268 return pageParamInfo; |
| 269 } |
| 270 } |
| 271 |
| 272 return null; |
| 273 } |
| 180 | 274 |
| 181 /** | 275 /** |
| 182 * Returns true if given name is blacklisted as a known bad page param name. | 276 * Returns true if given name is blacklisted as a known bad page param name. |
| 183 */ | 277 */ |
| 184 private static boolean isPageParamNameBad(String name) { | 278 private static boolean isPageParamNameBad(String name) { |
| 185 initBadPageParamNames(); | 279 initBadPageParamNames(); |
| 186 return sBadPageParamNames.contains(name.toLowerCase()); | 280 return sBadPageParamNames.contains(name.toLowerCase()); |
| 187 } // isPageParamNameBad | 281 } |
| 188 | 282 |
| 189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | 283 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
| 190 private static RegExp sLastPathComponentRegExp = null; // Match last path c
omponent. | 284 private static RegExp sLastPathComponentRegExp = null; // Match last path c
omponent. |
| 191 | 285 |
| 192 /** | 286 /** |
| 193 * Returns true if: | 287 * Returns true if: |
| 194 * - the digitStart to digitEnd of urlStr is the last path component, and | 288 * - the digitStart to digitEnd of urlStr is the last path component, and |
| 195 * - the entire path component is numeric, and | 289 * - the entire path component is numeric, and |
| 196 * - the previous path component is a bad page param name. | 290 * - the previous path component is a bad page param name. |
| 197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an
d "tag" is a bad | 291 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an
d "tag" is a bad |
| 198 * page param. | 292 * page param. |
| 199 */ | 293 */ |
| 200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 294 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i
nt digitStart, |
| 201 int digitStart, int digitEnd) { | 295 int digitEnd) { |
| 202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path
component. | 296 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path
component. |
| 203 pathStart < digitStart - 1) { // Not the first path component. | 297 pathStart < digitStart - 1) { // Not the first path component. |
| 204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | 298 String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
| 205 // Checks that this is the last path component, and trailing charact
ers, if available, | 299 // Checks that this is the last path component, and trailing charact
ers, if available, |
| 206 // are (s)htm(l) extensions. | 300 // are (s)htm(l) extensions. |
| 207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$",
"i"); | 301 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$",
"i"); |
| 208 if (sExtRegExp.test(postMatch)) { | 302 if (sExtRegExp.test(postMatch)) { |
| 209 // Entire component is numeric, get previous path component. | 303 // Entire component is numeric, get previous path component. |
| 210 if (sLastPathComponentRegExp == null) { | 304 if (sLastPathComponentRegExp == null) { |
| 211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i")
; | 305 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i")
; |
| 212 } | 306 } |
| 213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | 307 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
| 214 urlStr.substring(pathStart + 1, digitStart)); | 308 urlStr.substring(pathStart + 1, digitStart)); |
| 215 if (prevPathComponent != null && prevPathComponent.getGroupCount
() > 1 && | 309 if (prevPathComponent != null && prevPathComponent.getGroupCount
() > 1 && |
| 216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | 310 isPageParamNameBad(prevPathComponent.getGroup(1))) { |
| 217 return true; | 311 return true; |
| 218 } | 312 } |
| 219 } // last numeric path component | 313 } // last numeric path component |
| 220 } | 314 } |
| 221 | 315 |
| 222 return false; | 316 return false; |
| 223 } // isLastNumericPathComponentBad | 317 } |
| 318 |
| 319 /** |
| 320 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n
umbers in list of |
| 321 * PageParamInfo.PageInfo's are consecutive. |
| 322 * |
| 323 * For adjacency, the page numbers in list of LinkInfo's must either be adja
cent, or separated |
| 324 * by at most 1 plain text number which must represent the current page numb
er in one of the |
| 325 * PageParamInfo.PageInfo's. |
| 326 * For consecutiveness, there must be at least one pair of consecutive numbe
r values in the list |
| 327 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise,
these outlinks are |
| 328 * likely to be page size selection links (e.g. in the document "See 1-10, 1
1-20..."). |
| 329 * |
| 330 * Returns an int value that is a combination of bits: |
| 331 * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent |
| 332 * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are cons
ecutive. |
| 333 * |
| 334 * @param allLinkInfo the list of LinkInfo's to evaluate |
| 335 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 336 */ |
| 337 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, |
| 338 List<PageParamInfo.PageInfo> ascendingNumbers) { |
| 339 int result = 0; |
| 340 |
| 341 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i
.e. the gap is |
| 342 // current page number represented in plain text. |
| 343 int firstPos = -1; |
| 344 int lastPos = -1; |
| 345 int gapPos = -1; |
| 346 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa
ge number is unique. |
| 347 for (LinkInfo linkInfo : allLinkInfo) { |
| 348 final int currPos = linkInfo.mPosInAscendingList; |
| 349 if (lastPos == -1) { |
| 350 firstPos = currPos; |
| 351 } else if (currPos != lastPos + 1) { |
| 352 // If position is not strictly ascending, or the gap size is > 1
(e.g. "[3] [4] 5 6 |
| 353 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a
llLinkInfo is not |
| 354 // adjacent. |
| 355 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1
) return result; |
| 356 gapPos = currPos - 1; |
| 357 } |
| 358 // Make sure page param value, i.e. the numeric page parameter component
in the URL, is unique. |
| 359 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; |
| 360 lastPos = currPos; |
| 361 } // for all LinkInfo's |
| 362 |
| 363 result |= PAGE_NUM_ADJACENT_MASK; |
| 364 |
| 365 // Now, determine if page numbers in ascendingNumbers are consecutive. |
| 366 |
| 367 // First, handle the gap. |
| 368 if (gapPos != -1) { |
| 369 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu
lt; |
| 370 // The "gap" should represent current page number in plain text. |
| 371 // Check if its adjacent page numbers are consecutive. |
| 372 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. |
| 373 // This can eliminate links affecting the number of items on a page. |
| 374 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; |
| 375 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && |
| 376 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1)
{ |
| 377 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 378 } |
| 379 return result; |
| 380 } |
| 381 |
| 382 // There is no gap. Check if at least one of the following cases is sat
isfied: |
| 383 // Case #1: "[1] [2] ..." or "1 [2] ... ". |
| 384 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum
== 1 && |
| 385 ascendingNumbers.get(1).mPageNum == 2) { |
| 386 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 387 } |
| 388 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. |
| 389 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && |
| 390 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get(
0).mUrl.isEmpty()) { |
| 391 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 392 } |
| 393 // Case #3: "... [n-1] [n]" or "... [n - 1] n". |
| 394 final int numbersSize = ascendingNumbers.size(); |
| 395 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && |
| 396 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == |
| 397 ascendingNumbers.get(numbersSize - 1).mPageNum) { |
| 398 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 399 } |
| 400 // Case #4: "... [i-1] [i] [i+1] ...". |
| 401 for (int i = firstPos + 1; i < lastPos; i++) { |
| 402 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get
(i + 1).mPageNum) { |
| 403 return result | PAGE_NUM_CONSECUTIVE_MASK; |
| 404 } |
| 405 } |
| 406 |
| 407 // Otherwise, there's no pair of consecutive values. |
| 408 return result; |
| 409 } |
| 410 |
| 411 /** |
| 412 * |
| 413 * Determines if the list of LinkInfo's form a linear formula: |
| 414 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o
r delta == 0). |
| 415 * |
| 416 * The coefficient and delta are calculated from the page parameter values a
nd page numbers of 2 |
| 417 * LinkInfo's, and then validated against the remaining LinkInfo's. |
| 418 * The order of page numbers doesn't matter. |
| 419 * |
| 420 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta
, if the page |
| 421 * parameter formula could be determined. Otherwise, returns null. |
| 422 * |
| 423 * @param allLinkInfo the list of LinkInfo's to evaluate |
| 424 */ |
| 425 private static PageParamInfo.LinearFormula getPageParamLinearFormula( |
| 426 List<LinkInfo> allLinkInfo) { |
| 427 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; |
| 428 |
| 429 final LinkInfo firstLink = allLinkInfo.get(0); |
| 430 final LinkInfo secondLink = allLinkInfo.get(1); |
| 431 |
| 432 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m
PageNum) > 4) { |
| 433 return null; |
| 434 } |
| 435 |
| 436 int deltaX = secondLink.mPageNum - firstLink.mPageNum; |
| 437 if (deltaX == 0) return null; |
| 438 |
| 439 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; |
| 440 int coefficient = deltaY / deltaX; |
| 441 if (coefficient == 0) return null; |
| 442 |
| 443 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum
; |
| 444 if (delta != 0 && delta != -coefficient) return null; |
| 445 |
| 446 // Check if the remaining elements are on the same linear map. |
| 447 for (int i = 2; i < allLinkInfo.size(); i++) { |
| 448 final LinkInfo link = allLinkInfo.get(i); |
| 449 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret
urn null; |
| 450 } |
| 451 |
| 452 return new PageParamInfo.LinearFormula(coefficient, delta); |
| 453 } |
| 454 |
| 455 /** |
| 456 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s
equence, based on |
| 457 * a pipeline of rules: |
| 458 * - first PageInfo must have a URL unless it is the first page |
| 459 * - there's only one plain number without URL in list |
| 460 * - if only two pages, they must be siblings - 2nd page number must follow
1st |
| 461 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu
tive numbers must be |
| 462 * head/tail or have URLs. |
| 463 * |
| 464 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 465 */ |
| 466 private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> asc
endingNumbers) { |
| 467 if (ascendingNumbers.size() <= 1) return false; |
| 468 |
| 469 // The first one must have a URL unless it is the first page. |
| 470 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); |
| 471 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; |
| 472 |
| 473 // There's only one plain number without URL in ascending numbers group. |
| 474 boolean hasPlainNum = false; |
| 475 for (PageParamInfo.PageInfo page : ascendingNumbers) { |
| 476 if (page.mUrl.isEmpty()) { |
| 477 if (hasPlainNum) return false; |
| 478 hasPlainNum = true; |
| 479 } |
| 480 } |
| 481 |
| 482 // If there are only two pages, they must be siblings. |
| 483 if (ascendingNumbers.size() == 2) { |
| 484 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; |
| 485 } |
| 486 |
| 487 // Check if page numbers in ascendingNumbers are adjacent and consecutiv
e. |
| 488 for (int i = 1; i < ascendingNumbers.size(); i++) { |
| 489 // If two adjacent numbers are not consecutive, we accept them only
when: |
| 490 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2
], [3]...[i], [n]. |
| 491 // 2) both of them have URLs. |
| 492 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); |
| 493 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); |
| 494 if (currPage.mPageNum - prevPage.mPageNum != 1) { |
| 495 if (i != 1 && i != ascendingNumbers.size() - 1) return false; |
| 496 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f
alse; |
| 497 } |
| 498 } |
| 499 |
| 500 return true; |
| 501 } |
| 224 | 502 |
| 225 /** | 503 /** |
| 226 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in | 504 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in |
| 227 * alphabetical order. | 505 * alphabetical order. |
| 228 */ | 506 */ |
| 229 private static void initBadPageParamNames() { | 507 private static void initBadPageParamNames() { |
| 230 if (sBadPageParamNames != null) return; | 508 if (sBadPageParamNames != null) return; |
| 231 | 509 |
| 232 sBadPageParamNames = new HashSet<String>(); | 510 sBadPageParamNames = new HashSet<String>(); |
| 233 sBadPageParamNames.add("baixar-gratis"); | 511 sBadPageParamNames.add("baixar-gratis"); |
| (...skipping 18 matching lines...) Expand all Loading... |
| 252 sBadPageParamNames.add("search_keyword"); | 530 sBadPageParamNames.add("search_keyword"); |
| 253 sBadPageParamNames.add("search_query"); | 531 sBadPageParamNames.add("search_query"); |
| 254 sBadPageParamNames.add("sortby"); | 532 sBadPageParamNames.add("sortby"); |
| 255 sBadPageParamNames.add("subscriptions"); | 533 sBadPageParamNames.add("subscriptions"); |
| 256 sBadPageParamNames.add("tag"); | 534 sBadPageParamNames.add("tag"); |
| 257 sBadPageParamNames.add("tags"); | 535 sBadPageParamNames.add("tags"); |
| 258 sBadPageParamNames.add("video"); | 536 sBadPageParamNames.add("video"); |
| 259 sBadPageParamNames.add("videos"); | 537 sBadPageParamNames.add("videos"); |
| 260 sBadPageParamNames.add("w"); | 538 sBadPageParamNames.add("w"); |
| 261 sBadPageParamNames.add("wiki"); | 539 sBadPageParamNames.add("wiki"); |
| 262 } // initBadPageParamNames | 540 } |
| 263 | 541 |
| 264 } | 542 } |
| OLD | NEW |