Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
| 8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
| 9 | 9 |
| 10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
| 11 import java.util.Arrays; | 11 import java.util.Arrays; |
| (...skipping 10 matching lines...) Expand all Loading... | |
| 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The | 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part ial pages. The |
| 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the | 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w hich contains the |
| 24 * whole content, called "single page". | 24 * whole content, called "single page". |
| 25 * | 25 * |
| 26 * Definitions: | 26 * Definitions: |
| 27 * A single page document is a document that contains the whole content. | 27 * A single page document is a document that contains the whole content. |
| 28 * A paging document is one of the partial pages. | 28 * A paging document is one of the partial pages. |
| 29 * "digital" means the text contains only digits. | 29 * "digital" means the text contains only digits. |
| 30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder | 30 * A page pattern is a paging URL whose page parameter value is replaced with a place holder |
| 31 * (PAGE_PARAM_PLACEHOLDER). | 31 * (PAGE_PARAM_PLACEHOLDER). |
| 32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat tern is | 32 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is |
| 33 * "http: *www.foo.com/a/b-[*!].html". | 33 * "http://www.foo.com/a/b-[*!].html". |
| 34 * | 34 * |
| 35 * This class extracts the page parameter from a document's outlinks. | 35 * This class extracts the page parameter from a document's outlinks. |
| 36 * The basic idea: | 36 * The basic idea: |
| 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text. | 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital anchor text. |
| 38 * #2. For each group, determine the relationship between digital anchor texts and digital parts | 38 * #2. For each group, determine the relationship between digital anchor texts and digital parts |
| 39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear | 39 * (either a query value or a path component) in URL. If one part of a UR L is always a linear |
| 40 * map from its digital anchor text, we guess the part is the page parame ter of the URL. | 40 * map from its digital anchor text, we guess the part is the page parame ter of the URL. |
| 41 * | 41 * |
| 42 * As an example, consider a document http: *a/b?c=1&p=10, which contains the fo llowing digital | 42 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo llowing digital |
| 43 * outlinks: | 43 * outlinks: |
| 44 * <a href=http: *a/b?c=1&p=20>3</a> | 44 * <a href=http://a/b?c=1&p=20>3</a> |
| 45 * <a href=http: *a/b?c=1&p=30>4</a> | 45 * <a href=http://a/b?c=1&p=30>4</a> |
| 46 * <a href=http: *a/b?c=1&p=40>5</a> | 46 * <a href=http://a/b?c=1&p=40>5</a> |
| 47 * <a href=http: *a/b?c=1&p=all>single page</a> | 47 * <a href=http://a/b?c=1&p=all>single page</a> |
| 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so | 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10 - 10, and so |
| 49 * guesses it is the page parameter. The associated page pattern is http: *a/b? c=1&p=[*!]. | 49 * guesses it is the page parameter. The associated page pattern is http://a/b? c=1&p=[*!]. |
| 50 * Then, this class extracts the single page based on page parameter info. The single page url is | 50 * Then, this class extracts the single page based on page parameter info. The single page url is |
| 51 * http: *a/b?c=1&p=all. | 51 * http://a/b?c=1&p=all. |
| 52 */ | 52 */ |
| 53 public class PageParameterDetector { | 53 public class PageParameterDetector { |
| 54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 54 private static final int MIN_LINKS_TO_JUSTIFY_LINEAR_MAP = 2; |
| 55 | |
| 56 static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | |
| 57 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length( ); | |
| 58 static final int PAGE_NUM_ADJACENT_MASK = 1 << 0; | |
| 59 static final int PAGE_NUM_CONSECUTIVE_MASK = 1 << 1; | |
| 60 | |
| 61 /** | |
| 62 * The interface that page pattern handlers must implement to detect page pa rameter from | |
| 63 * potential pagination URLs. | |
| 64 */ | |
| 65 interface PagePattern { | |
| 66 /** | |
| 67 * Returns the string of the URL page pattern. | |
| 68 */ | |
| 69 String toString(); | |
| 70 | |
| 71 /** | |
| 72 * Returns the page number extracted from the URL during creation of obje ct that implements | |
| 73 * this interface. | |
| 74 */ | |
| 75 int getPageNumber(); | |
| 76 | |
| 77 /** | |
| 78 * Validates this page pattern according to the current document URL thr ough a pipeline of | |
| 79 * rules. | |
| 80 * | |
| 81 * Returns true if page pattern is valid. | |
| 82 * | |
| 83 * @param docUrl the current document URL | |
| 84 */ | |
| 85 boolean isValidFor(ParsedUrl docUrl); | |
| 86 | |
| 87 /** | |
| 88 * Returns true if a URL matches this page pattern based on a pipeline o f rules. | |
| 89 * | |
| 90 * @param url the URL to evaluate | |
| 91 */ | |
| 92 boolean isPagingUrl(String url); | |
| 93 } | |
| 55 | 94 |
| 56 /** | 95 /** |
| 57 * Stores information about the link (anchor) after the page parameter is de tected: | 96 * Stores information about the link (anchor) after the page parameter is de tected: |
| 58 * - the page number (as represented by the original plain text) for the lin k | 97 * - the page number (as represented by the original plain text) for the lin k |
| 59 * - the original page parameter numeric component in the URL (this componen t would be replaced | 98 * - the original page parameter numeric component in the URL (this componen t would be replaced |
| 60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | 99 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) |
| 61 * - the position of this link in the list of ascending numbers. | 100 * - the position of this link in the list of ascending numbers. |
| 62 */ | 101 */ |
| 63 static class LinkInfo { | 102 static class LinkInfo { |
| 64 private int mPageNum; | 103 private int mPageNum; |
| 65 private int mPageParamValue; | 104 private int mPageParamValue; |
| 66 private int mPosInAscendingList; | 105 private int mPosInAscendingList; |
| 67 | 106 |
| 68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { | 107 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { |
| 69 mPageNum = pageNum; | 108 mPageNum = pageNum; |
| 70 mPageParamValue = pageParamValue; | 109 mPageParamValue = pageParamValue; |
| 71 mPosInAscendingList = posInAscendingList; | 110 mPosInAscendingList = posInAscendingList; |
| 72 } | 111 } |
| 73 } // LinkInfo | 112 } |
| 74 | 113 |
| 75 /** | 114 /** |
| 76 * Stores a map of URL pattern to its associated list of LinkInfo's. | 115 * Stores a map of URL pattern to its associated list of LinkInfo's. |
| 77 */ | 116 */ |
| 78 private static class PageCandidatesMap { | 117 private static class PageCandidatesMap { |
| 79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List <LinkInfo>>(); | 118 private static class Info { |
| 119 private final PagePattern mPattern; | |
| 120 private final List<LinkInfo> mLinks; | |
| 121 | |
| 122 Info(PagePattern pattern, LinkInfo link) { | |
| 123 mPattern = pattern; | |
| 124 mLinks = new ArrayList<LinkInfo>(); | |
| 125 mLinks.add(link); | |
| 126 } | |
| 127 } | |
| 128 | |
| 129 private final Map<String, Info> map = new HashMap<String, Info>(); | |
| 80 | 130 |
| 81 /** | 131 /** |
| 82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds | 132 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al ready exists, adds |
| 83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry. | 133 * the link to the list of LinkInfo's. Otherwise, creates a new map ent ry. |
| 84 */ | 134 */ |
| 85 private void add(String urlPattern, LinkInfo link) { | 135 private void add(PagePattern pattern, LinkInfo link) { |
| 86 if (map.containsKey(urlPattern)) { | 136 final String patternStr = pattern.toString(); |
| 87 map.get(urlPattern).add(link); | 137 if (map.containsKey(patternStr)) { |
| 138 map.get(patternStr).mLinks.add(link); | |
| 88 } else { | 139 } else { |
| 89 List<LinkInfo> links = new ArrayList<LinkInfo>(); | 140 map.put(patternStr, new Info(pattern, link)); |
| 90 links.add(link); | |
| 91 map.put(urlPattern, links); | |
| 92 } | 141 } |
| 93 } | 142 } |
| 94 | 143 } |
| 95 } // PageCandidatesMap | |
| 96 | 144 |
| 97 // All the known bad page param names. | 145 // All the known bad page param names. |
| 98 private static Set<String> sBadPageParamNames = null; | 146 private static Set<String> sBadPageParamNames = null; |
| 99 | 147 |
| 100 /** | 148 /** |
| 101 * Extracts page parameter candidates from the query part of given URL and a dds the associated | 149 * Extracts page parameter candidates from the query part of given URL and a dds the associated |
| 102 * links into pageCandidates which is keyed by page pattern. | 150 * links into pageCandidates which is keyed by page pattern. |
| 103 * | 151 * |
| 104 * A page parameter candidate is one where: | 152 * A page parameter candidate is one where: |
| 105 * - the name of a query name-value component is not one of sBadPageParamNam es, and | 153 * - the name of a query name-value component is not one of sBadPageParamNam es, and |
| 106 * - the value of the query component is a plain number (>= 0). | 154 * - the value of the query component is a plain number (>= 0). |
| 107 * E.g. a URL query with 3 plain number query values will generate 3 URL pag e patterns with 3 | 155 * E.g. a URL query with 3 plain number query values will generate 3 URL pag e patterns with 3 |
| 108 * LinkInfo's, and hence 3 page parameter candidates. | 156 * LinkInfo's, and hence 3 page parameter candidates. |
| 109 * | 157 * |
| 110 * @param url ParsedUrl of the URL to process | 158 * @param url ParsedUrl of the URL to process |
| 111 * @param pageNum the page number as represented in original plain text | 159 * @param pageNum the page number as represented in original plain text |
| 112 * @param posInAscendingNumbers position of this page number in the list of ascending numbers | 160 * @param posInAscendingNumbers position of this page number in the list of ascending numbers |
| 113 * @param pageCandidates the map of URL pattern to its associated list of Li nkInfo's | 161 * @param pageCandidates the map of URL pattern to its associated list of Li nkInfo's |
| 114 */ | 162 */ |
| 115 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p ageNum, | 163 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p ageNum, |
| 116 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 164 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
| 117 String[][] queryParams = url.getQueryParams(); | 165 String[][] queryParams = url.getQueryParams(); |
| 118 if (queryParams.length == 0) return; // No query. | 166 if (queryParams.length == 0) return; // No query. |
| 119 | 167 |
| 120 for (String[] nameValue : queryParams) { | 168 for (String[] nameValue : queryParams) { |
| 121 final String queryName = nameValue[0]; | 169 PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0] , nameValue[1]); |
| 122 final String queryValue = nameValue[1]; | 170 if (pattern != null) { |
| 123 if (!queryName.isEmpty() && !queryValue.isEmpty() && | 171 pageCandidates.add(pattern, |
| 124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName Bad(queryName)) { | 172 new LinkInfo(pageNum, pattern.getPageNumber(), posInAsce ndingNumbers)); |
| 125 int value = StringUtil.toNumber(queryValue); | |
| 126 if (value >= 0) { | |
| 127 pageCandidates.add( | |
| 128 url.replaceQueryValue(queryName, queryValue, PAGE_PA RAM_PLACEHOLDER), | |
| 129 new LinkInfo(pageNum, value, posInAscendingNumbers)) ; | |
| 130 } | |
| 131 } | 173 } |
| 132 } | 174 } |
| 133 } // extractPageParamCandidatesFromQuery | 175 } |
| 134 | 176 |
| 135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. | 177 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
| 136 | 178 |
| 137 /** | 179 /** |
| 138 * Extracts page parameter candidates from the path part of given URL (witho ut query components) | 180 * Extracts page parameter candidates from the path part of given URL (witho ut query components) |
| 139 * and adds the associated links into pageCandidates which is keyed by page pattern. | 181 * and adds the associated links into pageCandidates which is keyed by page pattern. |
| 140 * | 182 * |
| 141 * A page parameter candidate is one where a path component contains consecu tive digits which | 183 * A page parameter candidate is one where a path component contains consecu tive digits which |
| 142 * can be converted to a plain number (>= 0). | 184 * can be converted to a plain number (>= 0). |
| 143 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page | 185 * E.g. a URL path with 3 path components that contain plain numbers will ge nerate 3 URL page |
| (...skipping 15 matching lines...) Expand all Loading... | |
| 159 final String urlStr = url.toString(); | 201 final String urlStr = url.toString(); |
| 160 final int pathStart = url.getOrigin().length(); | 202 final int pathStart = url.getOrigin().length(); |
| 161 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi" ); | 203 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi" ); |
| 162 sDigitsRegExp.setLastIndex(pathStart); | 204 sDigitsRegExp.setLastIndex(pathStart); |
| 163 while (true) { | 205 while (true) { |
| 164 MatchResult match = sDigitsRegExp.exec(urlStr); | 206 MatchResult match = sDigitsRegExp.exec(urlStr); |
| 165 if (match == null) break; | 207 if (match == null) break; |
| 166 | 208 |
| 167 final int matchEnd = sDigitsRegExp.getLastIndex(); | 209 final int matchEnd = sDigitsRegExp.getLastIndex(); |
| 168 final int matchStart = matchEnd - match.getGroup(1).length(); | 210 final int matchStart = matchEnd - match.getGroup(1).length(); |
| 169 | 211 PagePattern pattern = PathComponentPagePattern.create(url, pathStart , matchStart, |
| 170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat chEnd)) continue; | 212 matchEnd); |
| 171 | 213 if (pattern != null) { |
| 172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn d)); | 214 pageCandidates.add(pattern, |
| 173 if (value >= 0) { | 215 new LinkInfo(pageNum, pattern.getPageNumber(), posInAsce ndingNumbers)); |
| 174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_ PLACEHOLDER + | |
| 175 urlStr.substring(matchEnd), | |
| 176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | |
| 177 } | 216 } |
| 178 } // while there're matches | 217 } // while there're matches |
| 179 } // extractPageParamCandidatesFromPath | 218 } |
| 219 | |
| 220 /** | |
| 221 * Evaluates if the given list of LinkInfo's is a list of paging URLs: | |
| 222 * - page numbers in list of LinkInfo's must be adjacent | |
| 223 * - page numbers in list of ascending numbers must either | |
| 224 * - be consecutive and form a page number sequence, or | |
| 225 * - must construct a linear map with a linear formula: page_parameter = a * page_number + b | |
| 226 * - if there's only 1 LinkInfo, the first ascending number must be page 1, first page URL must | |
| 227 * match page pattern, and the only outlink must be 2nd or 3rd page. | |
| 228 * | |
| 229 * Returns a populated PageParamInfo if evaluated true. Otherwise, returns null. | |
| 230 * | |
| 231 * @param allLinkInfo the list of LinkInfo's to evaluate | |
| 232 * @param pagePattern the URL pattern to use | |
| 233 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
| 234 * @param firstPageUrl the URL of the PageInfo with mPageNum=1 | |
| 235 */ | |
| 236 private static PageParamInfo getPageParamInfo(PagePattern pagePattern, | |
|
cjhopman
2015/04/16 21:58:32
I feel like we have too many SomethingInfo types he
cjhopman
2015/04/16 21:58:32
Would it make sense for this function to be in the
kuan
2015/04/20 23:11:13
i'm bad at names, and ran out of them :(
kuan
2015/04/20 23:11:13
Done. to avoid cross-access between PageParameter
| |
| 237 List<LinkInfo> allLinkInfo, List<PageParamInfo.PageInfo> ascendingNu mbers, | |
| 238 String firstPageUrl) { | |
| 239 if (allLinkInfo.size() >= MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) { | |
| 240 int result = arePageNumsAdjacentAndConsecutive(allLinkInfo, ascendin gNumbers); | |
| 241 if ((result & PAGE_NUM_ADJACENT_MASK) == 0) return null; | |
| 242 | |
| 243 PageParamInfo.LinearFormula linearFormula = getPageParamLinearFormul a(allLinkInfo); | |
| 244 | |
| 245 // PageParamInfo.Type.PAGE_NUMBER: ascending numbers must be consecu tive and of a page | |
| 246 // number sequence. | |
| 247 if ((result & PAGE_NUM_CONSECUTIVE_MASK) != PAGE_NUM_CONSECUTIVE_MAS K) return null; | |
| 248 if (!isPageNumberSequence(ascendingNumbers)) return null; | |
| 249 PageParamInfo pageParamInfo = new PageParamInfo(); | |
|
cjhopman
2015/04/16 21:58:32
Could we move some of the logic of creating the pa
kuan
2015/04/20 23:11:13
Done.
| |
| 250 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
| 251 pageParamInfo.mFormula = linearFormula; | |
| 252 for (LinkInfo link : allLinkInfo) { | |
| 253 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(link.m PageNum, | |
| 254 ascendingNumbers.get(link.mPosInAscendingList).mUrl)); | |
| 255 } | |
| 256 return pageParamInfo; | |
| 257 } | |
| 258 | |
| 259 // Most of news article have no more than 3 pages and the first page pro bably doesn't have | |
| 260 // any page parameter. If the first page url matches the page pattern, we treat it as | |
| 261 // the first page of this pattern. | |
| 262 if (allLinkInfo.size() == 1 && !firstPageUrl.isEmpty()) { | |
| 263 final LinkInfo onlyLink = allLinkInfo.get(0); | |
| 264 boolean secondPageIsOutlink = onlyLink.mPageNum == 2 && | |
| 265 onlyLink.mPosInAscendingList == 1; | |
| 266 boolean thirdPageIsOutlink = onlyLink.mPageNum == 3 && | |
| 267 onlyLink.mPosInAscendingList == 2 && | |
| 268 // onlyLink's pos is 2 (evaluated right before), so ascendin gNumbers has >= 3 | |
| 269 // elements; check if previous element is previous page. | |
| 270 ascendingNumbers.get(1).mPageNum == 2; | |
| 271 // 1 LinkInfo means ascendingNumbers has >= 1 element. | |
| 272 if (ascendingNumbers.get(0).mPageNum == 1 && | |
| 273 (secondPageIsOutlink || thirdPageIsOutlink) && | |
| 274 pagePattern.isPagingUrl(firstPageUrl)) { | |
| 275 // Has valid PageParamInfo, create and populate it. | |
| 276 PageParamInfo pageParamInfo = new PageParamInfo(); | |
| 277 pageParamInfo.mType = PageParamInfo.Type.PAGE_NUMBER; | |
| 278 int coefficient; | |
| 279 int delta = onlyLink.mPageParamValue - onlyLink.mPageNum; | |
| 280 if (delta == 0 || delta == 1) { | |
| 281 coefficient = 1; | |
| 282 } else { | |
| 283 coefficient = onlyLink.mPageParamValue; | |
| 284 delta = 0; | |
| 285 } | |
| 286 pageParamInfo.mFormula = new PageParamInfo.LinearFormula(coeffic ient, delta); | |
| 287 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(1, fir stPageUrl)); | |
| 288 pageParamInfo.mAllPageInfo.add(new PageParamInfo.PageInfo(onlyLi nk.mPageNum, | |
| 289 ascendingNumbers.get(onlyLink.mPosInAscendingList).mUrl) ); | |
| 290 return pageParamInfo; | |
| 291 } | |
| 292 } | |
| 293 | |
| 294 return null; | |
| 295 } | |
| 180 | 296 |
| 181 /** | 297 /** |
| 182 * Returns true if given name is blacklisted as a known bad page param name. | 298 * Returns true if given name is blacklisted as a known bad page param name. |
| 183 */ | 299 */ |
| 184 private static boolean isPageParamNameBad(String name) { | 300 static boolean isPageParamNameBad(String name) { |
| 185 initBadPageParamNames(); | 301 initBadPageParamNames(); |
| 186 return sBadPageParamNames.contains(name.toLowerCase()); | 302 return sBadPageParamNames.contains(name.toLowerCase()); |
| 187 } // isPageParamNameBad | 303 } |
| 188 | 304 |
| 189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | 305 /** |
| 190 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | 306 * Detects if page numbers in list of LinkInfo's are adjacent, and if page n umbers in list of |
| 191 | 307 * PageParamInfo.PageInfo's are consecutive. |
| 192 /** | 308 * |
| 193 * Returns true if: | 309 * For adjacency, the page numbers in list of LinkInfo's must either be adja cent, or separated |
| 194 * - the digitStart to digitEnd of urlStr is the last path component, and | 310 * by at most 1 plain text number which must represent the current page numb er in one of the |
| 195 * - the entire path component is numeric, and | 311 * PageParamInfo.PageInfo's. |
| 196 * - the previous path component is a bad page param name. | 312 * For consecutiveness, there must be at least one pair of consecutive numbe r values in the list |
| 197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad | 313 * of LinkInfo's, or between a LinkInfo and a plain text number. Otherwise, these outlinks are |
| 198 * page param. | 314 * likely to be page size selection links (e.g. in the document "See 1-10, 1 1-20..."). |
| 199 */ | 315 * |
| 200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 316 * Returns an int value that is a combination of bits: |
| 201 int digitStart, int digitEnd) { | 317 * - bit for PAGE_NUM_ADJACENT_MASK is set if allLinkInfo are adjacent |
| 202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | 318 * - bit for PAGE_NUM_CONSECUTIVE_MASK is set if ascendingNumbers are consecutive. |
| 203 pathStart < digitStart - 1) { // Not the first path component. | 319 * |
| 204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | 320 * @param allLinkInfo the list of LinkInfo's to evaluate |
| 205 // Checks that this is the last path component, and trailing charact ers, if available, | 321 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's |
| 206 // are (s)htm(l) extensions. | 322 */ |
| 207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); | 323 static int arePageNumsAdjacentAndConsecutive(List<LinkInfo> allLinkInfo, |
| 208 if (sExtRegExp.test(postMatch)) { | 324 List<PageParamInfo.PageInfo> ascendingNumbers) { |
| 209 // Entire component is numeric, get previous path component. | 325 int result = 0; |
| 210 if (sLastPathComponentRegExp == null) { | 326 |
| 211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; | 327 // Check if elements in allLinkInfo are adjacent or there's only 1 gap i .e. the gap is |
| 212 } | 328 // current page number respresented in plain text. |
| 213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | 329 int firstPos = -1; |
| 214 urlStr.substring(pathStart + 1, digitStart)); | 330 int lastPos = -1; |
| 215 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && | 331 int gapPos = -1; |
| 216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | 332 Set<Integer> pageParamSet = new HashSet<Integer>(); // To check that pa ge number is unique. |
| 217 return true; | 333 for (LinkInfo linkInfo : allLinkInfo) { |
| 218 } | 334 final int currPos = linkInfo.mPosInAscendingList; |
| 219 } // last numeric path component | 335 if (lastPos == -1) { |
| 220 } | 336 firstPos = currPos; |
| 221 | 337 } else if (currPos != lastPos + 1) { |
| 222 return false; | 338 // If position is not strictly ascending, or the gap size is > 1 (e.g. "[3] [4] 5 6 |
| 223 } // isLastNumericPathComponentBad | 339 // [7]"), or there's more than 1 gap (e.g. "[3] 4 [5] 6 [7]"), a llLinkInfo is not |
| 340 // adjacent. | |
| 341 if (currPos <= lastPos || currPos != lastPos + 2 || gapPos != -1 ) return result; | |
| 342 gapPos = currPos - 1; | |
| 343 } | |
| 344 // Make sure page param value, i.e. the numeric page parameter component in the URL, is unique. | |
| 345 if (!pageParamSet.add(linkInfo.mPageParamValue)) return result; | |
| 346 lastPos = currPos; | |
| 347 } // for all LinkInfo's | |
| 348 | |
| 349 result |= PAGE_NUM_ADJACENT_MASK; | |
| 350 | |
| 351 // Now, determine if page numbers in ascendingNumbers are consecutive. | |
| 352 | |
| 353 // First, handle the gap. | |
| 354 if (gapPos != -1) { | |
| 355 if (gapPos <= 0 || gapPos >= ascendingNumbers.size() - 1) return resu lt; | |
| 356 // The "gap" should represent current page number in plain text. | |
| 357 // Check if its adjacent page numbers are consecutive. | |
| 358 // e.g. "[1] [5] 6 [7] [12]" is accepted; "[4] 8 [16]" is rejected. | |
| 359 // This can eliminate links affecting the number of items on a page. | |
| 360 final int currPageNum = ascendingNumbers.get(gapPos).mPageNum; | |
| 361 if (ascendingNumbers.get(gapPos - 1).mPageNum == currPageNum - 1 && | |
| 362 ascendingNumbers.get(gapPos + 1).mPageNum == currPageNum + 1) { | |
| 363 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 364 } | |
| 365 return result; | |
| 366 } | |
| 367 | |
| 368 // There is no gap. Check if at least one of the following cases is sat isfied: | |
| 369 // Case #1: "[1] [2] ..." or "1 [2] ... ". | |
| 370 if ((firstPos == 0 || firstPos == 1) && ascendingNumbers.get(0).mPageNum == 1 && | |
| 371 ascendingNumbers.get(1).mPageNum == 2) { | |
| 372 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 373 } | |
| 374 // Case #2: "[1] 2 [3] ..." where [1] doesn't belong to current pattern. | |
| 375 if (firstPos == 2 && ascendingNumbers.get(2).mPageNum == 3 && | |
| 376 ascendingNumbers.get(1).mUrl.isEmpty() && !ascendingNumbers.get( 0).mUrl.isEmpty()) { | |
| 377 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 378 } | |
| 379 // Case #3: "... [n-1] [n]" or "... [n - 1] n". | |
| 380 final int numbersSize = ascendingNumbers.size(); | |
| 381 if ((lastPos == numbersSize - 1 || lastPos == numbersSize - 2) && | |
| 382 ascendingNumbers.get(numbersSize - 2).mPageNum + 1 == | |
| 383 ascendingNumbers.get(numbersSize - 1).mPageNum) { | |
| 384 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 385 } | |
| 386 // Case #4: "... [i-1] [i] [i+1] ...". | |
| 387 for (int i = firstPos + 1; i < lastPos; i++) { | |
| 388 if (ascendingNumbers.get(i - 1).mPageNum + 2 == ascendingNumbers.get (i + 1).mPageNum) { | |
| 389 return result | PAGE_NUM_CONSECUTIVE_MASK; | |
| 390 } | |
| 391 } | |
| 392 | |
| 393 // Otherwise, there's no pair of consecutive values. | |
| 394 return result; | |
| 395 } | |
| 396 | |
| 397 /** | |
| 398 * | |
| 399 * Determines if the list of LinkInfo's form a linear formula: | |
| 400 * pageParamValue = coefficient * pageNum + delta (delta == -coefficient o r delta == 0). | |
| 401 * | |
| 402 * The coefficient and delta are calculated from the page parameter values a nd page numbers of 2 | |
| 403 * LinkInfo's, and then validated against the remaining LinkInfo's. | |
| 404 * The order of page numbers doesn't matter. | |
| 405 * | |
| 406 * Returns PageParamInfo.LinearFormula, containing the coefficient and delta , if the page | |
| 407 * parameter formula could be determined. Otherwise, returns null. | |
| 408 * | |
| 409 * @param allLinkInfo the list of LinkInfo's to evaluate | |
| 410 */ | |
| 411 // TODO(kuan): As this gets rolled out, reassess the necessity of non-1 coefficient support. | |
| 412 private static PageParamInfo.LinearFormula getPageParamLinearFormula( | |
| 413 List<LinkInfo> allLinkInfo) { | |
| 414 if (allLinkInfo.size() < MIN_LINKS_TO_JUSTIFY_LINEAR_MAP) return null; | |
| 415 | |
| 416 final LinkInfo firstLink = allLinkInfo.get(0); | |
| 417 final LinkInfo secondLink = allLinkInfo.get(1); | |
| 418 | |
| 419 if (allLinkInfo.size() == 2 && Math.max(firstLink.mPageNum, secondLink.m PageNum) > 4) { | |
| 420 return null; | |
| 421 } | |
| 422 | |
| 423 int deltaX = secondLink.mPageNum - firstLink.mPageNum; | |
| 424 if (deltaX == 0) return null; | |
| 425 | |
| 426 int deltaY = secondLink.mPageParamValue - firstLink.mPageParamValue; | |
| 427 int coefficient = deltaY / deltaX; | |
| 428 if (coefficient == 0) return null; | |
| 429 | |
| 430 int delta = firstLink.mPageParamValue - coefficient * firstLink.mPageNum ; | |
| 431 if (delta != 0 && delta != -coefficient) return null; | |
| 432 | |
| 433 // Check if the remaining elements are on the same linear map. | |
| 434 for (int i = 2; i < allLinkInfo.size(); i++) { | |
| 435 final LinkInfo link = allLinkInfo.get(i); | |
| 436 if (link.mPageParamValue != coefficient * link.mPageNum + delta) ret urn null; | |
| 437 } | |
| 438 | |
| 439 return new PageParamInfo.LinearFormula(coefficient, delta); | |
| 440 } | |
| 441 | |
| 442 /** | |
| 443 * Returns true if page numbers in list of PageParamInfo.PageInfo's form a s equence, based on | |
| 444 * a pipeline of rules: | |
| 445 * - first PageInfo must have a URL unless it is the first page | |
| 446 * - there's only one plain number without URL in list | |
| 447 * - if only two pages, they must be siblings - 2nd page number must follow 1st | |
| 448 * - page numbers must be adjacent and consecutive; otherwise, 2 non-consecu tive numbers must be | |
| 449 * head/tail or have URLs. | |
| 450 * | |
| 451 * @param ascendingNumbers list of PageInfo's with ascending mPageNum's | |
| 452 */ | |
| 453 private static boolean isPageNumberSequence(List<PageParamInfo.PageInfo> asc endingNumbers) { | |
| 454 if (ascendingNumbers.size() <= 1) return false; | |
| 455 | |
| 456 // The first one must have a URL unless it is the first page. | |
| 457 final PageParamInfo.PageInfo firstPage = ascendingNumbers.get(0); | |
| 458 if (firstPage.mPageNum != 1 && firstPage.mUrl.isEmpty()) return false; | |
| 459 | |
| 460 // There's only one plain number without URL in ascending numbers group. | |
| 461 boolean hasPlainNum = false; | |
| 462 for (PageParamInfo.PageInfo page : ascendingNumbers) { | |
| 463 if (page.mUrl.isEmpty()) { | |
| 464 if (hasPlainNum) return false; | |
| 465 hasPlainNum = true; | |
| 466 } | |
| 467 } | |
| 468 | |
| 469 // If there are only two pages, they must be siblings. | |
| 470 if (ascendingNumbers.size() == 2) { | |
| 471 return firstPage.mPageNum + 1 == ascendingNumbers.get(1).mPageNum; | |
| 472 } | |
| 473 | |
| 474 // Check if page numbers in ascendingNumbers are adjacent and consecutiv e. | |
| 475 for (int i = 1; i < ascendingNumbers.size(); i++) { | |
| 476 // If two adjacent numbers are not consecutive, we accept them only when: | |
| 477 // 1) one of them is head/tail, like [1],[n-i][n-i+1]..[n] or [1],[2 ], [3]...[i], [n]. | |
| 478 // 2) both of them have URLs. | |
| 479 final PageParamInfo.PageInfo currPage = ascendingNumbers.get(i); | |
| 480 final PageParamInfo.PageInfo prevPage = ascendingNumbers.get(i - 1); | |
| 481 if (currPage.mPageNum - prevPage.mPageNum != 1) { | |
| 482 if (i != 1 && i != ascendingNumbers.size() - 1) return false; | |
| 483 if (currPage.mUrl.isEmpty() || prevPage.mUrl.isEmpty()) return f alse; | |
| 484 } | |
| 485 } | |
| 486 | |
| 487 return true; | |
| 488 } | |
| 489 | |
| 490 /** | |
| 491 * Returns true if given string can be converted to a number >= 0. | |
| 492 */ | |
| 493 static boolean isPlainNumber(String str) { | |
| 494 return StringUtil.toNumber(str) >= 0; | |
| 495 } | |
| 224 | 496 |
| 225 /** | 497 /** |
| 226 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in | 498 * If sBadPageParamNames is null, initialize it with all the known bad page param names, in |
| 227 * alphabetical order. | 499 * alphabetical order. |
| 228 */ | 500 */ |
| 229 private static void initBadPageParamNames() { | 501 private static void initBadPageParamNames() { |
| 230 if (sBadPageParamNames != null) return; | 502 if (sBadPageParamNames != null) return; |
| 231 | 503 |
| 232 sBadPageParamNames = new HashSet<String>(); | 504 sBadPageParamNames = new HashSet<String>(); |
| 233 sBadPageParamNames.add("baixar-gratis"); | 505 sBadPageParamNames.add("baixar-gratis"); |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 252 sBadPageParamNames.add("search_keyword"); | 524 sBadPageParamNames.add("search_keyword"); |
| 253 sBadPageParamNames.add("search_query"); | 525 sBadPageParamNames.add("search_query"); |
| 254 sBadPageParamNames.add("sortby"); | 526 sBadPageParamNames.add("sortby"); |
| 255 sBadPageParamNames.add("subscriptions"); | 527 sBadPageParamNames.add("subscriptions"); |
| 256 sBadPageParamNames.add("tag"); | 528 sBadPageParamNames.add("tag"); |
| 257 sBadPageParamNames.add("tags"); | 529 sBadPageParamNames.add("tags"); |
| 258 sBadPageParamNames.add("video"); | 530 sBadPageParamNames.add("video"); |
| 259 sBadPageParamNames.add("videos"); | 531 sBadPageParamNames.add("videos"); |
| 260 sBadPageParamNames.add("w"); | 532 sBadPageParamNames.add("w"); |
| 261 sBadPageParamNames.add("wiki"); | 533 sBadPageParamNames.add("wiki"); |
| 262 } // initBadPageParamNames | 534 } |
| 263 | 535 |
| 264 } | 536 } |
| OLD | NEW |