| OLD | NEW |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
| 6 | 6 |
| 7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
| 8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
| 9 | 9 |
| 10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
| 11 import java.util.Arrays; | 11 import java.util.Arrays; |
| 12 import java.util.Collections; | 12 import java.util.Collections; |
| 13 import java.util.HashMap; | 13 import java.util.HashMap; |
| 14 import java.util.HashSet; | 14 import java.util.HashSet; |
| 15 import java.util.List; | 15 import java.util.List; |
| 16 import java.util.Map; | 16 import java.util.Map; |
| 17 import java.util.Set; | 17 import java.util.Set; |
| 18 | 18 |
| 19 /** | 19 /** |
| 20 * Background: | 20 * Background: |
| 21 * The long article/news/forum thread/blog document may be partitioned into se
veral partial pages | 21 * The long article/news/forum thread/blog document may be partitioned into se
veral partial pages |
| 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The | 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The |
| 23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w
hich contains the | 23 * anchor text of those outlinks is numeric. |
| 24 * whole content, called "single page". | |
| 25 * | 24 * |
| 26 * Definitions: | 25 * Definitions: |
| 27 * A single page document is a document that contains the whole content. | |
| 28 * A paging document is one of the partial pages. | 26 * A paging document is one of the partial pages. |
| 29 * "digital" means the text contains only digits. | 27 * "digital" means the text contains only digits. |
| 30 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder | 28 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder |
| 31 * (PAGE_PARAM_PLACEHOLDER). | 29 * (PAGE_PARAM_PLACEHOLDER). |
| 32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat
tern is | 30 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat
tern is |
| 33 * "http: *www.foo.com/a/b-[*!].html". | 31 * "http://www.foo.com/a/b-[*!].html". |
| 34 * | 32 * |
| 35 * This class extracts the page parameter from a document's outlinks. | 33 * This class extracts the page parameter from a document's outlinks. |
| 36 * The basic idea: | 34 * The basic idea: |
| 37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. | 35 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. |
| 38 * #2. For each group, determine the relationship between digital anchor texts
and digital parts | 36 * #2. For each group, determine the relationship between digital anchor texts
and digital parts |
| 39 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear | 37 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear |
| 40 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. | 38 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. |
| 41 * | 39 * |
| 42 * As an example, consider a document http: *a/b?c=1&p=10, which contains the fo
llowing digital | 40 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo
llowing digital |
| 43 * outlinks: | 41 * outlinks: |
| 44 * <a href=http: *a/b?c=1&p=20>3</a> | 42 * <a href=http://a/b?c=1&p=20>3</a> |
| 45 * <a href=http: *a/b?c=1&p=30>4</a> | 43 * <a href=http://a/b?c=1&p=30>4</a> |
| 46 * <a href=http: *a/b?c=1&p=40>5</a> | 44 * <a href=http://a/b?c=1&p=40>5</a> |
| 47 * <a href=http: *a/b?c=1&p=all>single page</a> | |
| 48 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so | 45 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so |
| 49 * guesses it is the page parameter. The associated page pattern is http: *a/b?
c=1&p=[*!]. | 46 * guesses it is the page parameter. The associated page pattern is http://a/b?
c=1&p=[*!]. |
| 50 * Then, this class extracts the single page based on page parameter info. The
single page url is | |
| 51 * http: *a/b?c=1&p=all. | |
| 52 */ | 47 */ |
| 53 public class PageParameterDetector { | 48 public class PageParameterDetector { |
| 54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 49 static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
| 50 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(
); |
| 55 | 51 |
| 56 /** | 52 /** |
| 57 * Stores information about the link (anchor) after the page parameter is de
tected: | 53 * The interface that page pattern handlers must implement to detect page pa
rameter from |
| 58 * - the page number (as represented by the original plain text) for the lin
k | 54 * potential pagination URLs. |
| 59 * - the original page parameter numeric component in the URL (this componen
t would be replaced | |
| 60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | |
| 61 * - the position of this link in the list of ascending numbers. | |
| 62 */ | 55 */ |
| 63 static class LinkInfo { | 56 interface PagePattern { |
| 64 private int mPageNum; | 57 /** |
| 65 private int mPageParamValue; | 58 * Returns the string of the URL page pattern. |
| 66 private int mPosInAscendingList; | 59 */ |
| 60 String toString(); |
| 67 | 61 |
| 68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { | 62 /** |
| 69 mPageNum = pageNum; | 63 * Returns the page number extracted from the URL during creation of obje
ct that implements |
| 70 mPageParamValue = pageParamValue; | 64 * this interface. |
| 71 mPosInAscendingList = posInAscendingList; | 65 */ |
| 72 } | 66 int getPageNumber(); |
| 73 } // LinkInfo | 67 |
| 68 /** |
| 69 * Validates this page pattern according to the current document URL thr
ough a pipeline of |
| 70 * rules. |
| 71 * |
| 72 * Returns true if page pattern is valid. |
| 73 * |
| 74 * @param docUrl the current document URL |
| 75 */ |
| 76 boolean isValidFor(ParsedUrl docUrl); |
| 77 |
| 78 /** |
| 79 * Returns true if a URL matches this page pattern based on a pipeline o
f rules. |
| 80 * |
| 81 * @param url the URL to evaluate |
| 82 */ |
| 83 boolean isPagingUrl(String url); |
| 84 } |
| 74 | 85 |
| 75 /** | 86 /** |
| 76 * Stores a map of URL pattern to its associated list of LinkInfo's. | 87 * Stores a map of URL pattern to its associated list of PageLinkInfo's. |
| 77 */ | 88 */ |
| 78 private static class PageCandidatesMap { | 89 private static class PageCandidatesMap { |
| 79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List
<LinkInfo>>(); | 90 private static class Info { |
| 91 private final PagePattern mPattern; |
| 92 private final List<PageLinkInfo> mLinks; |
| 80 | 93 |
| 81 /** | 94 Info(PagePattern pattern, PageLinkInfo link) { |
| 82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al
ready exists, adds | 95 mPattern = pattern; |
| 83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent
ry. | 96 mLinks = new ArrayList<PageLinkInfo>(); |
| 84 */ | 97 mLinks.add(link); |
| 85 private void add(String urlPattern, LinkInfo link) { | |
| 86 if (map.containsKey(urlPattern)) { | |
| 87 map.get(urlPattern).add(link); | |
| 88 } else { | |
| 89 List<LinkInfo> links = new ArrayList<LinkInfo>(); | |
| 90 links.add(link); | |
| 91 map.put(urlPattern, links); | |
| 92 } | 98 } |
| 93 } | 99 } |
| 94 | 100 |
| 95 } // PageCandidatesMap | 101 private final Map<String, Info> map = new HashMap<String, Info>(); |
| 102 |
| 103 /** |
| 104 * Adds urlPattern with its PageLinkInfo into the map. If the urlPatter
n already exists, |
| 105 * adds the link to the list of PageLinkInfo's. Otherwise, creates a new ma
p entry. |
| 106 */ |
| 107 private void add(PagePattern pattern, PageLinkInfo link) { |
| 108 final String patternStr = pattern.toString(); |
| 109 if (map.containsKey(patternStr)) { |
| 110 map.get(patternStr).mLinks.add(link); |
| 111 } else { |
| 112 map.put(patternStr, new Info(pattern, link)); |
| 113 } |
| 114 } |
| 115 } |
| 96 | 116 |
| 97 // All the known bad page param names. | 117 // All the known bad page param names. |
| 98 private static Set<String> sBadPageParamNames = null; | 118 private static Set<String> sBadPageParamNames = null; |
| 99 | 119 |
| 100 /** | 120 /** |
| 101 * Extracts page parameter candidates from the query part of given URL and a
dds the associated | 121 * Extracts page parameter candidates from the query part of given URL and a
dds the associated |
| 102 * links into pageCandidates which is keyed by page pattern. | 122 * links into pageCandidates which is keyed by page pattern. |
| 103 * | 123 * |
| 104 * A page parameter candidate is one where: | 124 * A page parameter candidate is one where: |
| 105 * - the name of a query name-value component is not one of sBadPageParamNam
es, and | 125 * - the name of a query name-value component is not one of sBadPageParamNam
es, and |
| 106 * - the value of the query component is a plain number (>= 0). | 126 * - the value of the query component is a plain number (>= 0). |
| 107 * E.g. a URL query with 3 plain number query values will generate 3 URL pag
e patterns with 3 | 127 * E.g. a URL query with 3 plain number query values will generate 3 URL pag
e patterns with 3 |
| 108 * LinkInfo's, and hence 3 page parameter candidates. | 128 * PageLinkInfo's, and hence 3 page parameter candidates. |
| 109 * | 129 * |
| 110 * @param url ParsedUrl of the URL to process | 130 * @param url ParsedUrl of the URL to process |
| 111 * @param pageNum the page number as represented in original plain text | 131 * @param pageNum the page number as represented in original plain text |
| 112 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers | 132 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers |
| 113 * @param pageCandidates the map of URL pattern to its associated list of Li
nkInfo's | 133 * @param pageCandidates the map of URL pattern to its associated list of Pa
geLinkInfo's |
| 114 */ | 134 */ |
| 115 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p
ageNum, | 135 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p
ageNum, |
| 116 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 136 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
| 117 String[][] queryParams = url.getQueryParams(); | 137 String[][] queryParams = url.getQueryParams(); |
| 118 if (queryParams.length == 0) return; // No query. | 138 if (queryParams.length == 0) return; // No query. |
| 119 | 139 |
| 120 for (String[] nameValue : queryParams) { | 140 for (String[] nameValue : queryParams) { |
| 121 final String queryName = nameValue[0]; | 141 PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0]
, nameValue[1]); |
| 122 final String queryValue = nameValue[1]; | 142 if (pattern != null) { |
| 123 if (!queryName.isEmpty() && !queryValue.isEmpty() && | 143 pageCandidates.add(pattern, |
| 124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName
Bad(queryName)) { | 144 new PageLinkInfo(pageNum, pattern.getPageNumber(), posIn
AscendingNumbers)); |
| 125 int value = StringUtil.toNumber(queryValue); | |
| 126 if (value >= 0) { | |
| 127 pageCandidates.add( | |
| 128 url.replaceQueryValue(queryName, queryValue, PAGE_PA
RAM_PLACEHOLDER), | |
| 129 new LinkInfo(pageNum, value, posInAscendingNumbers))
; | |
| 130 } | |
| 131 } | 145 } |
| 132 } | 146 } |
| 133 } // extractPageParamCandidatesFromQuery | 147 } |
| 134 | 148 |
| 135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. | 149 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
| 136 | 150 |
| 137 /** | 151 /** |
| 138 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) | 152 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) |
| 139 * and adds the associated links into pageCandidates which is keyed by page
pattern. | 153 * and adds the associated links into pageCandidates which is keyed by page
pattern. |
| 140 * | 154 * |
| 141 * A page parameter candidate is one where a path component contains consecu
tive digits which | 155 * A page parameter candidate is one where a path component contains consecu
tive digits which |
| 142 * can be converted to a plain number (>= 0). | 156 * can be converted to a plain number (>= 0). |
| 143 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page | 157 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page |
| 144 * patterns with 3 LinkInfo's, and hence 3 page parameter candidates. | 158 * patterns with 3 PageLinkInfo's, and hence 3 page parameter candidates. |
| 145 * | 159 * |
| 146 * @param url ParsedUrl of the URL to process | 160 * @param url ParsedUrl of the URL to process |
| 147 * @param pageNum the page number as represented in original plain text | 161 * @param pageNum the page number as represented in original plain text |
| 148 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers | 162 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers |
| 149 * @param pageCandidates the map of URL pattern to its associated list of Li
nkInfo's | 163 * @param pageCandidates the map of URL pattern to its associated list of Pa
geLinkInfo's |
| 150 */ | 164 */ |
| 151 | 165 |
| 152 private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pa
geNum, | 166 private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pa
geNum, |
| 153 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 167 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
| 154 String path = url.getTrimmedPath(); | 168 String path = url.getTrimmedPath(); |
| 155 if (path.isEmpty() || !StringUtil.containsDigit(path)) return; | 169 if (path.isEmpty() || !StringUtil.containsDigit(path)) return; |
| 156 | 170 |
| 157 // Extract digits (either one or consecutive) from path, replace the dig
it(s) with | 171 // Extract digits (either one or consecutive) from path, replace the dig
it(s) with |
| 158 // PAGE_PARAM_PLACEHOLDER to formulate the page pattern, add it as page c
andidate. | 172 // PAGE_PARAM_PLACEHOLDER to formulate the page pattern, add it as page c
andidate. |
| 159 final String urlStr = url.toString(); | 173 final String urlStr = url.toString(); |
| 160 final int pathStart = url.getOrigin().length(); | 174 final int pathStart = url.getOrigin().length(); |
| 161 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi"
); | 175 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi"
); |
| 162 sDigitsRegExp.setLastIndex(pathStart); | 176 sDigitsRegExp.setLastIndex(pathStart); |
| 163 while (true) { | 177 while (true) { |
| 164 MatchResult match = sDigitsRegExp.exec(urlStr); | 178 MatchResult match = sDigitsRegExp.exec(urlStr); |
| 165 if (match == null) break; | 179 if (match == null) break; |
| 166 | 180 |
| 167 final int matchEnd = sDigitsRegExp.getLastIndex(); | 181 final int matchEnd = sDigitsRegExp.getLastIndex(); |
| 168 final int matchStart = matchEnd - match.getGroup(1).length(); | 182 final int matchStart = matchEnd - match.getGroup(1).length(); |
| 169 | 183 PagePattern pattern = PathComponentPagePattern.create(url, pathStart
, matchStart, |
| 170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat
chEnd)) continue; | 184 matchEnd); |
| 171 | 185 if (pattern != null) { |
| 172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn
d)); | 186 pageCandidates.add(pattern, |
| 173 if (value >= 0) { | 187 new PageLinkInfo(pageNum, pattern.getPageNumber(), posIn
AscendingNumbers)); |
| 174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_
PLACEHOLDER + | |
| 175 urlStr.substring(matchEnd), | |
| 176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | |
| 177 } | 188 } |
| 178 } // while there're matches | 189 } // while there're matches |
| 179 } // extractPageParamCandidatesFromPath | 190 } |
| 180 | 191 |
| 181 /** | 192 /** |
| 182 * Returns true if given name is blacklisted as a known bad page param name. | 193 * Returns true if given name is blacklisted as a known bad page param name. |
| 183 */ | 194 */ |
| 184 private static boolean isPageParamNameBad(String name) { | 195 static boolean isPageParamNameBad(String name) { |
| 185 initBadPageParamNames(); | 196 initBadPageParamNames(); |
| 186 return sBadPageParamNames.contains(name.toLowerCase()); | 197 return sBadPageParamNames.contains(name.toLowerCase()); |
| 187 } // isPageParamNameBad | 198 } |
| 188 | |
| 189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | |
| 190 private static RegExp sLastPathComponentRegExp = null; // Match last path c
omponent. | |
| 191 | 199 |
| 192 /** | 200 /** |
| 193 * Returns true if: | 201 * Returns true if given string can be converted to a number >= 0. |
| 194 * - the digitStart to digitEnd of urlStr is the last path component, and | |
| 195 * - the entire path component is numeric, and | |
| 196 * - the previous path component is a bad page param name. | |
| 197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an
d "tag" is a bad | |
| 198 * page param. | |
| 199 */ | 202 */ |
| 200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 203 static boolean isPlainNumber(String str) { |
| 201 int digitStart, int digitEnd) { | 204 return StringUtil.toNumber(str) >= 0; |
| 202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path
component. | 205 } |
| 203 pathStart < digitStart - 1) { // Not the first path component. | |
| 204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | |
| 205 // Checks that this is the last path component, and trailing charact
ers, if available, | |
| 206 // are (s)htm(l) extensions. | |
| 207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$",
"i"); | |
| 208 if (sExtRegExp.test(postMatch)) { | |
| 209 // Entire component is numeric, get previous path component. | |
| 210 if (sLastPathComponentRegExp == null) { | |
| 211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i")
; | |
| 212 } | |
| 213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | |
| 214 urlStr.substring(pathStart + 1, digitStart)); | |
| 215 if (prevPathComponent != null && prevPathComponent.getGroupCount
() > 1 && | |
| 216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | |
| 217 return true; | |
| 218 } | |
| 219 } // last numeric path component | |
| 220 } | |
| 221 | |
| 222 return false; | |
| 223 } // isLastNumericPathComponentBad | |
| 224 | 206 |
| 225 /** | 207 /** |
| 226 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in | 208 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in |
| 227 * alphabetical order. | 209 * alphabetical order. |
| 228 */ | 210 */ |
| 229 private static void initBadPageParamNames() { | 211 private static void initBadPageParamNames() { |
| 230 if (sBadPageParamNames != null) return; | 212 if (sBadPageParamNames != null) return; |
| 231 | 213 |
| 232 sBadPageParamNames = new HashSet<String>(); | 214 sBadPageParamNames = new HashSet<String>(); |
| 233 sBadPageParamNames.add("baixar-gratis"); | 215 sBadPageParamNames.add("baixar-gratis"); |
| (...skipping 18 matching lines...) Expand all Loading... |
| 252 sBadPageParamNames.add("search_keyword"); | 234 sBadPageParamNames.add("search_keyword"); |
| 253 sBadPageParamNames.add("search_query"); | 235 sBadPageParamNames.add("search_query"); |
| 254 sBadPageParamNames.add("sortby"); | 236 sBadPageParamNames.add("sortby"); |
| 255 sBadPageParamNames.add("subscriptions"); | 237 sBadPageParamNames.add("subscriptions"); |
| 256 sBadPageParamNames.add("tag"); | 238 sBadPageParamNames.add("tag"); |
| 257 sBadPageParamNames.add("tags"); | 239 sBadPageParamNames.add("tags"); |
| 258 sBadPageParamNames.add("video"); | 240 sBadPageParamNames.add("video"); |
| 259 sBadPageParamNames.add("videos"); | 241 sBadPageParamNames.add("videos"); |
| 260 sBadPageParamNames.add("w"); | 242 sBadPageParamNames.add("w"); |
| 261 sBadPageParamNames.add("wiki"); | 243 sBadPageParamNames.add("wiki"); |
| 262 } // initBadPageParamNames | 244 } |
| 263 | 245 |
| 264 } | 246 } |
| OLD | NEW |