OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 package org.chromium.distiller; | 5 package org.chromium.distiller; |
6 | 6 |
7 import com.google.gwt.regexp.shared.MatchResult; | 7 import com.google.gwt.regexp.shared.MatchResult; |
8 import com.google.gwt.regexp.shared.RegExp; | 8 import com.google.gwt.regexp.shared.RegExp; |
9 | 9 |
10 import java.util.ArrayList; | 10 import java.util.ArrayList; |
11 import java.util.Arrays; | 11 import java.util.Arrays; |
12 import java.util.Collections; | 12 import java.util.Collections; |
13 import java.util.HashMap; | 13 import java.util.HashMap; |
14 import java.util.HashSet; | 14 import java.util.HashSet; |
15 import java.util.List; | 15 import java.util.List; |
16 import java.util.Map; | 16 import java.util.Map; |
17 import java.util.Set; | 17 import java.util.Set; |
18 | 18 |
19 /** | 19 /** |
20 * Background: | 20 * Background: |
21 * The long article/news/forum thread/blog document may be partitioned into se
veral partial pages | 21 * The long article/news/forum thread/blog document may be partitioned into se
veral partial pages |
22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The | 22 * by webmaster. Each partial page has outlinks pointing to the adjacent part
ial pages. The |
23 * anchor text of those outlinks is numeric. Meanwhile, there may be a page w
hich contains the | 23 * anchor text of those outlinks is numeric. |
24 * whole content, called "single page". | |
25 * | 24 * |
26 * Definitions: | 25 * Definitions: |
27 * A single page document is a document that contains the whole content. | |
28 * A paging document is one of the partial pages. | 26 * A paging document is one of the partial pages. |
29 * "digital" means the text contains only digits. | 27 * "digital" means the text contains only digits. |
30 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder | 28 * A page pattern is a paging URL whose page parameter value is replaced with a
place holder |
31 * (PAGE_PARAM_PLACEHOLDER). | 29 * (PAGE_PARAM_PLACEHOLDER). |
32 * Example: if the original url is "http: *www.foo.com/a/b-3.html", the page pat
tern is | 30 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat
tern is |
33 * "http: *www.foo.com/a/b-[*!].html". | 31 * "http://www.foo.com/a/b-[*!].html". |
34 * | 32 * |
35 * This class extracts the page parameter from a document's outlinks. | 33 * This class extracts the page parameter from a document's outlinks. |
36 * The basic idea: | 34 * The basic idea: |
37 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. | 35 * #1. Collect groups of adjacent plain text numbers and outlinks with digital
anchor text. |
38 * #2. For each group, determine the relationship between digital anchor texts
and digital parts | 36 * #2. For each group, determine the relationship between digital anchor texts
and digital parts |
39 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear | 37 * (either a query value or a path component) in URL. If one part of a UR
L is always a linear |
40 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. | 38 * map from its digital anchor text, we guess the part is the page parame
ter of the URL. |
41 * | 39 * |
42 * As an example, consider a document http: *a/b?c=1&p=10, which contains the fo
llowing digital | 40 * As an example, consider a document http://a/b?c=1&p=10, which contains the fo
llowing digital |
43 * outlinks: | 41 * outlinks: |
44 * <a href=http: *a/b?c=1&p=20>3</a> | 42 * <a href=http://a/b?c=1&p=20>3</a> |
45 * <a href=http: *a/b?c=1&p=30>4</a> | 43 * <a href=http://a/b?c=1&p=30>4</a> |
46 * <a href=http: *a/b?c=1&p=40>5</a> | 44 * <a href=http://a/b?c=1&p=40>5</a> |
47 * <a href=http: *a/b?c=1&p=all>single page</a> | |
48 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so | 45 * This class finds that the "p" parameter is always equal to "anchor text" * 10
- 10, and so |
49 * guesses it is the page parameter. The associated page pattern is http: *a/b?
c=1&p=[*!]. | 46 * guesses it is the page parameter. The associated page pattern is http://a/b?
c=1&p=[*!]. |
50 * Then, this class extracts the single page based on page parameter info. The
single page url is | |
51 * http: *a/b?c=1&p=all. | |
52 */ | 47 */ |
53 public class PageParameterDetector { | 48 public class PageParameterDetector { |
54 private static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; | 49 static final String PAGE_PARAM_PLACEHOLDER = "[*!]"; |
| 50 static final int PAGE_PARAM_PLACEHOLDER_LEN = PAGE_PARAM_PLACEHOLDER.length(
); |
55 | 51 |
56 /** | 52 /** |
57 * Stores information about the link (anchor) after the page parameter is de
tected: | 53 * The interface that page pattern handlers must implement to detect page pa
rameter from |
58 * - the page number (as represented by the original plain text) for the lin
k | 54 * potential pagination URLs. |
59 * - the original page parameter numeric component in the URL (this componen
t would be replaced | |
60 * by PAGE_PARAM_PLACEHOLDER in the URL pattern) | |
61 * - the position of this link in the list of ascending numbers. | |
62 */ | 55 */ |
63 static class LinkInfo { | 56 interface PagePattern { |
64 private int mPageNum; | 57 /** |
65 private int mPageParamValue; | 58 * Returns the string of the URL page pattern. |
66 private int mPosInAscendingList; | 59 */ |
| 60 String toString(); |
67 | 61 |
68 LinkInfo(int pageNum, int pageParamValue, int posInAscendingList) { | 62 /** |
69 mPageNum = pageNum; | 63 * Returns the page number extracted from the URL during creation of obje
ct that implements |
70 mPageParamValue = pageParamValue; | 64 * this interface. |
71 mPosInAscendingList = posInAscendingList; | 65 */ |
72 } | 66 int getPageNumber(); |
73 } // LinkInfo | 67 |
| 68 /** |
| 69 * Validates this page pattern according to the current document URL thr
ough a pipeline of |
| 70 * rules. |
| 71 * |
| 72 * Returns true if page pattern is valid. |
| 73 * |
| 74 * @param docUrl the current document URL |
| 75 */ |
| 76 boolean isValidFor(ParsedUrl docUrl); |
| 77 |
| 78 /** |
| 79 * Returns true if a URL matches this page pattern based on a pipeline o
f rules. |
| 80 * |
| 81 * @param url the URL to evaluate |
| 82 */ |
| 83 boolean isPagingUrl(String url); |
| 84 } |
74 | 85 |
75 /** | 86 /** |
76 * Stores a map of URL pattern to its associated list of LinkInfo's. | 87 * Stores a map of URL pattern to its associated list of PageLinkInfo's. |
77 */ | 88 */ |
78 private static class PageCandidatesMap { | 89 private static class PageCandidatesMap { |
79 private final Map<String, List<LinkInfo>> map = new HashMap<String, List
<LinkInfo>>(); | 90 private static class Info { |
| 91 private final PagePattern mPattern; |
| 92 private final List<PageLinkInfo> mLinks; |
80 | 93 |
81 /** | 94 Info(PagePattern pattern, PageLinkInfo link) { |
82 * Adds urlPattern with its LinkInfo into the map. If the urlPattern al
ready exists, adds | 95 mPattern = pattern; |
83 * the link to the list of LinkInfo's. Otherwise, creates a new map ent
ry. | 96 mLinks = new ArrayList<PageLinkInfo>(); |
84 */ | 97 mLinks.add(link); |
85 private void add(String urlPattern, LinkInfo link) { | |
86 if (map.containsKey(urlPattern)) { | |
87 map.get(urlPattern).add(link); | |
88 } else { | |
89 List<LinkInfo> links = new ArrayList<LinkInfo>(); | |
90 links.add(link); | |
91 map.put(urlPattern, links); | |
92 } | 98 } |
93 } | 99 } |
94 | 100 |
95 } // PageCandidatesMap | 101 private final Map<String, Info> map = new HashMap<String, Info>(); |
| 102 |
| 103 /** |
| 104 * Adds urlPattern with its PageLinkInfo into the map. If the urlPatter
n already exists, |
| 105 * adds the link to the list of PageLinkInfo's. Otherwise, creates a new ma
p entry. |
| 106 */ |
| 107 private void add(PagePattern pattern, PageLinkInfo link) { |
| 108 final String patternStr = pattern.toString(); |
| 109 if (map.containsKey(patternStr)) { |
| 110 map.get(patternStr).mLinks.add(link); |
| 111 } else { |
| 112 map.put(patternStr, new Info(pattern, link)); |
| 113 } |
| 114 } |
| 115 } |
96 | 116 |
97 // All the known bad page param names. | 117 // All the known bad page param names. |
98 private static Set<String> sBadPageParamNames = null; | 118 private static Set<String> sBadPageParamNames = null; |
99 | 119 |
100 /** | 120 /** |
101 * Extracts page parameter candidates from the query part of given URL and a
dds the associated | 121 * Extracts page parameter candidates from the query part of given URL and a
dds the associated |
102 * links into pageCandidates which is keyed by page pattern. | 122 * links into pageCandidates which is keyed by page pattern. |
103 * | 123 * |
104 * A page parameter candidate is one where: | 124 * A page parameter candidate is one where: |
105 * - the name of a query name-value component is not one of sBadPageParamNam
es, and | 125 * - the name of a query name-value component is not one of sBadPageParamNam
es, and |
106 * - the value of the query component is a plain number (>= 0). | 126 * - the value of the query component is a plain number (>= 0). |
107 * E.g. a URL query with 3 plain number query values will generate 3 URL pag
e patterns with 3 | 127 * E.g. a URL query with 3 plain number query values will generate 3 URL pag
e patterns with 3 |
108 * LinkInfo's, and hence 3 page parameter candidates. | 128 * PageLinkInfo's, and hence 3 page parameter candidates. |
109 * | 129 * |
110 * @param url ParsedUrl of the URL to process | 130 * @param url ParsedUrl of the URL to process |
111 * @param pageNum the page number as represented in original plain text | 131 * @param pageNum the page number as represented in original plain text |
112 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers | 132 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers |
113 * @param pageCandidates the map of URL pattern to its associated list of Li
nkInfo's | 133 * @param pageCandidates the map of URL pattern to its associated list of Pa
geLinkInfo's |
114 */ | 134 */ |
115 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p
ageNum, | 135 private static void extractPageParamCandidatesFromQuery(ParsedUrl url, int p
ageNum, |
116 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 136 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
117 String[][] queryParams = url.getQueryParams(); | 137 String[][] queryParams = url.getQueryParams(); |
118 if (queryParams.length == 0) return; // No query. | 138 if (queryParams.length == 0) return; // No query. |
119 | 139 |
120 for (String[] nameValue : queryParams) { | 140 for (String[] nameValue : queryParams) { |
121 final String queryName = nameValue[0]; | 141 PagePattern pattern = QueryParamPagePattern.create(url, nameValue[0]
, nameValue[1]); |
122 final String queryValue = nameValue[1]; | 142 if (pattern != null) { |
123 if (!queryName.isEmpty() && !queryValue.isEmpty() && | 143 pageCandidates.add(pattern, |
124 StringUtil.isStringAllDigits(queryValue) && !isPageParamName
Bad(queryName)) { | 144 new PageLinkInfo(pageNum, pattern.getPageNumber(), posIn
AscendingNumbers)); |
125 int value = StringUtil.toNumber(queryValue); | |
126 if (value >= 0) { | |
127 pageCandidates.add( | |
128 url.replaceQueryValue(queryName, queryValue, PAGE_PA
RAM_PLACEHOLDER), | |
129 new LinkInfo(pageNum, value, posInAscendingNumbers))
; | |
130 } | |
131 } | 145 } |
132 } | 146 } |
133 } // extractPageParamCandidatesFromQuery | 147 } |
134 | 148 |
135 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. | 149 private static RegExp sDigitsRegExp = null; // Match at least 1 digit. |
136 | 150 |
137 /** | 151 /** |
138 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) | 152 * Extracts page parameter candidates from the path part of given URL (witho
ut query components) |
139 * and adds the associated links into pageCandidates which is keyed by page
pattern. | 153 * and adds the associated links into pageCandidates which is keyed by page
pattern. |
140 * | 154 * |
141 * A page parameter candidate is one where a path component contains consecu
tive digits which | 155 * A page parameter candidate is one where a path component contains consecu
tive digits which |
142 * can be converted to a plain number (>= 0). | 156 * can be converted to a plain number (>= 0). |
143 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page | 157 * E.g. a URL path with 3 path components that contain plain numbers will ge
nerate 3 URL page |
144 * patterns with 3 LinkInfo's, and hence 3 page parameter candidates. | 158 * patterns with 3 PageLinkInfo's, and hence 3 page parameter candidates. |
145 * | 159 * |
146 * @param url ParsedUrl of the URL to process | 160 * @param url ParsedUrl of the URL to process |
147 * @param pageNum the page number as represented in original plain text | 161 * @param pageNum the page number as represented in original plain text |
148 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers | 162 * @param posInAscendingNumbers position of this page number in the list of
ascending numbers |
149 * @param pageCandidates the map of URL pattern to its associated list of Li
nkInfo's | 163 * @param pageCandidates the map of URL pattern to its associated list of Pa
geLinkInfo's |
150 */ | 164 */ |
151 | 165 |
152 private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pa
geNum, | 166 private static void extractPageParamCandidatesFromPath(ParsedUrl url, int pa
geNum, |
153 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { | 167 int posInAscendingNumbers, PageCandidatesMap pageCandidates) { |
154 String path = url.getTrimmedPath(); | 168 String path = url.getTrimmedPath(); |
155 if (path.isEmpty() || !StringUtil.containsDigit(path)) return; | 169 if (path.isEmpty() || !StringUtil.containsDigit(path)) return; |
156 | 170 |
157 // Extract digits (either one or consecutive) from path, replace the dig
it(s) with | 171 // Extract digits (either one or consecutive) from path, replace the dig
it(s) with |
158 // PAGE_PARAM_PLACEHOLDER to formulate the page pattern, add it as page c
andidate. | 172 // PAGE_PARAM_PLACEHOLDER to formulate the page pattern, add it as page c
andidate. |
159 final String urlStr = url.toString(); | 173 final String urlStr = url.toString(); |
160 final int pathStart = url.getOrigin().length(); | 174 final int pathStart = url.getOrigin().length(); |
161 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi"
); | 175 if (sDigitsRegExp == null) sDigitsRegExp = RegExp.compile("(\\d+)", "gi"
); |
162 sDigitsRegExp.setLastIndex(pathStart); | 176 sDigitsRegExp.setLastIndex(pathStart); |
163 while (true) { | 177 while (true) { |
164 MatchResult match = sDigitsRegExp.exec(urlStr); | 178 MatchResult match = sDigitsRegExp.exec(urlStr); |
165 if (match == null) break; | 179 if (match == null) break; |
166 | 180 |
167 final int matchEnd = sDigitsRegExp.getLastIndex(); | 181 final int matchEnd = sDigitsRegExp.getLastIndex(); |
168 final int matchStart = matchEnd - match.getGroup(1).length(); | 182 final int matchStart = matchEnd - match.getGroup(1).length(); |
169 | 183 PagePattern pattern = PathComponentPagePattern.create(url, pathStart
, matchStart, |
170 if (isLastNumericPathComponentBad(urlStr, pathStart, matchStart, mat
chEnd)) continue; | 184 matchEnd); |
171 | 185 if (pattern != null) { |
172 int value = StringUtil.toNumber(urlStr.substring(matchStart, matchEn
d)); | 186 pageCandidates.add(pattern, |
173 if (value >= 0) { | 187 new PageLinkInfo(pageNum, pattern.getPageNumber(), posIn
AscendingNumbers)); |
174 pageCandidates.add(urlStr.substring(0, matchStart) + PAGE_PARAM_
PLACEHOLDER + | |
175 urlStr.substring(matchEnd), | |
176 new LinkInfo(pageNum, value, posInAscendingNumbers)); | |
177 } | 188 } |
178 } // while there're matches | 189 } // while there're matches |
179 } // extractPageParamCandidatesFromPath | 190 } |
180 | 191 |
181 /** | 192 /** |
182 * Returns true if given name is blacklisted as a known bad page param name. | 193 * Returns true if given name is blacklisted as a known bad page param name. |
183 */ | 194 */ |
184 private static boolean isPageParamNameBad(String name) { | 195 static boolean isPageParamNameBad(String name) { |
185 initBadPageParamNames(); | 196 initBadPageParamNames(); |
186 return sBadPageParamNames.contains(name.toLowerCase()); | 197 return sBadPageParamNames.contains(name.toLowerCase()); |
187 } // isPageParamNameBad | 198 } |
188 | |
189 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | |
190 private static RegExp sLastPathComponentRegExp = null; // Match last path c
omponent. | |
191 | 199 |
192 /** | 200 /** |
193 * Returns true if: | 201 * Returns true if given string can be converted to a number >= 0. |
194 * - the digitStart to digitEnd of urlStr is the last path component, and | |
195 * - the entire path component is numeric, and | |
196 * - the previous path component is a bad page param name. | |
197 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an
d "tag" is a bad | |
198 * page param. | |
199 */ | 202 */ |
200 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, | 203 static boolean isPlainNumber(String str) { |
201 int digitStart, int digitEnd) { | 204 return StringUtil.toNumber(str) >= 0; |
202 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path
component. | 205 } |
203 pathStart < digitStart - 1) { // Not the first path component. | |
204 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | |
205 // Checks that this is the last path component, and trailing charact
ers, if available, | |
206 // are (s)htm(l) extensions. | |
207 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$",
"i"); | |
208 if (sExtRegExp.test(postMatch)) { | |
209 // Entire component is numeric, get previous path component. | |
210 if (sLastPathComponentRegExp == null) { | |
211 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i")
; | |
212 } | |
213 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | |
214 urlStr.substring(pathStart + 1, digitStart)); | |
215 if (prevPathComponent != null && prevPathComponent.getGroupCount
() > 1 && | |
216 isPageParamNameBad(prevPathComponent.getGroup(1))) { | |
217 return true; | |
218 } | |
219 } // last numeric path component | |
220 } | |
221 | |
222 return false; | |
223 } // isLastNumericPathComponentBad | |
224 | 206 |
225 /** | 207 /** |
226 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in | 208 * If sBadPageParamNames is null, initialize it with all the known bad page
param names, in |
227 * alphabetical order. | 209 * alphabetical order. |
228 */ | 210 */ |
229 private static void initBadPageParamNames() { | 211 private static void initBadPageParamNames() { |
230 if (sBadPageParamNames != null) return; | 212 if (sBadPageParamNames != null) return; |
231 | 213 |
232 sBadPageParamNames = new HashSet<String>(); | 214 sBadPageParamNames = new HashSet<String>(); |
233 sBadPageParamNames.add("baixar-gratis"); | 215 sBadPageParamNames.add("baixar-gratis"); |
(...skipping 18 matching lines...) Expand all Loading... |
252 sBadPageParamNames.add("search_keyword"); | 234 sBadPageParamNames.add("search_keyword"); |
253 sBadPageParamNames.add("search_query"); | 235 sBadPageParamNames.add("search_query"); |
254 sBadPageParamNames.add("sortby"); | 236 sBadPageParamNames.add("sortby"); |
255 sBadPageParamNames.add("subscriptions"); | 237 sBadPageParamNames.add("subscriptions"); |
256 sBadPageParamNames.add("tag"); | 238 sBadPageParamNames.add("tag"); |
257 sBadPageParamNames.add("tags"); | 239 sBadPageParamNames.add("tags"); |
258 sBadPageParamNames.add("video"); | 240 sBadPageParamNames.add("video"); |
259 sBadPageParamNames.add("videos"); | 241 sBadPageParamNames.add("videos"); |
260 sBadPageParamNames.add("w"); | 242 sBadPageParamNames.add("w"); |
261 sBadPageParamNames.add("wiki"); | 243 sBadPageParamNames.add("wiki"); |
262 } // initBadPageParamNames | 244 } |
263 | 245 |
264 } | 246 } |
OLD | NEW |