OLD | NEW |
---|---|
(Empty) | |
1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 package org.chromium.distiller; | |
6 | |
7 import com.google.gwt.regexp.shared.MatchResult; | |
8 import com.google.gwt.regexp.shared.RegExp; | |
9 | |
10 /** | |
11 * This class detects the page parameter in the path of a potential pagination U RL. If detected, | |
12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEH OLDER, then creates | |
13 * and returns a new object. This object can then be accessed via PageParameter Detector.PagePattern | |
14 * interface to: | |
15 * - validate the generated URL page pattern against the document URL | |
16 * - determine if a URL is a paging URL based on the page pattern. | |
17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is | |
18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDete ctor.java). | |
19 */ | |
20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn { | |
21 private final ParsedUrl mUrl; | |
22 private final int mPageNumber; | |
23 private final int mPlaceholderStart; | |
24 private final String mUrlStr; | |
25 // Start position of path component containing placeholder. | |
26 private int mPlaceholderSegmentStart; | |
27 // Page param path component in list of path components. | |
28 private int mParamIndex = -1; | |
29 private final String mPrefix; // The part of the page pattern before the pla ceholder. | |
30 private String mSuffix = ""; // The part of the page pattern after the plac eholder. | |
31 | |
32 /** | |
33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER. | |
34 */ | |
35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart, | |
36 int digitEnd) { | |
37 try { | |
38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd); | |
39 } catch (Exception e) { | |
cjhopman
2015/04/16 21:58:32
We shouldn't be throwing/catching the base Excepti
kuan
2015/04/20 23:11:13
Done. ditto for QueryParamPagePattern.
| |
40 return null; | |
41 } | |
42 } | |
43 | |
44 @Override | |
45 public String toString() { | |
46 return mUrlStr; | |
47 } | |
48 | |
49 @Override | |
50 public int getPageNumber() { | |
51 return mPageNumber; | |
52 } | |
53 | |
54 /** | |
55 * Returns true if pattern and URL are sufficiently similar and the pattern' s components are not | |
56 * calendar digits. | |
57 * | |
58 * @param docUrl the current document URL | |
59 */ | |
60 @Override | |
61 public boolean isValidFor(ParsedUrl docUrl) { | |
62 final int urlPathComponentsLen = docUrl.getPathComponents().length; | |
63 final int patternPathComponentsLen = mUrl.getPathComponents().length; | |
64 | |
65 // If the page param is inside of path components, both the pattern and doc URL must have | |
cjhopman
2015/04/16 21:58:32
We know that the page param is inside of path comp
kuan
2015/04/20 23:11:13
Done.
| |
66 // the similar path. | |
67 if (urlPathComponentsLen > patternPathComponentsLen) return false; | |
68 | |
69 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must | |
70 // be at least half of the entire component in doc URL, e.g doc URL is | |
71 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]". | |
72 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { | |
73 final String urlComponent = docUrl.getPathComponents()[0]; | |
74 final String patternComponent = mUrl.getPathComponents()[0]; | |
75 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent); | |
76 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent, | |
77 commonPrefixLen); | |
78 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h(); | |
79 } | |
80 | |
81 if (!hasSamePathComponentsAs(docUrl)) return false; | |
82 | |
83 if (isCalendarPage()) return false; | |
84 | |
85 return true; | |
86 } | |
87 | |
88 /** | |
89 * Returns true if a URL matches this page pattern based on a pipeline of ru les: | |
90 * - suffix (part of pattern after page param placeholder) must be same, and | |
91 * - for path page parameter that is part of a path component, | |
92 * - if the first different character in path component is suffix, it must be a page parameter | |
93 * separator, followed by the page parameter in the pattern | |
94 * - else if it's page parameter, it and possible following digits must be a plain number. | |
95 * - for path page parameter that is the entire path component, | |
96 * - if URL has no page number param and previous path component, everythi ng else matches, or | |
97 * - if prefix is the same, URL doesn't have anyhing else | |
98 * - else url must have '/' at the same position as pattern's page paramet er path component, | |
99 * followed by a plain number. | |
100 * | |
101 * @param url the URL to evalutate | |
102 */ | |
103 @Override | |
104 public boolean isPagingUrl(String url) { | |
105 // Both url and pattern must have the same suffix, if available. | |
106 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false; | |
107 | |
108 return isPartialPathComponent() ? isPartialPathComponentPagingUrl(url) : | |
109 isEntirePathComponentPagingUrl(url); | |
110 } | |
111 | |
112 private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStar t, int digitEnd) | |
113 throws Exception { | |
114 final String urlStr = url.toString(); | |
115 if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEn d)) { | |
116 throw new Exception("Bad last numeric path component"); | |
117 } | |
118 | |
119 String valueStr = urlStr.substring(digitStart, digitEnd); | |
120 int value = StringUtil.toNumber(valueStr); | |
121 if (value < 0) { | |
122 throw new Exception("Value in path component is an invalid number: " + valueStr); | |
123 } | |
124 | |
125 String pattern = urlStr.substring(0, digitStart) + | |
126 PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring( digitEnd); | |
127 mUrl = ParsedUrl.create(pattern); | |
128 if (mUrl == null) throw new Exception("Invalid URL: " + pattern); | |
129 mUrlStr = pattern; | |
130 mPageNumber = value; | |
131 mPlaceholderStart = digitStart; | |
132 mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart); | |
133 determineParamIndex(); | |
134 mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart); | |
135 // Determine suffix, if available. | |
136 final int urlLen = mUrlStr.length(); | |
137 int suffixLen = urlLen - mPlaceholderStart - | |
138 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN; | |
139 if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen); | |
140 } | |
141 | |
142 private boolean isPartialPathComponent() { | |
143 return mUrlStr.charAt(mPlaceholderStart - 1) != '/'; | |
cjhopman
2015/04/16 21:58:32
this is strange, shouldn't
foo.com/article/2page/
kuan
2015/04/20 23:11:13
good point, so i did more testing w/ the original
| |
144 } | |
145 | |
146 private void determineParamIndex() { | |
147 final String[] pathComponents = mUrl.getPathComponents(); | |
148 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) { | |
149 if (pathComponents[mParamIndex].contains( | |
150 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) { | |
151 break; | |
152 } | |
153 } | |
154 } | |
155 | |
156 /** | |
157 * Returns true if, except for the path component containing the page param, the other path | |
158 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.: | |
159 * - doc URL is /thread/12, pattern is /thread/12/page/[*!] | |
160 * returns true because "thread" and "12" in doc URL match those in patter n | |
161 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo | |
162 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param | |
163 path component comes after. | |
164 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo | |
165 * returns true because "foo" in doc URL would match "foo" in pattern whos e page param path | |
166 * component is skipped when matching. | |
167 */ | |
168 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) { | |
169 determineParamIndex(); | |
cjhopman
2015/04/16 21:58:32
don't need to call this, it's done in the construc
kuan
2015/04/20 23:11:13
Done. i forgot :(
| |
170 final String[] urlComponents = docUrl.getPathComponents(); | |
171 final String[] patternComponents = mUrl.getPathComponents(); | |
172 boolean passedParamComponent = false; | |
173 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) { | |
174 if (i == mParamIndex && !passedParamComponent) { | |
175 passedParamComponent = true; | |
176 // Repeat current path component if doc URL has less components (as per comments | |
177 // just above, doc URL may have less components). | |
178 if (urlComponents.length < patternComponents.length) i--; | |
179 continue; | |
180 } | |
181 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false; | |
182 } | |
183 | |
184 return true; | |
185 } | |
186 | |
187 /** | |
188 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a | |
189 * false-positive. | |
190 */ | |
191 private boolean isCalendarPage() { | |
192 determineParamIndex(); | |
193 if (mParamIndex < 2) return false; | |
194 | |
195 // Only if param is the entire path component. This handles some cases erroneously | |
196 // considered false-positives e.g. first page is | |
197 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html, | |
198 // and second page is | |
199 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html, | |
200 // would be considered false-positives otherwise because of "2014" and " 07". | |
201 final String[] patternComponents = mUrl.getPathComponents(); | |
202 if (patternComponents[mParamIndex].length() != | |
203 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) { | |
204 return false; | |
205 } | |
206 | |
207 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]); | |
208 if (month > 0 && month <= 12) { | |
209 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]); | |
210 if (year > 1970 && year < 3000) return true; | |
211 } | |
212 | |
213 return false; | |
214 } | |
215 | |
216 private static int getLongestCommonPrefixLength(String str1, String str2) { | |
217 if (str1.isEmpty() || str2.isEmpty()) return 0; | |
218 | |
219 int limit = Math.min(str1.length(), str2.length()); | |
220 int i = 0; | |
221 for (; i < limit; i++) { | |
222 if (str1.charAt(i) != str2.charAt(i)) break; | |
223 } | |
224 return i; | |
225 } | |
226 | |
227 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) { | |
228 int commonSuffixLen = 0; | |
229 for (int i = str1.length() - 1, j = str2.length() - 1; | |
230 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { | |
231 if (str1.charAt(i) != str2.charAt(i)) break; | |
232 } | |
233 return commonSuffixLen; | |
234 } | |
235 | |
236 /** | |
237 * Returns true if url is a paging URL based on the page pattern where the p age param is part | |
238 * of a path component. | |
239 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: | |
240 * - www.foo.com/a/abc-2.html | |
241 * - www.foo.com/a/abc.html. | |
242 */ | |
243 private boolean isPartialPathComponentPagingUrl(String url) { | |
244 final int urlLen = url.length(); | |
245 final int suffixStart = urlLen - mSuffix.length(); | |
246 | |
247 // The page param path component of both url and pattern must have the s ame prefix. | |
248 if (!url.startsWith(mPrefix)) return false; | |
249 | |
250 // Find the first different character in page param path component just before | |
251 // placeholder or suffix, then check if it's acceptable. | |
252 int firstDiffPos = mPlaceholderSegmentStart; | |
253 int maxPos = Math.min(mPlaceholderStart, suffixStart); | |
254 for (; firstDiffPos < maxPos; firstDiffPos++) { | |
255 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break; | |
256 } | |
257 if (firstDiffPos == suffixStart) { // First different character is the suffix. | |
258 if (firstDiffPos + 1 == mPlaceholderStart && | |
259 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) { | |
260 return true; | |
261 } | |
262 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param. | |
263 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) { | |
264 return true; | |
265 } | |
266 } | |
267 | |
268 return false; | |
269 } | |
270 | |
271 /** | |
272 * Returns true if url is a paging URL based on the page pattern where the p age param is the | |
273 * entire path component. | |
274 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: | |
275 * - www.foo.com/a/2/abc.html | |
276 * - www.foo.com/a/abc.html | |
277 * - www.foo.com/abc.html. | |
278 */ | |
279 private boolean isEntirePathComponentPagingUrl(String url) { | |
280 final int urlLen = url.length(); | |
281 final int suffixLen = mSuffix.length(); | |
282 final int suffixStart = url.length() - suffixLen; | |
283 | |
284 int prevComponentPos = mUrl.getPath().lastIndexOf('/', | |
285 // We're only looking in the path, so the reverse search should start at the index | |
286 // excluding the url's origin. | |
287 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length()); | |
288 if (prevComponentPos != -1) { | |
289 // Now, add back the url's origin to the index of previous path comp onent. | |
290 prevComponentPos += mUrl.getOrigin().length(); | |
291 if (prevComponentPos + suffixLen == urlLen) { | |
292 // The url doesn't have page number param and previous path comp onent, like | |
293 // www.foo.com/abc.html. | |
294 return url.regionMatches(0, mUrlStr, 0, prevComponentPos); | |
295 } | |
296 } | |
297 | |
298 // If both url and pattern have the same prefix, url must have nothing e lse. | |
299 if (url.startsWith(mPrefix)) { | |
300 int acceptLen = mPlaceholderSegmentStart + suffixLen; | |
301 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html. | |
302 if (acceptLen == urlLen) return true; | |
303 if (acceptLen > urlLen) return false; | |
304 | |
305 // While we are here, the url must have page number param, so the ur l must have a '/' | |
306 // at the pattern's path component start position. | |
307 if (url.charAt(mPlaceholderSegmentStart) != '/') return false; | |
308 | |
309 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1, | |
310 suffixStart)); | |
311 } | |
312 | |
313 return false; | |
314 } | |
315 | |
316 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | |
317 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | |
318 | |
319 /** | |
320 * Returns true if: | |
321 * - the digitStart to digitEnd of urlStr is the last path component, and | |
322 * - the entire path component is numeric, and | |
323 * - the previous path component is a bad page param name. | |
324 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad | |
325 * page param. | |
326 */ | |
327 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart, | |
328 int digitEnd) { | |
329 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | |
330 pathStart < digitStart - 1) { // Not the first path component. | |
331 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | |
332 // Checks that this is the last path component, and trailing charact ers, if available, | |
333 // are (s)htm(l) extensions. | |
334 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i"); | |
335 if (sExtRegExp.test(postMatch)) { | |
336 // Entire component is numeric, get previous path component. | |
337 if (sLastPathComponentRegExp == null) { | |
338 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ; | |
339 } | |
340 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | |
341 urlStr.substring(pathStart + 1, digitStart)); | |
342 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 && | |
343 PageParameterDetector.isPageParamNameBad(prevPathCompone nt.getGroup(1))) { | |
344 return true; | |
345 } | |
346 } // last numeric path component | |
347 } | |
348 | |
349 return false; | |
350 } | |
351 | |
352 /** | |
353 * Returns true if given character is one of '-', '_', ';', ','. | |
354 */ | |
355 private static native boolean isPageParamSeparator(Character c) /*-{ | |
356 return /[-_;,]/.test(c); | |
357 }-*/; | |
358 | |
359 } | |
OLD | NEW |