Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 package org.chromium.distiller; | |
| 6 | |
| 7 import com.google.gwt.regexp.shared.RegExp; | |
| 8 | |
| 9 /** | |
| 10 * This class handles the URL page pattern generated from a potential pagination URL, i.e. a URL | |
| 11 * containing the PageParameterDetector.PAGE_PARAM_PLACEHOLDER (see comments at top of | |
| 12 * PageParameterDetector.java). | |
| 13 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is | |
| 14 * "http://www.foo.com/a/b-[*!].html". | |
| 15 * This class determines the type of pattern (inside query or path), validates p attern against | |
| 16 * document URL, and determines if it's a paging URL based on the document URL. | |
| 17 */ | |
| 18 public class PagePattern { | |
|
cjhopman
2015/04/07 00:45:49
After commenting a bit below, I wonder if it would
kuan
2015/04/10 22:41:27
Done. i did two - QueryParamPagePattern and PathCo
| |
| 19 private final ParsedUrl mUrl; | |
| 20 private final int mPageParamPos; | |
|
cjhopman
2015/04/07 00:45:49
Some of this code below was hard for me to follow.
kuan
2015/04/10 22:41:27
Done.
| |
| 21 private final String mUrlStr; | |
| 22 private final int mQueryPos; | |
| 23 private int mComponentPos; // Start position of either page param query or path component. | |
| 24 private int mParamIndex = -1; // Page param path component position in list of path components. | |
| 25 | |
| 26 /** | |
| 27 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER. | |
| 28 */ | |
| 29 static PagePattern create(String url) { | |
|
cjhopman
2015/04/07 00:45:48
How about having a create method for page pattern
kuan
2015/04/10 22:41:27
Done.
| |
| 30 try { | |
| 31 return new PagePattern(url); | |
| 32 } catch (Exception e) { | |
| 33 return null; | |
| 34 } | |
| 35 } | |
| 36 | |
| 37 /** | |
| 38 * Validates this page pattern according to the current document URL through a pipeline of | |
| 39 * rules: | |
| 40 * - for query page parameter, pattern and URL must have same path component s. | |
| 41 * - for path page parameter, | |
| 42 * - pattern and URL must have same number of path components. | |
|
cjhopman
2015/04/07 00:45:49
Personally, I'd have these function-level comments
kuan
2015/04/10 22:41:28
Done.
| |
| 43 * - if only 1 path component, both must have long-enough common prefix an d suffix. | |
| 44 * - else all pattern's components, except for page parameter, must be sam e as url's. | |
| 45 * - lastly, pattern's components cannot be calendar digits. | |
| 46 * | |
| 47 * Returns true if page pattern is valid. | |
| 48 * | |
| 49 * @param docUrl the current document URL | |
| 50 * @param pagePattern the page pattern to validate | |
| 51 */ | |
| 52 boolean isValidFor(ParsedUrl docUrl) { | |
|
cjhopman
2015/04/07 00:45:48
Would it make sense to check validity when the pag
kuan
2015/04/10 22:41:27
i don't believe so. multiple pagination URLs coul
| |
| 53 // If page parameter is a query, page pattern and doc URL must have the same path. | |
| 54 if (isQueryPattern()) { | |
| 55 return docUrl.getTrimmedPath().equalsIgnoreCase(mUrl.getTrimmedPath( )); | |
| 56 } | |
| 57 | |
| 58 final int urlPathComponentsLen = docUrl.getPathComponents().length; | |
| 59 final int patternPathComponentsLen = mUrl.getPathComponents().length; | |
| 60 | |
| 61 // If the page param is inside of path components, both the pattern and doc URL must have | |
| 62 // the similar path. | |
| 63 if (urlPathComponentsLen > patternPathComponentsLen) return false; | |
| 64 | |
| 65 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must | |
| 66 // be at least half of the entire component in doc URL, e.g doc URL is | |
| 67 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]". | |
| 68 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { | |
| 69 final String urlComponent = docUrl.getPathComponents()[0]; | |
| 70 final String patternComponent = mUrl.getPathComponents()[0]; | |
| 71 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent); | |
| 72 return (getLongestCommonSuffixLength(urlComponent, patternComponent, commonPrefixLen) + | |
|
cjhopman
2015/04/07 00:45:48
nit: could you put the common suffix in a local va
kuan
2015/04/10 22:41:28
Done.
| |
| 73 commonPrefixLen) * 2 >= urlComponent.length(); | |
| 74 } | |
| 75 | |
| 76 if (!hasSamePathComponentsAs(docUrl)) return false; | |
| 77 | |
| 78 if (isCalendarPage()) return false; | |
|
cjhopman
2015/04/07 00:45:48
Why do we check this here? it doesn't really depen
kuan
2015/04/10 22:41:28
it's true it doesn't depend on doc url. however,
| |
| 79 | |
| 80 return true; | |
| 81 } | |
| 82 | |
| 83 private static RegExp sSlashOrHtmExtRegExp = null; // Match either '/' or " .htm(l)". | |
| 84 | |
| 85 /** | |
| 86 * Returns true if a URL matches this page pattern based on a pipeline of ru les: | |
| 87 * - suffix (part of pattern after page param placeholder) must be same, and | |
| 88 * - for query page parameter, | |
| 89 * - scheme, host, and path must be same, and | |
| 90 * - query components, except that for page number, must be same in order and value, and | |
| 91 * - query value must be a plain number. | |
| 92 * - for path page parameter that is part of a path component, | |
| 93 * - if the first different character in path component is suffix, it must be a page parameter | |
| 94 * separator, followed by the page parameter in the pattern | |
| 95 * - else if it's page parameter, it and possible following digits must be a plain number. | |
| 96 * - for path page parameter that is the entire path component, | |
| 97 * - if URL has no page number param and previous path component, everythi ng else matches, or | |
| 98 * - if prefix is the same, URL doesn't have anyhing else | |
| 99 * - else url must have '/' at the same position as pattern's page paramet er path component, | |
| 100 * followed by a plain number. | |
| 101 * | |
| 102 * @param url the URL to evalutate | |
| 103 * @param pagePattern the URL page pattern to match with | |
| 104 */ | |
| 105 boolean isPagingUrl(String url) { | |
| 106 final int urlLen = url.length(); | |
| 107 final int patternLen = mUrlStr.length(); | |
| 108 boolean isDynamicParam = isDynamic(); | |
| 109 | |
| 110 // Both url and patterm must have the same suffix, if available. | |
|
cjhopman
2015/04/07 00:45:48
nit: s/patterm/pattern
kuan
2015/04/10 22:41:28
Done.
| |
| 111 int suffixLen = patternLen - mPageParamPos - | |
| 112 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN; | |
| 113 if (suffixLen != 0) { | |
| 114 int compareLen = suffixLen - (isDynamicParam ? 1 : 0); // Excludes '&' or '?'. | |
| 115 if (!url.regionMatches(urlLen - compareLen, mUrlStr, patternLen - co mpareLen, | |
| 116 compareLen)) { | |
| 117 return false; | |
| 118 } | |
| 119 } | |
| 120 | |
| 121 final int suffixPos = urlLen - suffixLen; | |
| 122 | |
|
cjhopman
2015/04/07 00:45:48
Could we split these three cases into their own fu
kuan
2015/04/10 22:41:27
Done.
| |
| 123 if (isDynamicParam) { | |
| 124 // If page parameter is dynamic, the url matches the pattern only wh en: | |
| 125 // 1. has same prefix (scheme, host, path) | |
| 126 // 2. has same query components with same value (except page numbe r query) in the same | |
| 127 // order. | |
| 128 // Examples: | |
| 129 // If page pattern is http://foo.com/a/b?queryA=v1&queryB=[*!]&query C=v3 | |
| 130 // Returns true for: | |
| 131 // - http://foo.com/a/b/?queryA=v1&queryC=v3 | |
| 132 // - http://foo.com/a/b/?queryA=v1&queyrB=4&queryC=v3 | |
| 133 // Otherwise, returns false. | |
| 134 // | |
| 135 // If page pattern is http://foo.com/a/b?page=[*!]&query=a | |
| 136 // Returns true for: | |
| 137 // - http://foo.com/a/b?query=a | |
| 138 // - http://foo.com/a/b?page=2&query=a | |
| 139 // Otherwise, returns false. | |
| 140 // | |
| 141 // If page pattern is http://foo.com/a/b?page=[*!] | |
| 142 // Returns true for: | |
| 143 // - http://foo.com/a/b/ | |
| 144 // - http://foo.com/a/b.html | |
| 145 // - http://foo.com/a/b.htm | |
| 146 // - http://foo.com/a/b?page=2 | |
| 147 // Otherwise, returns false. | |
| 148 | |
| 149 // Both url and pattern must have the same prefix. | |
| 150 if (suffixPos < mComponentPos || !url.regionMatches(0, mUrlStr, 0, m ComponentPos)) { | |
|
cjhopman
2015/04/07 00:45:48
how about:
url.endsWith(mRequiredSuffix)
and cal
kuan
2015/04/10 22:41:27
Done. i assume u mean for lines 110-119 (at the b
| |
| 151 return false; | |
| 152 } | |
| 153 | |
| 154 // If the url doesn't have page number query, it is fine. | |
| 155 if (mComponentPos == suffixPos) return true; | |
| 156 | |
| 157 // If the only difference in the page param query component of url a nd pattern is "/", | |
| 158 // ".html" or ".html", it is fine. | |
|
cjhopman
2015/04/07 00:45:48
s/.html/.htm for one of the two
kuan
2015/04/10 22:41:28
Done.
| |
| 159 String diffPart = url.substring(mComponentPos, suffixPos).toLowerCas e(); | |
|
cjhopman
2015/04/07 00:45:48
I'm not really sure what diffpart is here. What is
kuan
2015/04/10 22:41:28
yes, it's for the last set of examples.
| |
| 160 if (sSlashOrHtmExtRegExp == null) { | |
| 161 sSlashOrHtmExtRegExp = RegExp.compile("^\\/|(.html?)$", "i"); | |
| 162 } | |
| 163 if (sSlashOrHtmExtRegExp.test(diffPart)) return true; | |
| 164 | |
| 165 // Both url and pattern must have the same query name. | |
| 166 if (!url.regionMatches(mComponentPos, mUrlStr, mComponentPos, | |
| 167 mPageParamPos - mComponentPos)) { | |
| 168 return false; | |
| 169 } | |
| 170 | |
| 171 return isPlainNumber(url.substring(mPageParamPos, suffixPos)); | |
| 172 } // isDynamicParam | |
| 173 | |
| 174 // If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: | |
| 175 // - www.foo.com/a/abc-2.html | |
| 176 // - www.foo.com/a/abc.html. | |
| 177 // If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: | |
| 178 // - www.foo.com/a/2/abc.html | |
|
cjhopman
2015/04/07 00:45:49
The second part of this comment confused me becaus
kuan
2015/04/10 22:41:28
yes, i've now split up the comments since the func
| |
| 179 // - www.foo.com/a/abc.html | |
| 180 // - www.foo.com/abc.html. | |
| 181 | |
| 182 // Handle case where page param is part of the path component (as oppose d to being the | |
| 183 // entire path component). | |
| 184 if (isPartialPathComponent()) { | |
| 185 // The page param path component of both url and pattern must have t he same prefix. | |
| 186 if (urlLen < mComponentPos + suffixLen || | |
|
cjhopman
2015/04/07 00:45:49
url.startsWith(mRequiredPrefix)?
I guess if you d
kuan
2015/04/10 22:41:27
Done. i'm not sure wha tu mean by "make it clear"
| |
| 187 !url.regionMatches(0, mUrlStr, 0, mComponentPos)) { | |
| 188 return false; | |
| 189 } | |
| 190 | |
| 191 // Find the first different character in page param path component j ust before | |
| 192 // placeholder or suffix, then check if it's acceptable. | |
| 193 int firstDiffPos = mComponentPos; | |
| 194 int maxPos = Math.min(mPageParamPos, suffixPos); | |
| 195 for (; firstDiffPos < maxPos; firstDiffPos++) { | |
| 196 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) br eak; | |
| 197 } | |
| 198 if (firstDiffPos == suffixPos) { // First different character is th e suffix. | |
| 199 if (firstDiffPos + 1 == mPageParamPos && | |
|
cjhopman
2015/04/07 00:45:49
Hm, if I read this correctly, the following return
kuan
2015/04/10 22:41:28
yes it returns false, i verified in original code.
| |
| 200 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) { | |
| 201 return true; | |
| 202 } | |
| 203 } else if (firstDiffPos == mPageParamPos) { // First different char acter is page param. | |
| 204 if (isPlainNumber(url.substring(firstDiffPos, suffixPos))) retur n true; | |
| 205 } | |
| 206 | |
| 207 return false; | |
| 208 } // page param is part of the (not entire) path component. | |
| 209 | |
| 210 // Handle case where page param is the entire path component. | |
|
cjhopman
2015/04/07 00:45:49
This comment looks like it applies to just the nex
kuan
2015/04/10 22:41:28
Done. comment is no longer needed since i've extr
| |
| 211 int prevComponentPos = mUrlStr.lastIndexOf('/', mComponentPos - 1); | |
|
cjhopman
2015/04/07 00:45:48
Should this only be looking at the path? If not, w
kuan
2015/04/10 22:41:28
Done. good catch.
| |
| 212 if (prevComponentPos != -1) { | |
| 213 // The url doesn't have page number param and previous path componen t, like | |
|
cjhopman
2015/04/07 00:45:49
Stated like this, it seems the comment applies to
kuan
2015/04/10 22:41:28
Done.
| |
| 214 // www.foo.com/abc.html. | |
| 215 if (prevComponentPos + suffixLen == urlLen) { | |
| 216 return url.regionMatches(0, mUrlStr, 0, prevComponentPos); | |
| 217 } | |
| 218 } | |
| 219 | |
| 220 // If both url and pattern have the same prefix, url must have nothing e lse. | |
| 221 if (url.regionMatches(0, mUrlStr, 0, mComponentPos)) { | |
| 222 int acceptLen = mComponentPos + suffixLen; | |
| 223 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html. | |
| 224 if (acceptLen == urlLen) return true; | |
| 225 if (acceptLen > urlLen) return false; | |
| 226 | |
| 227 // While we are here, the url must have page number param, so the ur l must have a '/' | |
| 228 // at the pattern's path component start position. | |
| 229 if (url.charAt(mComponentPos) != '/') return false; | |
| 230 | |
| 231 return isPlainNumber(url.substring(mComponentPos + 1, suffixPos)); | |
| 232 } | |
| 233 | |
| 234 return false; | |
| 235 } | |
| 236 | |
| 237 private PagePattern(String urlStr) throws Exception { | |
| 238 mUrl = ParsedUrl.create(urlStr); | |
| 239 if (mUrl == null) throw new Exception("Invalid URL: " + urlStr); | |
| 240 mPageParamPos = urlStr.indexOf(PageParameterDetector.PAGE_PARAM_PLACEHOL DER); | |
| 241 if (mPageParamPos == -1) { | |
| 242 throw new Exception("Required page param placeholder is missing in " + urlStr); | |
| 243 } | |
| 244 mUrlStr = urlStr; | |
| 245 mQueryPos = mUrlStr.lastIndexOf('?', mPageParamPos - 1); | |
| 246 if (isQueryPattern()) { // Page param is one of the query components. | |
| 247 mComponentPos = mUrlStr.lastIndexOf('&', mPageParamPos - 1); | |
| 248 // Page param is the first query. | |
| 249 if (mComponentPos == -1) mComponentPos = mQueryPos; | |
| 250 } else { // Page param is one of the path components. | |
| 251 mComponentPos = mUrlStr.lastIndexOf('/', mPageParamPos); | |
| 252 } | |
| 253 } | |
| 254 | |
| 255 private boolean isQueryPattern() { | |
| 256 return mQueryPos != -1; | |
| 257 } | |
| 258 | |
| 259 private boolean isPathPattern() { | |
| 260 return !isQueryPattern() && mComponentPos != -1; | |
| 261 } | |
| 262 | |
| 263 private boolean isDynamic() { | |
| 264 return isQueryPattern() && mComponentPos > 0 && mUrlStr.charAt(mPagePara mPos - 1) == '='; | |
| 265 } | |
| 266 | |
| 267 private boolean isPartialPathComponent() { | |
| 268 return isPathPattern() && mUrlStr.charAt(mPageParamPos - 1) != '/'; | |
| 269 } | |
| 270 | |
| 271 private void determineParamIndex() { | |
|
cjhopman
2015/04/07 00:45:48
We should just call this in the constructor so tha
kuan
2015/04/10 22:41:27
Done.
| |
| 272 if (mParamIndex != -1 || !isPathPattern()) return; | |
| 273 final String[] pathComponents = mUrl.getPathComponents(); | |
| 274 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) { | |
| 275 if (pathComponents[mParamIndex].contains( | |
| 276 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) { | |
| 277 break; | |
| 278 } | |
| 279 } | |
| 280 } | |
| 281 | |
| 282 /** | |
| 283 * Returns true if, except for the path component containing the page param, the other path | |
| 284 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g. | |
| 285 * doc URL is /thread/12 and pattern is /thread/12/page/[*!]. | |
| 286 */ | |
| 287 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) { | |
| 288 determineParamIndex(); | |
| 289 final String[] urlComponents = docUrl.getPathComponents(); | |
| 290 final String[] patternComponents = mUrl.getPathComponents(); | |
| 291 boolean passedParamComponent = false; | |
| 292 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) { | |
| 293 if (i == mParamIndex && !passedParamComponent) { | |
| 294 passedParamComponent = true; | |
| 295 // Repeat current path component if doc URL has less components (as per comments | |
| 296 // just above, doc URL may have less components). | |
| 297 if (urlComponents.length < patternComponents.length) i--; | |
| 298 continue; | |
| 299 } | |
| 300 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false; | |
| 301 } | |
| 302 | |
| 303 return true; | |
| 304 } | |
| 305 | |
| 306 /** | |
| 307 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a | |
| 308 * false-positive. | |
| 309 */ | |
| 310 private boolean isCalendarPage() { | |
| 311 determineParamIndex(); | |
| 312 if (mParamIndex < 2) return false; | |
| 313 | |
| 314 // Only if param is the entire path component. This handles some cases erroneously | |
| 315 // considered false-positives e.g. first page is | |
| 316 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html, | |
| 317 // and second page is | |
| 318 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html, | |
| 319 // would be considered false-positives otherwise because of "2014" and " 07". | |
| 320 final String[] patternComponents = mUrl.getPathComponents(); | |
| 321 if (patternComponents[mParamIndex].length() != | |
| 322 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) { | |
| 323 return false; | |
| 324 } | |
| 325 | |
| 326 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]); | |
| 327 if (month > 0 && month <= 12) { | |
| 328 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]); | |
| 329 if (year > 1970 && year < 3000) return true; | |
| 330 } | |
| 331 | |
| 332 return false; | |
| 333 } | |
| 334 | |
| 335 private static int getLongestCommonPrefixLength(String str1, String str2) { | |
| 336 if (str1.isEmpty() || str2.isEmpty()) return 0; | |
| 337 | |
| 338 int limit = Math.min(str1.length(), str2.length()); | |
| 339 int i = 0; | |
| 340 for (; i < limit; i++) { | |
| 341 if (str1.charAt(i) != str2.charAt(i)) break; | |
| 342 } | |
| 343 return i; | |
| 344 } | |
| 345 | |
| 346 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) { | |
| 347 int commonSuffixLen = 0; | |
| 348 for (int i = str1.length() - 1, j = str2.length() - 1; | |
| 349 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { | |
| 350 if (str1.charAt(i) != str2.charAt(i)) break; | |
| 351 } | |
| 352 return commonSuffixLen; | |
| 353 } | |
| 354 | |
| 355 /** | |
| 356 * Returns true if given string can be converted to a number >= 0. | |
| 357 */ | |
| 358 private static boolean isPlainNumber(String str) { | |
| 359 return StringUtil.toNumber(str) >= 0; | |
| 360 } | |
| 361 | |
| 362 /** | |
| 363 * Returns true if given character is one of '-', '_', ';', ','. | |
| 364 */ | |
| 365 public static native boolean isPageParamSeparator(Character c) /*-{ | |
| 366 return /[-_;,]/.test(c); | |
| 367 }-*/; | |
| 368 | |
| 369 } | |
| OLD | NEW |