Chromium Code Reviews| OLD | NEW |
|---|---|
| (Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 package org.chromium.distiller; | |
| 6 | |
| 7 import com.google.gwt.regexp.shared.MatchResult; | |
| 8 import com.google.gwt.regexp.shared.RegExp; | |
| 9 | |
| 10 /** | |
| 11 * This class detects the page parameter in the path of a potential pagination URL. If detected, | |
| 12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates | |
| 13 * and returns a new object. This object can then be accessed via PageParameterDetector.PagePattern | |
| 14 * interface to: | |
| 15 * - validate the generated URL page pattern against the document URL | |
| 16 * - determine if a URL is a paging URL based on the page pattern. | |
| 17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is | |
| 18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDetector.java). | |
| 19 */ | |
| 20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn { | |
| 21 private final ParsedUrl mUrl; | |
| 22 private final int mPageNumber; | |
| 23 private final int mPlaceholderStart; | |
| 24 private final String mUrlStr; | |
| 25 // Start position of path component containing placeholder. | |
| 26 private int mPlaceholderSegmentStart; | |
| 27 // Page param path component in list of path components. | |
| 28 private int mParamIndex = -1; | |
| 29 private final String mPrefix; // The part of the page pattern before the pla ceholder. | |
| 30 private String mSuffix = ""; // The part of the page pattern after the plac eholder. | |
| 31 | |
| 32 /** | |
| 33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHOLDER. | |
| 34 */ | |
| 35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart, | |
| 36 int digitEnd) { | |
| 37 try { | |
| 38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd); | |
| 39 } catch (Exception e) { | |
|
cjhopman
2015/04/16 21:58:32
We shouldn't be throwing/catching the base Excepti
kuan
2015/04/20 23:11:13
Done. ditto for QueryParamPagePattern.
| |
| 40 return null; | |
| 41 } | |
| 42 } | |
| 43 | |
| 44 @Override | |
| 45 public String toString() { | |
| 46 return mUrlStr; | |
| 47 } | |
| 48 | |
| 49 @Override | |
| 50 public int getPageNumber() { | |
| 51 return mPageNumber; | |
| 52 } | |
| 53 | |
| 54 /** | |
| 55 * Returns true if pattern and URL are sufficiently similar and the pattern's components are not | |
| 56 * calendar digits. | |
| 57 * | |
| 58 * @param docUrl the current document URL | |
| 59 */ | |
| 60 @Override | |
| 61 public boolean isValidFor(ParsedUrl docUrl) { | |
| 62 final int urlPathComponentsLen = docUrl.getPathComponents().length; | |
| 63 final int patternPathComponentsLen = mUrl.getPathComponents().length; | |
| 64 | |
| 65 // If the page param is inside of path components, both the pattern and doc URL must have | |
|
cjhopman
2015/04/16 21:58:32
We know that the page param is inside of path comp
kuan
2015/04/20 23:11:13
Done.
| |
| 66 // the similar path. | |
| 67 if (urlPathComponentsLen > patternPathComponentsLen) return false; | |
| 68 | |
| 69 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must | |
| 70 // be at least half of the entire component in doc URL, e.g. doc URL is | |
| 71 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads-132-[*!]". | |
| 72 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { | |
| 73 final String urlComponent = docUrl.getPathComponents()[0]; | |
| 74 final String patternComponent = mUrl.getPathComponents()[0]; | |
| 75 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent); | |
| 76 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent, | |
| 77 commonPrefixLen); | |
| 78 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h(); | |
| 79 } | |
| 80 | |
| 81 if (!hasSamePathComponentsAs(docUrl)) return false; | |
| 82 | |
| 83 if (isCalendarPage()) return false; | |
| 84 | |
| 85 return true; | |
| 86 } | |
| 87 | |
| 88 /** | |
| 89 * Returns true if a URL matches this page pattern based on a pipeline of rules: | |
| 90 * - suffix (part of pattern after page param placeholder) must be same, and | |
| 91 * - for path page parameter that is part of a path component, | |
| 92 * - if the first different character in path component is suffix, it must be a page parameter | |
| 93 * separator, followed by the page parameter in the pattern | |
| 94 * - else if it's page parameter, it and possible following digits must be a plain number. | |
| 95 * - for path page parameter that is the entire path component, | |
| 96 * - if URL has no page number param and previous path component, everything else matches, or | |
| 97 * - if prefix is the same, URL doesn't have anything else | |
| 98 * - else url must have '/' at the same position as pattern's page parameter path component, | |
| 99 * followed by a plain number. | |
| 100 * | |
| 101 * @param url the URL to evaluate | |
| 102 */ | |
| 103 @Override | |
| 104 public boolean isPagingUrl(String url) { | |
| 105 // Both url and pattern must have the same suffix, if available. | |
| 106 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false; | |
| 107 | |
| 108 return isPartialPathComponent() ? isPartialPathComponentPagingUrl(url) : | |
| 109 isEntirePathComponentPagingUrl(url); | |
| 110 } | |
| 111 | |
| 112 private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStar t, int digitEnd) | |
| 113 throws Exception { | |
| 114 final String urlStr = url.toString(); | |
| 115 if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEn d)) { | |
| 116 throw new Exception("Bad last numeric path component"); | |
| 117 } | |
| 118 | |
| 119 String valueStr = urlStr.substring(digitStart, digitEnd); | |
| 120 int value = StringUtil.toNumber(valueStr); | |
| 121 if (value < 0) { | |
| 122 throw new Exception("Value in path component is an invalid number: " + valueStr); | |
| 123 } | |
| 124 | |
| 125 String pattern = urlStr.substring(0, digitStart) + | |
| 126 PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring( digitEnd); | |
| 127 mUrl = ParsedUrl.create(pattern); | |
| 128 if (mUrl == null) throw new Exception("Invalid URL: " + pattern); | |
| 129 mUrlStr = pattern; | |
| 130 mPageNumber = value; | |
| 131 mPlaceholderStart = digitStart; | |
| 132 mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart); | |
| 133 determineParamIndex(); | |
| 134 mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart); | |
| 135 // Determine suffix, if available. | |
| 136 final int urlLen = mUrlStr.length(); | |
| 137 int suffixLen = urlLen - mPlaceholderStart - | |
| 138 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN; | |
| 139 if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen); | |
| 140 } | |
| 141 | |
| 142 private boolean isPartialPathComponent() { | |
| 143 return mUrlStr.charAt(mPlaceholderStart - 1) != '/'; | |
|
cjhopman
2015/04/16 21:58:32
this is strange, shouldn't
foo.com/article/2page/
kuan
2015/04/20 23:11:13
good point, so i did more testing w/ the original
| |
| 144 } | |
| 145 | |
| 146 private void determineParamIndex() { | |
| 147 final String[] pathComponents = mUrl.getPathComponents(); | |
| 148 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) { | |
| 149 if (pathComponents[mParamIndex].contains( | |
| 150 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) { | |
| 151 break; | |
| 152 } | |
| 153 } | |
| 154 } | |
| 155 | |
| 156 /** | |
| 157 * Returns true if, except for the path component containing the page param, the other path | |
| 158 * components of doc URL are the same as pattern's. But pattern may have more components, e.g.: | |
| 159 * - doc URL is /thread/12, pattern is /thread/12/page/[*!] | |
| 160 * returns true because "thread" and "12" in doc URL match those in pattern | |
| 161 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo | |
| 162 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param | |
| 163 * path component comes after. | |
| 164 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo | |
| 165 * returns true because "foo" in doc URL would match "foo" in pattern whose page param path | |
| 166 * component is skipped when matching. | |
| 167 */ | |
| 168 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) { | |
| 169 determineParamIndex(); | |
|
cjhopman
2015/04/16 21:58:32
don't need to call this, it's done in the construc
kuan
2015/04/20 23:11:13
Done. i forgot :(
| |
| 170 final String[] urlComponents = docUrl.getPathComponents(); | |
| 171 final String[] patternComponents = mUrl.getPathComponents(); | |
| 172 boolean passedParamComponent = false; | |
| 173 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) { | |
| 174 if (i == mParamIndex && !passedParamComponent) { | |
| 175 passedParamComponent = true; | |
| 176 // Repeat current path component if doc URL has less components (as per comments | |
| 177 // just above, doc URL may have less components). | |
| 178 if (urlComponents.length < patternComponents.length) i--; | |
| 179 continue; | |
| 180 } | |
| 181 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false; | |
| 182 } | |
| 183 | |
| 184 return true; | |
| 185 } | |
| 186 | |
| 187 /** | |
| 188 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a | |
| 189 * false-positive. | |
| 190 */ | |
| 191 private boolean isCalendarPage() { | |
| 192 // mParamIndex is already set by the constructor and mUrl is final, so it is not recomputed here. | |
| 193 if (mParamIndex < 2) return false; | |
| 194 | |
| 195 // Only if param is the entire path component. This handles some cases erroneously | |
| 196 // considered false-positives e.g. first page is | |
| 197 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467.html, | |
| 198 // and second page is | |
| 199 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-questions-109467_Page2.html, | |
| 200 // would be considered false-positives otherwise because of "2014" and "07". | |
| 201 final String[] patternComponents = mUrl.getPathComponents(); | |
| // determineParamIndex() leaves mParamIndex == length when no component holds the placeholder; | |
| // treat that as "not a calendar page" instead of indexing past the end of the array. | |
| 202 if (mParamIndex >= patternComponents.length || patternComponents[mParamIndex].length() != | |
| 203 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) { | |
| 204 return false; | |
| 205 } | |
| 206 | |
| // Calendar shape: the component just before the param is a month (1..12) and the one before | |
| // that is a year (1971..2999). | |
| 207 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]); | |
| 208 if (month > 0 && month <= 12) { | |
| 209 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]); | |
| 210 if (year > 1970 && year < 3000) return true; | |
| 211 } | |
| 212 | |
| 213 return false; | |
| 214 } | |
| 215 | |
| 216 private static int getLongestCommonPrefixLength(String str1, String str2) { | |
| 217 if (str1.isEmpty() || str2.isEmpty()) return 0; | |
| 218 | |
| 219 int limit = Math.min(str1.length(), str2.length()); | |
| 220 int i = 0; | |
| 221 for (; i < limit; i++) { | |
| 222 if (str1.charAt(i) != str2.charAt(i)) break; | |
| 223 } | |
| 224 return i; | |
| 225 } | |
| 226 | |
| /** | |
| * Returns the number of trailing characters that str1 and str2 have in common, comparing | |
| * backwards from the end of each string and stopping before either index reaches startIndex. | |
| * | |
| * @param startIndex exclusive lower bound for both indices; the caller passes the length of the | |
| * common prefix so that prefix characters are not counted again as suffix. | |
| */ | |
| 227 private static int getLongestCommonSuffixLength(String str1, String str2, int startIndex) { | |
| 228 int commonSuffixLen = 0; | |
| 229 for (int i = str1.length() - 1, j = str2.length() - 1; | |
| 230 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { | |
| // Each string is walked with its own index: the lengths may differ, so str2 must be read | |
| // at j. Reading it at i compares the wrong characters and can run past the end of str2. | |
| 231 if (str1.charAt(i) != str2.charAt(j)) break; | |
| 232 } | |
| 233 return commonSuffixLen; | |
| 234 } | |
| 235 | |
| 236 /** | |
| 237 * Returns true if url is a paging URL based on the page pattern where the page param is part | |
| 238 * of a path component. | |
| 239 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: | |
| 240 * - www.foo.com/a/abc-2.html | |
| 241 * - www.foo.com/a/abc.html. | |
| 242 */ | |
| 243 private boolean isPartialPathComponentPagingUrl(String url) { | |
| 244 final int urlLen = url.length(); | |
| 245 final int suffixStart = urlLen - mSuffix.length(); | |
| 246 | |
| 247 // The page param path component of both url and pattern must have the s ame prefix. | |
| 248 if (!url.startsWith(mPrefix)) return false; | |
| 249 | |
| 250 // Find the first different character in page param path component just before | |
| 251 // placeholder or suffix, then check if it's acceptable. | |
| 252 int firstDiffPos = mPlaceholderSegmentStart; | |
| 253 int maxPos = Math.min(mPlaceholderStart, suffixStart); | |
| 254 for (; firstDiffPos < maxPos; firstDiffPos++) { | |
| 255 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break; | |
| 256 } | |
| 257 if (firstDiffPos == suffixStart) { // First different character is the suffix. | |
| 258 if (firstDiffPos + 1 == mPlaceholderStart && | |
| 259 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) { | |
| 260 return true; | |
| 261 } | |
| 262 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param. | |
| 263 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) { | |
| 264 return true; | |
| 265 } | |
| 266 } | |
| 267 | |
| 268 return false; | |
| 269 } | |
| 270 | |
| 271 /** | |
| 272 * Returns true if url is a paging URL based on the page pattern where the page param is the | |
| 273 * entire path component. | |
| 274 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: | |
| 275 * - www.foo.com/a/2/abc.html | |
| 276 * - www.foo.com/a/abc.html | |
| 277 * - www.foo.com/abc.html. | |
| 278 */ | |
| 279 private boolean isEntirePathComponentPagingUrl(String url) { | |
| 280 final int urlLen = url.length(); | |
| 281 final int suffixLen = mSuffix.length(); | |
| 282 final int suffixStart = url.length() - suffixLen; | |
| 283 | |
| 284 int prevComponentPos = mUrl.getPath().lastIndexOf('/', | |
| 285 // We're only looking in the path, so the reverse search should start at the index | |
| 286 // excluding the url's origin. | |
| 287 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length()); | |
| 288 if (prevComponentPos != -1) { | |
| 289 // Now, add back the url's origin to the index of previous path comp onent. | |
| 290 prevComponentPos += mUrl.getOrigin().length(); | |
| 291 if (prevComponentPos + suffixLen == urlLen) { | |
| 292 // The url doesn't have page number param and previous path comp onent, like | |
| 293 // www.foo.com/abc.html. | |
| 294 return url.regionMatches(0, mUrlStr, 0, prevComponentPos); | |
| 295 } | |
| 296 } | |
| 297 | |
| 298 // If both url and pattern have the same prefix, url must have nothing e lse. | |
| 299 if (url.startsWith(mPrefix)) { | |
| 300 int acceptLen = mPlaceholderSegmentStart + suffixLen; | |
| 301 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html. | |
| 302 if (acceptLen == urlLen) return true; | |
| 303 if (acceptLen > urlLen) return false; | |
| 304 | |
| 305 // While we are here, the url must have page number param, so the ur l must have a '/' | |
| 306 // at the pattern's path component start position. | |
| 307 if (url.charAt(mPlaceholderSegmentStart) != '/') return false; | |
| 308 | |
| 309 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1, | |
| 310 suffixStart)); | |
| 311 } | |
| 312 | |
| 313 return false; | |
| 314 } | |
| 315 | |
| 316 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). | |
| 317 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent. | |
| 318 | |
| 319 /** | |
| 320 * Returns true if: | |
| 321 * - the digitStart to digitEnd of urlStr is the last path component, and | |
| 322 * - the entire path component is numeric, and | |
| 323 * - the previous path component is a bad page param name. | |
| 324 * E.g. "www.foo.com/tag/2" will return true because of the above reasons and "tag" is a bad | |
| 325 * page param. | |
| 326 */ | |
| 327 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, int digitStart, | |
| 328 int digitEnd) { | |
| 329 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component. | |
| 330 pathStart < digitStart - 1) { // Not the first path component. | |
| 331 String postMatch = urlStr.substring(digitEnd).toLowerCase(); | |
| 332 // Checks that this is the last path component, and trailing characters, if available, | |
| 333 // are (s)htm(l) extensions. | |
| // The pattern is anchored with '^' and the dot is escaped: with only '$' and an optional | |
| // group, the expression matches every string and this check can never fail. | |
| 334 if (sExtRegExp == null) sExtRegExp = RegExp.compile("^(\\.s?html?)?$", "i"); | |
| 335 if (sExtRegExp.test(postMatch)) { | |
| 336 // Entire component is numeric, get previous path component. | |
| 337 if (sLastPathComponentRegExp == null) { | |
| 338 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i"); | |
| 339 } | |
| 340 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( | |
| 341 urlStr.substring(pathStart + 1, digitStart)); | |
| 342 if (prevPathComponent != null && prevPathComponent.getGroupCount() > 1 && | |
| 343 PageParameterDetector.isPageParamNameBad(prevPathComponent.getGroup(1))) { | |
| 344 return true; | |
| 345 } | |
| 346 } // last numeric path component | |
| 347 } | |
| 348 | |
| 349 return false; | |
| 350 } | |
| 351 | |
| 352 /** | |
| 353 * Returns true if given character is one of '-', '_', ';', ','. | |
| 354 */ | |
| 355 private static native boolean isPageParamSeparator(Character c) /*-{ | |
| 356 return /[-_;,]/.test(c); | |
| 357 }-*/; | |
| 358 | |
| 359 } | |
| OLD | NEW |