OLD | NEW |
(Empty) | |
| 1 // Copyright 2015 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. |
| 4 |
| 5 package org.chromium.distiller; |
| 6 |
| 7 import com.google.gwt.regexp.shared.MatchResult; |
| 8 import com.google.gwt.regexp.shared.RegExp; |
| 9 |
| 10 /** |
| 11 * This class detects the page parameter in the path of a potential pagination U
RL. If detected, |
| 12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEH
OLDER, then creates |
| 13 * and returns a new object. This object can then be accessed via PageParameter
Detector.PagePattern |
| 14 * interface to: |
| 15 * - validate the generated URL page pattern against the document URL |
| 16 * - determine if a URL is a paging URL based on the page pattern. |
| 17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat
tern is |
| 18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDete
ctor.java). |
| 19 */ |
| 20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte
rn { |
| 21 private final ParsedUrl mUrl; |
| 22 private final int mPageNumber; |
| 23 private final int mPlaceholderStart; |
| 24 private final String mUrlStr; |
| 25 // Start position of path component containing placeholder. |
| 26 private int mPlaceholderSegmentStart; |
| 27 // Page param path component in list of path components. |
| 28 private int mParamIndex = -1; |
| 29 private final String mPrefix; // The part of the page pattern before the pla
ceholder. |
| 30 private String mSuffix = ""; // The part of the page pattern after the plac
eholder. |
| 31 |
| 32 /** |
| 33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO
LDER. |
| 34 */ |
| 35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart
, int digitStart, |
| 36 int digitEnd) { |
| 37 try { |
| 38 return new PathComponentPagePattern(url, pathStart, digitStart, digi
tEnd); |
| 39 } catch (IllegalArgumentException e) { |
| 40 return null; |
| 41 } |
| 42 } |
| 43 |
| 44 @Override |
| 45 public String toString() { |
| 46 return mUrlStr; |
| 47 } |
| 48 |
| 49 @Override |
| 50 public int getPageNumber() { |
| 51 return mPageNumber; |
| 52 } |
| 53 |
| 54 /** |
| 55 * Returns true if pattern and URL are sufficiently similar and the pattern'
s components are not |
| 56 * calendar digits. |
| 57 * |
| 58 * @param docUrl the current document URL |
| 59 */ |
| 60 @Override |
| 61 public boolean isValidFor(ParsedUrl docUrl) { |
| 62 final int urlPathComponentsLen = docUrl.getPathComponents().length; |
| 63 final int patternPathComponentsLen = mUrl.getPathComponents().length; |
| 64 |
| 65 // Both the pattern and doc URL must have the similar path. |
| 66 if (urlPathComponentsLen > patternPathComponentsLen) return false; |
| 67 |
| 68 // If both doc URL and page pattern have only 1 component, their common
prefix+suffix must |
| 69 // be at least half of the entire component in doc URL, e.g doc URL is |
| 70 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads
-132-[*!]". |
| 71 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) { |
| 72 final String urlComponent = docUrl.getPathComponents()[0]; |
| 73 final String patternComponent = mUrl.getPathComponents()[0]; |
| 74 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat
ternComponent); |
| 75 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat
ternComponent, |
| 76 commonPrefixLen); |
| 77 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt
h(); |
| 78 } |
| 79 |
| 80 if (!hasSamePathComponentsAs(docUrl)) return false; |
| 81 |
| 82 if (isCalendarPage()) return false; |
| 83 |
| 84 return true; |
| 85 } |
| 86 |
| 87 /** |
| 88 * Returns true if a URL matches this page pattern based on a pipeline of ru
les: |
| 89 * - suffix (part of pattern after page param placeholder) must be same, and |
| 90 * - different set of rules depending on if page param is at start of path c
omponent or not. |
| 91 * |
| 92 * @param url the URL to evalutate |
| 93 */ |
| 94 @Override |
| 95 public boolean isPagingUrl(String url) { |
| 96 // Both url and pattern must have the same suffix, if available. |
| 97 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false; |
| 98 |
| 99 return atStartOfPathComponent() ? isPagingUrlForStartOfPathComponent(url
) : |
| 100 isPagingUrlForNotStartOfPathComponent(url); |
| 101 } |
| 102 |
| 103 private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStar
t, int digitEnd) |
| 104 throws IllegalArgumentException { |
| 105 final String urlStr = url.toString(); |
| 106 if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEn
d)) { |
| 107 throw new IllegalArgumentException("Bad last numeric path component"
); |
| 108 } |
| 109 |
| 110 String valueStr = urlStr.substring(digitStart, digitEnd); |
| 111 int value = StringUtil.toNumber(valueStr); |
| 112 if (value < 0) { |
| 113 throw new IllegalArgumentException("Value in path component is an in
valid number: " + |
| 114 valueStr); |
| 115 } |
| 116 |
| 117 String pattern = urlStr.substring(0, digitStart) + |
| 118 PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring(
digitEnd); |
| 119 mUrl = ParsedUrl.create(pattern); |
| 120 if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + p
attern); |
| 121 mUrlStr = pattern; |
| 122 mPageNumber = value; |
| 123 mPlaceholderStart = digitStart; |
| 124 mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart); |
| 125 determineParamIndex(); |
| 126 mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart); |
| 127 // Determine suffix, if available. |
| 128 final int urlLen = mUrlStr.length(); |
| 129 int suffixLen = urlLen - mPlaceholderStart - |
| 130 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN; |
| 131 if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen); |
| 132 } |
| 133 |
| 134 private boolean atStartOfPathComponent() { |
| 135 return mUrlStr.charAt(mPlaceholderStart - 1) == '/'; |
| 136 } |
| 137 |
| 138 private void determineParamIndex() { |
| 139 final String[] pathComponents = mUrl.getPathComponents(); |
| 140 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++
) { |
| 141 if (pathComponents[mParamIndex].contains( |
| 142 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) { |
| 143 break; |
| 144 } |
| 145 } |
| 146 } |
| 147 |
| 148 /** |
| 149 * Returns true if, except for the path component containing the page param,
the other path |
| 150 * components of doc URL are the same as pattern's. But pattern may have mo
re components, e.g.: |
| 151 * - doc URL is /thread/12, pattern is /thread/12/page/[*!] |
| 152 * returns true because "thread" and "12" in doc URL match those in patter
n |
| 153 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo |
| 154 * returns false because "foo" in doc URL doesn't match "page" in pattern
whose page param |
| 155 path component comes after. |
| 156 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo |
| 157 * returns true because "foo" in doc URL would match "foo" in pattern whos
e page param path |
| 158 * component is skipped when matching. |
| 159 */ |
| 160 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) { |
| 161 final String[] urlComponents = docUrl.getPathComponents(); |
| 162 final String[] patternComponents = mUrl.getPathComponents(); |
| 163 boolean passedParamComponent = false; |
| 164 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents
.length; i++, j++) { |
| 165 if (i == mParamIndex && !passedParamComponent) { |
| 166 passedParamComponent = true; |
| 167 // Repeat current path component if doc URL has less components
(as per comments |
| 168 // just above, doc URL may have less components). |
| 169 if (urlComponents.length < patternComponents.length) i--; |
| 170 continue; |
| 171 } |
| 172 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return
false; |
| 173 } |
| 174 |
| 175 return true; |
| 176 } |
| 177 |
| 178 /** |
| 179 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which
would be a |
| 180 * false-positive. |
| 181 */ |
| 182 private boolean isCalendarPage() { |
| 183 if (mParamIndex < 2) return false; |
| 184 |
| 185 // Only if param is the entire path component. This handles some cases
erroneously |
| 186 // considered false-positives e.g. first page is |
| 187 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-
questions-109467.html, |
| 188 // and second page is |
| 189 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal-
questions-109467_Page2.html, |
| 190 // would be considered false-positives otherwise because of "2014" and "
07". |
| 191 final String[] patternComponents = mUrl.getPathComponents(); |
| 192 if (patternComponents[mParamIndex].length() != |
| 193 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) { |
| 194 return false; |
| 195 } |
| 196 |
| 197 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]); |
| 198 if (month > 0 && month <= 12) { |
| 199 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]); |
| 200 if (year > 1970 && year < 3000) return true; |
| 201 } |
| 202 |
| 203 return false; |
| 204 } |
| 205 |
| 206 private static int getLongestCommonPrefixLength(String str1, String str2) { |
| 207 if (str1.isEmpty() || str2.isEmpty()) return 0; |
| 208 |
| 209 int limit = Math.min(str1.length(), str2.length()); |
| 210 int i = 0; |
| 211 for (; i < limit; i++) { |
| 212 if (str1.charAt(i) != str2.charAt(i)) break; |
| 213 } |
| 214 return i; |
| 215 } |
| 216 |
| 217 private static int getLongestCommonSuffixLength(String str1, String str2, in
t startIndex) { |
| 218 int commonSuffixLen = 0; |
| 219 for (int i = str1.length() - 1, j = str2.length() - 1; |
| 220 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) { |
| 221 if (str1.charAt(i) != str2.charAt(i)) break; |
| 222 } |
| 223 return commonSuffixLen; |
| 224 } |
| 225 |
| 226 /** |
| 227 * Returns true if url is a paging URL based on the page pattern where the p
age param is at the |
| 228 * start of a path component. |
| 229 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is: |
| 230 * - www.foo.com/a/2/abc.html |
| 231 * - www.foo.com/a/abc.html |
| 232 * - www.foo.com/abc.html. |
| 233 */ |
| 234 private boolean isPagingUrlForStartOfPathComponent(String url) { |
| 235 final int urlLen = url.length(); |
| 236 final int suffixLen = mSuffix.length(); |
| 237 final int suffixStart = url.length() - suffixLen; |
| 238 |
| 239 int prevComponentPos = mUrl.getPath().lastIndexOf('/', |
| 240 // We're only looking in the path, so the reverse search should
start at the index |
| 241 // excluding the url's origin. |
| 242 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length()); |
| 243 if (prevComponentPos != -1) { |
| 244 // Now, add back the url's origin to the index of previous path comp
onent. |
| 245 prevComponentPos += mUrl.getOrigin().length(); |
| 246 if (prevComponentPos + suffixLen == urlLen) { |
| 247 // The url doesn't have page number param and previous path comp
onent, like |
| 248 // www.foo.com/abc.html. |
| 249 return url.regionMatches(0, mUrlStr, 0, prevComponentPos); |
| 250 } |
| 251 } |
| 252 |
| 253 // If both url and pattern have the same prefix, url must have nothing e
lse. |
| 254 if (url.startsWith(mPrefix)) { |
| 255 int acceptLen = mPlaceholderSegmentStart + suffixLen; |
| 256 // The url doesn't have page number parameter, like www.foo.com/a/ab
c.html. |
| 257 if (acceptLen == urlLen) return true; |
| 258 if (acceptLen > urlLen) return false; |
| 259 |
| 260 // While we are here, the url must have page number param, so the ur
l must have a '/' |
| 261 // at the pattern's path component start position. |
| 262 if (url.charAt(mPlaceholderSegmentStart) != '/') return false; |
| 263 |
| 264 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde
rSegmentStart + 1, |
| 265 suffixStart)); |
| 266 } |
| 267 |
| 268 return false; |
| 269 } |
| 270 |
| 271 /** |
| 272 * Returns true if url is a paging URL based on the page pattern where the p
age param is not at |
| 273 * the start of a path component. |
| 274 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is: |
| 275 * - www.foo.com/a/abc-2.html |
| 276 * - www.foo.com/a/abc.html. |
| 277 */ |
| 278 private boolean isPagingUrlForNotStartOfPathComponent(String url) { |
| 279 final int urlLen = url.length(); |
| 280 final int suffixStart = urlLen - mSuffix.length(); |
| 281 |
| 282 // The page param path component of both url and pattern must have the s
ame prefix. |
| 283 if (!url.startsWith(mPrefix)) return false; |
| 284 |
| 285 // Find the first different character in page param path component just
before |
| 286 // placeholder or suffix, then check if it's acceptable. |
| 287 int firstDiffPos = mPlaceholderSegmentStart; |
| 288 int maxPos = Math.min(mPlaceholderStart, suffixStart); |
| 289 for (; firstDiffPos < maxPos; firstDiffPos++) { |
| 290 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break; |
| 291 } |
| 292 if (firstDiffPos == suffixStart) { // First different character is the
suffix. |
| 293 if (firstDiffPos + 1 == mPlaceholderStart && |
| 294 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) { |
| 295 return true; |
| 296 } |
| 297 } else if (firstDiffPos == mPlaceholderStart) { // First different char
acter is page param. |
| 298 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos,
suffixStart))) { |
| 299 return true; |
| 300 } |
| 301 } |
| 302 |
| 303 return false; |
| 304 } |
| 305 |
| 306 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l). |
| 307 private static RegExp sLastPathComponentRegExp = null; // Match last path c
omponent. |
| 308 |
| 309 /** |
| 310 * Returns true if: |
| 311 * - the digitStart to digitEnd of urlStr is the last path component, and |
| 312 * - the entire path component is numeric, and |
| 313 * - the previous path component is a bad page param name. |
| 314 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an
d "tag" is a bad |
| 315 * page param. |
| 316 */ |
| 317 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i
nt digitStart, |
| 318 int digitEnd) { |
| 319 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path
component. |
| 320 pathStart < digitStart - 1) { // Not the first path component. |
| 321 String postMatch = urlStr.substring(digitEnd).toLowerCase(); |
| 322 // Checks that this is the last path component, and trailing charact
ers, if available, |
| 323 // are (s)htm(l) extensions. |
| 324 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$",
"i"); |
| 325 if (sExtRegExp.test(postMatch)) { |
| 326 // Entire component is numeric, get previous path component. |
| 327 if (sLastPathComponentRegExp == null) { |
| 328 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i")
; |
| 329 } |
| 330 MatchResult prevPathComponent = sLastPathComponentRegExp.exec( |
| 331 urlStr.substring(pathStart + 1, digitStart)); |
| 332 if (prevPathComponent != null && prevPathComponent.getGroupCount
() > 1 && |
| 333 PageParameterDetector.isPageParamNameBad(prevPathCompone
nt.getGroup(1))) { |
| 334 return true; |
| 335 } |
| 336 } // last numeric path component |
| 337 } |
| 338 |
| 339 return false; |
| 340 } |
| 341 |
| 342 /** |
| 343 * Returns true if given character is one of '-', '_', ';', ','. |
| 344 */ |
| 345 private static native boolean isPageParamSeparator(Character c) /*-{ |
| 346 return /[-_;,]/.test(c); |
| 347 }-*/; |
| 348 |
| 349 } |
OLD | NEW |