java/org/chromium/distiller/PathComponentPagePattern.java - Issue 1029593003: implement validations of pagination URLs

Side by Side Diff: java/org/chromium/distiller/PathComponentPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

« java/org/chromium/distiller/PageParameterDetector.java ('K') | « java/org/chromium/distiller/ParsedUrl.java ('k') | java/org/chromium/distiller/QueryParamPagePattern.java » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package org.chromium.distiller;

	6

	7 import com.google.gwt.regexp.shared.MatchResult;

	8 import com.google.gwt.regexp.shared.RegExp;

	9

	10 /**

	11 * This class detects the page parameter in the path of a potential pagination URL. If detected,

	12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEHOLDER, then creates

	13 * and returns a new object. This object can then be accessed via PageParameterDetector.PagePattern

	14 * interface to:

	15 * - validate the generated URL page pattern against the document URL

	16 * - determine if a URL is a paging URL based on the page pattern.

	17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pattern is

	18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDetector.java).

	19 */

	20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn {

	21 private final ParsedUrl mUrl;

	22 private final int mPageNumber;

	23 private final int mPlaceholderStart;

	24 private final String mUrlStr;

	25 // Start position of path component containing placeholder.

	26 private int mPlaceholderSegmentStart;

	27 // Page param path component in list of path components.

	28 private int mParamIndex = -1;

	29 private final String mPrefix; // The part of the page pattern before the pla ceholder.

	30 private String mSuffix = ""; // The part of the page pattern after the plac eholder.

	31

	32 /**

	33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER.

	34 */

	35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart,

	36 int digitEnd) {

	37 try {

	38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd);

	39 } catch (Exception e) {
	cjhopman 2015/04/16 21:58:32 We shouldn't be throwing/catching the base Excepti We shouldn't be throwing/catching the base Exception class. Instead, either find an appropriate subclass of it or define our own. kuan 2015/04/20 23:11:13 Done. ditto for QueryParamPagePattern. Show quoted text On 2015/04/16 21:58:32, cjhopman wrote: > We shouldn't be throwing/catching the base Exception class. Instead, either find > an appropriate subclass of it or define our own. Done. ditto for QueryParamPagePattern.
	40 return null;

	41 }

	42 }

	43

	44 @Override

	45 public String toString() {

	46 return mUrlStr;

	47 }

	48

	49 @Override

	50 public int getPageNumber() {

	51 return mPageNumber;

	52 }

	53

	54 /**

	55 * Returns true if pattern and URL are sufficiently similar and the pattern' s components are not

	56 * calendar digits.

	57 *

	58 * @param docUrl the current document URL

	59 */

	60 @Override

	61 public boolean isValidFor(ParsedUrl docUrl) {

	62 final int urlPathComponentsLen = docUrl.getPathComponents().length;

	63 final int patternPathComponentsLen = mUrl.getPathComponents().length;

	64

	65 // If the page param is inside of path components, both the pattern and doc URL must have
	cjhopman 2015/04/16 21:58:32 We know that the page param is inside of path comp We know that the page param is inside of path components so this comment seems a bit awkward: How about just drop the part before the comma? kuan 2015/04/20 23:11:13 Done. Show quoted text On 2015/04/16 21:58:32, cjhopman wrote: > We know that the page param is inside of path components so this comment seems a > bit awkward: > > How about just drop the part before the comma? Done.
	66 // the similar path.

	67 if (urlPathComponentsLen > patternPathComponentsLen) return false;

	68

	69 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must

	70 // be at least half of the entire component in doc URL, e.g doc URL is

	71 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".

	72 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {

	73 final String urlComponent = docUrl.getPathComponents()[0];

	74 final String patternComponent = mUrl.getPathComponents()[0];

	75 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);

	76 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent,

	77 commonPrefixLen);

	78 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h();

	79 }

	80

	81 if (!hasSamePathComponentsAs(docUrl)) return false;

	82

	83 if (isCalendarPage()) return false;

	84

	85 return true;

	86 }

	87

	88 /**

	89 * Returns true if a URL matches this page pattern based on a pipeline of ru les:

	90 * - suffix (part of pattern after page param placeholder) must be same, and

	91 * - for path page parameter that is part of a path component,

	92 * - if the first different character in path component is suffix, it must be a page parameter

	93 * separator, followed by the page parameter in the pattern

	94 * - else if it's page parameter, it and possible following digits must be a plain number.

	95 * - for path page parameter that is the entire path component,

	96 * - if URL has no page number param and previous path component, everythi ng else matches, or

	97 * - if prefix is the same, URL doesn't have anyhing else

	98 * - else url must have '/' at the same position as pattern's page paramet er path component,

	99 * followed by a plain number.

	100 *

	101 * @param url the URL to evalutate

	102 */

	103 @Override

	104 public boolean isPagingUrl(String url) {

	105 // Both url and pattern must have the same suffix, if available.

	106 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false;

	107

	108 return isPartialPathComponent() ? isPartialPathComponentPagingUrl(url) :

	109 isEntirePathComponentPagingUrl(url);

	110 }

	111

	/**
	 * Builds the page pattern for url by replacing the characters in [digitStart, digitEnd) of
	 * url's string form with PageParameterDetector.PAGE_PARAM_PLACEHOLDER.
	 *
	 * Failures are reported with IllegalArgumentException rather than the base Exception class;
	 * create() converts any failure into a null return.
	 *
	 * @param url the candidate pagination URL
	 * @param pathStart passed to isLastNumericPathComponentBad together with the digit range
	 * @param digitStart index of the first digit of the page number in url's string form
	 * @param digitEnd index just past the last digit of the page number
	 * @throws IllegalArgumentException if the numeric path component is rejected by
	 *         isLastNumericPathComponentBad, if the digits don't convert to a non-negative
	 *         number, or if the resulting pattern can't be parsed as a URL
	 */
	private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStart, int digitEnd) {
	    final String urlStr = url.toString();
	    if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEnd)) {
	        throw new IllegalArgumentException("Bad last numeric path component");
	    }

	    String valueStr = urlStr.substring(digitStart, digitEnd);
	    int value = StringUtil.toNumber(valueStr);
	    if (value < 0) {
	        throw new IllegalArgumentException(
	                "Value in path component is an invalid number: " + valueStr);
	    }

	    String pattern = urlStr.substring(0, digitStart) +
	            PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring(digitEnd);
	    mUrl = ParsedUrl.create(pattern);
	    if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + pattern);
	    mUrlStr = pattern;
	    mPageNumber = value;
	    // The placeholder starts exactly where the digits used to start.
	    mPlaceholderStart = digitStart;
	    // Position of the last '/' at or before the placeholder.
	    mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart);
	    determineParamIndex();
	    // Everything before that '/'; the '/' itself is not part of the prefix.
	    mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);
	    // Determine suffix, if available.
	    final int urlLen = mUrlStr.length();
	    int suffixLen = urlLen - mPlaceholderStart -
	            PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;
	    if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen);
	}

	141

	142 private boolean isPartialPathComponent() {

	143 return mUrlStr.charAt(mPlaceholderStart - 1) != '/';
	cjhopman 2015/04/16 21:58:32 this is strange, shouldn't foo.com/article/2page/ this is strange, shouldn't foo.com/article/2page/ be a partial path component? kuan 2015/04/20 23:11:13 good point, so i did more testing w/ the original Show quoted text On 2015/04/16 21:58:32, cjhopman wrote: > this is strange, shouldn't > foo.com/article/2page/ be a partial path component? good point, so i did more testing w/ the original code. they actually meant at start of path component. they called it "partial" and didn't care about the suffix (i.e. whatever's after placeholder), because the paging url must already have the same suffix by the time the code path executes to this point (see beginning of isPaingUrl()). now, the functionalities are renamed to reflect start of path component.
	144 }

	145

	146 private void determineParamIndex() {

	147 final String[] pathComponents = mUrl.getPathComponents();

	148 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) {

	149 if (pathComponents[mParamIndex].contains(

	150 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {

	151 break;

	152 }

	153 }

	154 }

	155

	156 /**

	157 * Returns true if, except for the path component containing the page param, the other path

	158 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.:

	159 * - doc URL is /thread/12, pattern is /thread/12/page/[*!]

	160 * returns true because "thread" and "12" in doc URL match those in patter n

	161 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo

	162 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param

	163 path component comes after.

	164 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo

	165 * returns true because "foo" in doc URL would match "foo" in pattern whos e page param path

	166 * component is skipped when matching.

	167 */

	168 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {

	169 determineParamIndex();
	cjhopman 2015/04/16 21:58:32 don't need to call this, it's done in the construc don't need to call this, it's done in the constructor. same below. kuan 2015/04/20 23:11:13 Done. i forgot :( Show quoted text On 2015/04/16 21:58:32, cjhopman wrote: > don't need to call this, it's done in the constructor. same below. Done. i forgot :(
	170 final String[] urlComponents = docUrl.getPathComponents();

	171 final String[] patternComponents = mUrl.getPathComponents();

	172 boolean passedParamComponent = false;

	173 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) {

	174 if (i == mParamIndex && !passedParamComponent) {

	175 passedParamComponent = true;

	176 // Repeat current path component if doc URL has less components (as per comments

	177 // just above, doc URL may have less components).

	178 if (urlComponents.length < patternComponents.length) i--;

	179 continue;

	180 }

	181 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;

	182 }

	183

	184 return true;

	185 }

	186

	187 /**

	188 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a

	189 * false-positive.

	190 */

	191 private boolean isCalendarPage() {

	192 determineParamIndex();

	193 if (mParamIndex < 2) return false;

	194

	195 // Only if param is the entire path component. This handles some cases erroneously

	196 // considered false-positives e.g. first page is

	197 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html,

	198 // and second page is

	199 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html,

	200 // would be considered false-positives otherwise because of "2014" and " 07".

	201 final String[] patternComponents = mUrl.getPathComponents();

	202 if (patternComponents[mParamIndex].length() !=

	203 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {

	204 return false;

	205 }

	206

	207 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);

	208 if (month > 0 && month <= 12) {

	209 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);

	210 if (year > 1970 && year < 3000) return true;

	211 }

	212

	213 return false;

	214 }

	215

	216 private static int getLongestCommonPrefixLength(String str1, String str2) {

	217 if (str1.isEmpty() \|\| str2.isEmpty()) return 0;

	218

	219 int limit = Math.min(str1.length(), str2.length());

	220 int i = 0;

	221 for (; i < limit; i++) {

	222 if (str1.charAt(i) != str2.charAt(i)) break;

	223 }

	224 return i;

	225 }

	226

	227 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {

	228 int commonSuffixLen = 0;

	229 for (int i = str1.length() - 1, j = str2.length() - 1;

	230 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {

	231 if (str1.charAt(i) != str2.charAt(i)) break;

	232 }

	233 return commonSuffixLen;

	234 }

	235

	236 /**

	237 * Returns true if url is a paging URL based on the page pattern where the p age param is part

	238 * of a path component.

	239 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:

	240 * - www.foo.com/a/abc-2.html

	241 * - www.foo.com/a/abc.html.

	242 */

	243 private boolean isPartialPathComponentPagingUrl(String url) {

	244 final int urlLen = url.length();

	245 final int suffixStart = urlLen - mSuffix.length();

	246

	247 // The page param path component of both url and pattern must have the s ame prefix.

	248 if (!url.startsWith(mPrefix)) return false;

	249

	250 // Find the first different character in page param path component just before

	251 // placeholder or suffix, then check if it's acceptable.

	252 int firstDiffPos = mPlaceholderSegmentStart;

	253 int maxPos = Math.min(mPlaceholderStart, suffixStart);

	254 for (; firstDiffPos < maxPos; firstDiffPos++) {

	255 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break;

	256 }

	257 if (firstDiffPos == suffixStart) { // First different character is the suffix.

	258 if (firstDiffPos + 1 == mPlaceholderStart &&

	259 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {

	260 return true;

	261 }

	262 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param.

	263 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) {

	264 return true;

	265 }

	266 }

	267

	268 return false;

	269 }

	270

	271 /**

	272 * Returns true if url is a paging URL based on the page pattern where the p age param is the

	273 * entire path component.

	274 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:

	275 * - www.foo.com/a/2/abc.html

	276 * - www.foo.com/a/abc.html

	277 * - www.foo.com/abc.html.

	278 */

	279 private boolean isEntirePathComponentPagingUrl(String url) {

	280 final int urlLen = url.length();

	281 final int suffixLen = mSuffix.length();

	282 final int suffixStart = url.length() - suffixLen;

	283

	284 int prevComponentPos = mUrl.getPath().lastIndexOf('/',

	285 // We're only looking in the path, so the reverse search should start at the index

	286 // excluding the url's origin.

	287 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length());

	288 if (prevComponentPos != -1) {

	289 // Now, add back the url's origin to the index of previous path comp onent.

	290 prevComponentPos += mUrl.getOrigin().length();

	291 if (prevComponentPos + suffixLen == urlLen) {

	292 // The url doesn't have page number param and previous path comp onent, like

	293 // www.foo.com/abc.html.

	294 return url.regionMatches(0, mUrlStr, 0, prevComponentPos);

	295 }

	296 }

	297

	298 // If both url and pattern have the same prefix, url must have nothing e lse.

	299 if (url.startsWith(mPrefix)) {

	300 int acceptLen = mPlaceholderSegmentStart + suffixLen;

	301 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.

	302 if (acceptLen == urlLen) return true;

	303 if (acceptLen > urlLen) return false;

	304

	305 // While we are here, the url must have page number param, so the ur l must have a '/'

	306 // at the pattern's path component start position.

	307 if (url.charAt(mPlaceholderSegmentStart) != '/') return false;

	308

	309 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1,

	310 suffixStart));

	311 }

	312

	313 return false;

	314 }

	315

	316 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).

	317 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.

	318

	319 /**

	320 * Returns true if:

	321 * - the digitStart to digitEnd of urlStr is the last path component, and

	322 * - the entire path component is numeric, and

	323 * - the previous path component is a bad page param name.

	324 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad

	325 * page param.

	326 */

	327 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,

	328 int digitEnd) {

	329 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.

	330 pathStart < digitStart - 1) { // Not the first path component.

	331 String postMatch = urlStr.substring(digitEnd).toLowerCase();

	332 // Checks that this is the last path component, and trailing charact ers, if available,

	333 // are (s)htm(l) extensions.

	334 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");

	335 if (sExtRegExp.test(postMatch)) {

	336 // Entire component is numeric, get previous path component.

	337 if (sLastPathComponentRegExp == null) {

	338 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;

	339 }

	340 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(

	341 urlStr.substring(pathStart + 1, digitStart));

	342 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&

	343 PageParameterDetector.isPageParamNameBad(prevPathCompone nt.getGroup(1))) {

	344 return true;

	345 }

	346 } // last numeric path component

	347 }

	348

	349 return false;

	350 }

	351

	352 /**

	353 * Returns true if given character is one of '-', '_', ';', ','.

	* Declared as a GWT JSNI native method: the body between the JSNI comment markers is
	* JavaScript that tests the argument against the character class [-_;,].
	* NOTE(review): c is a boxed Character handed to JavaScript; confirm that RegExp.test sees
	* it as its one-character string form.

	354 */

	355 private static native boolean isPageParamSeparator(Character c) /*-{

	356 return /[-_;,]/.test(c);

	357 }-*/;

	358

	359 }

OLD	NEW