java/org/chromium/distiller/PathComponentPagePattern.java - Issue 1029593003: implement validations of pagination URLs

Side by Side Diff: java/org/chromium/distiller/PathComponentPagePattern.java

Issue 1029593003: implement validations of pagination URLs (Closed) Base URL: https://github.com/chromium/dom-distiller.git@master

Patch Set: addr chris's comments Created 5 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2015 The Chromium Authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 package org.chromium.distiller;

	6

	7 import com.google.gwt.regexp.shared.MatchResult;

	8 import com.google.gwt.regexp.shared.RegExp;

	9

	10 /**

	11 * This class detects the page parameter in the path of a potential pagination U RL. If detected,

	12 * it replaces the page param value with PageParameterDetector.PAGE_PARAM_PLACEH OLDER, then creates

	13 * and returns a new object. This object can then be accessed via PageParameter Detector.PagePattern

	14 * interface to:

	15 * - validate the generated URL page pattern against the document URL

	16 * - determine if a URL is a paging URL based on the page pattern.

	17 * Example: if the original url is "http://www.foo.com/a/b-3.html", the page pat tern is

	18 * "http://www.foo.com/a/b-[*!].html". (See comments at top of PageParameterDete ctor.java).

	19 */

	20 public class PathComponentPagePattern implements PageParameterDetector.PagePatte rn {

	21 private final ParsedUrl mUrl;

	22 private final int mPageNumber;

	23 private final int mPlaceholderStart;

	24 private final String mUrlStr;

	25 // Start position of path component containing placeholder.

	26 private int mPlaceholderSegmentStart;

	27 // Page param path component in list of path components.

	28 private int mParamIndex = -1;

	29 private final String mPrefix; // The part of the page pattern before the pla ceholder.

	30 private String mSuffix = ""; // The part of the page pattern after the plac eholder.

	31

	32 /**

	33 * Returns a new PagePattern if url is valid and contains PAGE_PARAM_PLACEHO LDER.

	34 */

	35 static PageParameterDetector.PagePattern create(ParsedUrl url, int pathStart , int digitStart,

	36 int digitEnd) {

	37 try {

	38 return new PathComponentPagePattern(url, pathStart, digitStart, digi tEnd);

	39 } catch (IllegalArgumentException e) {

	40 return null;

	41 }

	42 }

	43

	44 @Override

	45 public String toString() {

	46 return mUrlStr;

	47 }

	48

	49 @Override

	50 public int getPageNumber() {

	51 return mPageNumber;

	52 }

	53

	54 /**

	55 * Returns true if pattern and URL are sufficiently similar and the pattern' s components are not

	56 * calendar digits.

	57 *

	58 * @param docUrl the current document URL

	59 */

	60 @Override

	61 public boolean isValidFor(ParsedUrl docUrl) {

	62 final int urlPathComponentsLen = docUrl.getPathComponents().length;

	63 final int patternPathComponentsLen = mUrl.getPathComponents().length;

	64

	65 // Both the pattern and doc URL must have the similar path.

	66 if (urlPathComponentsLen > patternPathComponentsLen) return false;

	67

	68 // If both doc URL and page pattern have only 1 component, their common prefix+suffix must

	69 // be at least half of the entire component in doc URL, e.g doc URL is

	70 // "foo.com/foo-bar-threads-132" and pattern is "foo.com/foo-bar-threads -132-[*!]".

	71 if (urlPathComponentsLen == 1 && patternPathComponentsLen == 1) {

	72 final String urlComponent = docUrl.getPathComponents()[0];

	73 final String patternComponent = mUrl.getPathComponents()[0];

	74 int commonPrefixLen = getLongestCommonPrefixLength(urlComponent, pat ternComponent);

	75 int commonSuffixLen = getLongestCommonSuffixLength(urlComponent, pat ternComponent,

	76 commonPrefixLen);

	77 return (commonSuffixLen + commonPrefixLen) * 2 >= urlComponent.lengt h();

	78 }

	79

	80 if (!hasSamePathComponentsAs(docUrl)) return false;

	81

	82 if (isCalendarPage()) return false;

	83

	84 return true;

	85 }

	86

	87 /**

	88 * Returns true if a URL matches this page pattern based on a pipeline of ru les:

	89 * - suffix (part of pattern after page param placeholder) must be same, and

	90 * - different set of rules depending on if page param is at start of path c omponent or not.

	91 *

	92 * @param url the URL to evalutate

	93 */

	94 @Override

	95 public boolean isPagingUrl(String url) {

	96 // Both url and pattern must have the same suffix, if available.

	97 if (!mSuffix.isEmpty() && !url.endsWith(mSuffix)) return false;

	98

	99 return atStartOfPathComponent() ? isPagingUrlForStartOfPathComponent(url ) :

	100 isPagingUrlForNotStartOfPathComponent(url);

	101 }

	102

	103 private PathComponentPagePattern(ParsedUrl url, int pathStart, int digitStar t, int digitEnd)

	104 throws IllegalArgumentException {

	105 final String urlStr = url.toString();

	106 if (isLastNumericPathComponentBad(urlStr, pathStart, digitStart, digitEn d)) {

	107 throw new IllegalArgumentException("Bad last numeric path component" );

	108 }

	109

	110 String valueStr = urlStr.substring(digitStart, digitEnd);

	111 int value = StringUtil.toNumber(valueStr);

	112 if (value < 0) {

	113 throw new IllegalArgumentException("Value in path component is an in valid number: " +

	114 valueStr);

	115 }

	116

	117 String pattern = urlStr.substring(0, digitStart) +

	118 PageParameterDetector.PAGE_PARAM_PLACEHOLDER + urlStr.substring( digitEnd);

	119 mUrl = ParsedUrl.create(pattern);

	120 if (mUrl == null) throw new IllegalArgumentException("Invalid URL: " + p attern);

	121 mUrlStr = pattern;

	122 mPageNumber = value;

	123 mPlaceholderStart = digitStart;

	124 mPlaceholderSegmentStart = mUrlStr.lastIndexOf('/', mPlaceholderStart);

	125 determineParamIndex();

	126 mPrefix = mUrlStr.substring(0, mPlaceholderSegmentStart);

	127 // Determine suffix, if available.

	128 final int urlLen = mUrlStr.length();

	129 int suffixLen = urlLen - mPlaceholderStart -

	130 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN;

	131 if (suffixLen != 0) mSuffix = mUrlStr.substring(urlLen - suffixLen);

	132 }

	133

	134 private boolean atStartOfPathComponent() {

	135 return mUrlStr.charAt(mPlaceholderStart - 1) == '/';

	136 }

	137

	138 private void determineParamIndex() {

	139 final String[] pathComponents = mUrl.getPathComponents();

	140 for (mParamIndex = 0; mParamIndex < pathComponents.length; mParamIndex++ ) {

	141 if (pathComponents[mParamIndex].contains(

	142 PageParameterDetector.PAGE_PARAM_PLACEHOLDER)) {

	143 break;

	144 }

	145 }

	146 }

	147

	148 /**

	149 * Returns true if, except for the path component containing the page param, the other path

	150 * components of doc URL are the same as pattern's. But pattern may have mo re components, e.g.:

	151 * - doc URL is /thread/12, pattern is /thread/12/page/[*!]

	152 * returns true because "thread" and "12" in doc URL match those in patter n

	153 * - doc URL is /thread/12/foo, pattern is /thread/12/page/[*!]/foo

	154 * returns false because "foo" in doc URL doesn't match "page" in pattern whose page param

	155 path component comes after.

	156 * - doc URL is /thread/12/foo, pattern is /thread/12/[*!]/foo

	157 * returns true because "foo" in doc URL would match "foo" in pattern whos e page param path

	158 * component is skipped when matching.

	159 */

	160 private boolean hasSamePathComponentsAs(ParsedUrl docUrl) {

	161 final String[] urlComponents = docUrl.getPathComponents();

	162 final String[] patternComponents = mUrl.getPathComponents();

	163 boolean passedParamComponent = false;

	164 for (int i = 0, j = 0; i < urlComponents.length && j < patternComponents .length; i++, j++) {

	165 if (i == mParamIndex && !passedParamComponent) {

	166 passedParamComponent = true;

	167 // Repeat current path component if doc URL has less components (as per comments

	168 // just above, doc URL may have less components).

	169 if (urlComponents.length < patternComponents.length) i--;

	170 continue;

	171 }

	172 if (!urlComponents[i].equalsIgnoreCase(patternComponents[j])) return false;

	173 }

	174

	175 return true;

	176 }

	177

	178 /**

	179 * Returns true if pattern is for a calendar page, e.g. 2012/01/[*!], which would be a

	180 * false-positive.

	181 */

	182 private boolean isCalendarPage() {

	183 if (mParamIndex < 2) return false;

	184

	185 // Only if param is the entire path component. This handles some cases erroneously

	186 // considered false-positives e.g. first page is

	187 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467.html,

	188 // and second page is

	189 // http://www.politico.com/story/2014/07/barack-obama-immigration-legal- questions-109467_Page2.html,

	190 // would be considered false-positives otherwise because of "2014" and " 07".

	191 final String[] patternComponents = mUrl.getPathComponents();

	192 if (patternComponents[mParamIndex].length() !=

	193 PageParameterDetector.PAGE_PARAM_PLACEHOLDER_LEN) {

	194 return false;

	195 }

	196

	197 int month = StringUtil.toNumber(patternComponents[mParamIndex - 1]);

	198 if (month > 0 && month <= 12) {

	199 int year = StringUtil.toNumber(patternComponents[mParamIndex - 2]);

	200 if (year > 1970 && year < 3000) return true;

	201 }

	202

	203 return false;

	204 }

	205

	206 private static int getLongestCommonPrefixLength(String str1, String str2) {

	207 if (str1.isEmpty() \|\| str2.isEmpty()) return 0;

	208

	209 int limit = Math.min(str1.length(), str2.length());

	210 int i = 0;

	211 for (; i < limit; i++) {

	212 if (str1.charAt(i) != str2.charAt(i)) break;

	213 }

	214 return i;

	215 }

	216

	217 private static int getLongestCommonSuffixLength(String str1, String str2, in t startIndex) {

	218 int commonSuffixLen = 0;

	219 for (int i = str1.length() - 1, j = str2.length() - 1;

	220 i > startIndex && j > startIndex; i--, j--, commonSuffixLen++) {

	221 if (str1.charAt(i) != str2.charAt(i)) break;

	222 }

	223 return commonSuffixLen;

	224 }

	225

	226 /**

	227 * Returns true if url is a paging URL based on the page pattern where the p age param is at the

	228 * start of a path component.

	229 * If the page pattern is www.foo.com/a/[*!]/abc.html, expected doc URL is:

	230 * - www.foo.com/a/2/abc.html

	231 * - www.foo.com/a/abc.html

	232 * - www.foo.com/abc.html.

	233 */

	234 private boolean isPagingUrlForStartOfPathComponent(String url) {

	235 final int urlLen = url.length();

	236 final int suffixLen = mSuffix.length();

	237 final int suffixStart = url.length() - suffixLen;

	238

	239 int prevComponentPos = mUrl.getPath().lastIndexOf('/',

	240 // We're only looking in the path, so the reverse search should start at the index

	241 // excluding the url's origin.

	242 mPlaceholderSegmentStart - 1 - mUrl.getOrigin().length());

	243 if (prevComponentPos != -1) {

	244 // Now, add back the url's origin to the index of previous path comp onent.

	245 prevComponentPos += mUrl.getOrigin().length();

	246 if (prevComponentPos + suffixLen == urlLen) {

	247 // The url doesn't have page number param and previous path comp onent, like

	248 // www.foo.com/abc.html.

	249 return url.regionMatches(0, mUrlStr, 0, prevComponentPos);

	250 }

	251 }

	252

	253 // If both url and pattern have the same prefix, url must have nothing e lse.

	254 if (url.startsWith(mPrefix)) {

	255 int acceptLen = mPlaceholderSegmentStart + suffixLen;

	256 // The url doesn't have page number parameter, like www.foo.com/a/ab c.html.

	257 if (acceptLen == urlLen) return true;

	258 if (acceptLen > urlLen) return false;

	259

	260 // While we are here, the url must have page number param, so the ur l must have a '/'

	261 // at the pattern's path component start position.

	262 if (url.charAt(mPlaceholderSegmentStart) != '/') return false;

	263

	264 return PageParameterDetector.isPlainNumber(url.substring(mPlaceholde rSegmentStart + 1,

	265 suffixStart));

	266 }

	267

	268 return false;

	269 }

	270

	271 /**

	272 * Returns true if url is a paging URL based on the page pattern where the p age param is not at

	273 * the start of a path component.

	274 * If the page pattern is www.foo.com/a/abc-[*!].html, expected doc URL is:

	275 * - www.foo.com/a/abc-2.html

	276 * - www.foo.com/a/abc.html.

	277 */

	278 private boolean isPagingUrlForNotStartOfPathComponent(String url) {

	279 final int urlLen = url.length();

	280 final int suffixStart = urlLen - mSuffix.length();

	281

	282 // The page param path component of both url and pattern must have the s ame prefix.

	283 if (!url.startsWith(mPrefix)) return false;

	284

	285 // Find the first different character in page param path component just before

	286 // placeholder or suffix, then check if it's acceptable.

	287 int firstDiffPos = mPlaceholderSegmentStart;

	288 int maxPos = Math.min(mPlaceholderStart, suffixStart);

	289 for (; firstDiffPos < maxPos; firstDiffPos++) {

	290 if (url.charAt(firstDiffPos) != mUrlStr.charAt(firstDiffPos)) break;

	291 }

	292 if (firstDiffPos == suffixStart) { // First different character is the suffix.

	293 if (firstDiffPos + 1 == mPlaceholderStart &&

	294 isPageParamSeparator(mUrlStr.charAt(firstDiffPos))) {

	295 return true;

	296 }

	297 } else if (firstDiffPos == mPlaceholderStart) { // First different char acter is page param.

	298 if (PageParameterDetector.isPlainNumber(url.substring(firstDiffPos, suffixStart))) {

	299 return true;

	300 }

	301 }

	302

	303 return false;

	304 }

	305

	306 private static RegExp sExtRegExp = null; // Match trailing .(s)htm(l).

	307 private static RegExp sLastPathComponentRegExp = null; // Match last path c omponent.

	308

	309 /**

	310 * Returns true if:

	311 * - the digitStart to digitEnd of urlStr is the last path component, and

	312 * - the entire path component is numeric, and

	313 * - the previous path component is a bad page param name.

	314 * E.g. "www.foo.com/tag/2" will return true because of the above reasons an d "tag" is a bad

	315 * page param.

	316 */

	317 static boolean isLastNumericPathComponentBad(String urlStr, int pathStart, i nt digitStart,

	318 int digitEnd) {

	319 if (urlStr.charAt(digitStart - 1) == '/' && // Digit is at start of path component.

	320 pathStart < digitStart - 1) { // Not the first path component.

	321 String postMatch = urlStr.substring(digitEnd).toLowerCase();

	322 // Checks that this is the last path component, and trailing charact ers, if available,

	323 // are (s)htm(l) extensions.

	324 if (sExtRegExp == null) sExtRegExp = RegExp.compile("(.s?html?)?$", "i");

	325 if (sExtRegExp.test(postMatch)) {

	326 // Entire component is numeric, get previous path component.

	327 if (sLastPathComponentRegExp == null) {

	328 sLastPathComponentRegExp = RegExp.compile("([^/]*)\\/$", "i") ;

	329 }

	330 MatchResult prevPathComponent = sLastPathComponentRegExp.exec(

	331 urlStr.substring(pathStart + 1, digitStart));

	332 if (prevPathComponent != null && prevPathComponent.getGroupCount () > 1 &&

	333 PageParameterDetector.isPageParamNameBad(prevPathCompone nt.getGroup(1))) {

	334 return true;

	335 }

	336 } // last numeric path component

	337 }

	338

	339 return false;

	340 }

	341

	342 /**

	343 * Returns true if given character is one of '-', '_', ';', ','.

	344 */

	345 private static native boolean isPageParamSeparator(Character c) /*-{

	346 return /[-_;,]/.test(c);

	347 }-*/;

	348

	349 }

OLD	NEW

« no previous file with comments | « java/org/chromium/distiller/ParsedUrl.java ('k') | java/org/chromium/distiller/QueryParamPagePattern.java » ('j') | no next file with comments »