Index: components/subresource_filter/core/common/url_pattern_matching.h |
diff --git a/components/subresource_filter/core/common/url_pattern_matching.h b/components/subresource_filter/core/common/url_pattern_matching.h |
deleted file mode 100644 |
index 937b703edfaa9fd66883322d4f5ad0282c6cffc4..0000000000000000000000000000000000000000 |
--- a/components/subresource_filter/core/common/url_pattern_matching.h |
+++ /dev/null |
@@ -1,277 +0,0 @@ |
-// Copyright 2016 The Chromium Authors. All rights reserved. |
-// Use of this source code is governed by a BSD-style license that can be |
-// found in the LICENSE file. |
- |
-// The matching logic distinguishes between the terms URL pattern and |
-// subpattern. A URL pattern usually stands for the full thing, e.g. |
-// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring |
-// of a pattern not containing the wildcard '*' character. For the example above |
-// the subpatterns are: "example.com^", "path" and "par=val^". |
-// |
-// The separator placeholder '^' symbol is used in subpatterns to match any |
-// separator character, which is any ASCII symbol except letters, digits, and |
-// the following: '_', '-', '.', '%'. Note that the separator placeholder |
-// character '^' is itself a separator, as well as '\0'. |
- |
-#ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ |
-#define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ |
- |
-#include <stddef.h> |
- |
-#include <algorithm> |
-#include <iterator> |
-#include <vector> |
- |
-#include "base/logging.h" |
-#include "base/strings/string_piece.h" |
-#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" |
-#include "components/subresource_filter/core/common/knuth_morris_pratt.h" |
-#include "components/subresource_filter/core/common/string_splitter.h" |
-#include "components/subresource_filter/core/common/url_pattern.h" |
-#include "url/gurl.h" |
-#include "url/third_party/mozilla/url_parse.h" |
- |
-namespace subresource_filter { |
- |
-// Public interface declaration ------------------------------------------------ |
- |
-// Builds a compound Knuth-Morris-Pratt failure function used to match URLs |
-// against the |pattern|. |
-// |
-// The |pattern| is split on the '*' wildcard symbol and then a failure function |
-// is built for each subpattern by BuildFailureFunctionFuzzy or |
-// BuildFailureFunction (depending on whether the subpattern contains separator |
-// placeholders), and appended to the returned vector. Some of the subpatterns |
-// can be exempted from being indexed. E.g., if the |pattern| has a BOUNDARY |
-// left anchor, the first subpattern can be matched by checking if it's a prefix |
-// of a URL. |
-// |
-// Each subpattern indexed with BuildFailureFunctionFuzzy is prepended with a |
-// value 1 (to distinguish them from the subpatterns indexed with |
-// BuildFailureFunction, their failure functions always start with 0). |
-// |
-// The URL |pattern| must be normalized. Namely, it must not have the following |
-// substrings: **, *<END>, <BEGIN>*. If the the |pattern| has a BOUNDARY anchor, |
-// the corresponding side of its string must not end with a '*' wildcard. |
-template <typename IntType> |
-void BuildFailureFunction(const UrlPattern& pattern, |
- std::vector<IntType>* failure); |
- |
-// Returns whether the |url| matches the URL |pattern|. The |failure| function |
-// between |failure_begin| and |failure_end| must be the output of |
-// BuildFailureFunction() called with the same |pattern|. |
-// |
-// TODO(pkalinnikov): Outline algorithms implemented in this function. |
-template <typename FailureIter> |
-bool IsUrlPatternMatch(const GURL& url, |
- const UrlPattern& pattern, |
- FailureIter failure_begin, |
- FailureIter failure_end); |
- |
-// Implementation -------------------------------------------------------------- |
- |
-namespace impl { |
- |
-class IsWildcard { |
- public: |
- bool operator()(char c) const { return c == '*'; } |
-}; |
- |
-// Returns whether |position| within the |url| belongs to its |host| component |
-// and corresponds to the beginning of a (sub-)domain. |
-inline bool IsSubdomainAnchored(base::StringPiece url, |
- url::Component host, |
- size_t position) { |
- DCHECK_LE(position, url.size()); |
- const size_t host_begin = static_cast<size_t>(host.begin); |
- const size_t host_end = static_cast<size_t>(host.end()); |
- DCHECK_LE(host_end, url.size()); |
- |
- return position == host_begin || |
- (position > host_begin && position <= host_end && |
- url[position - 1] == '.'); |
-} |
- |
-// Returns the position just beyond the leftmost fuzzy occurrence of |
-// |subpattern| in the |text|. |
-template <typename IntType> |
-inline size_t FindFirstOccurrenceFuzzy(base::StringPiece text, |
- base::StringPiece subpattern, |
- const IntType* failure) { |
- return *AllOccurrencesFuzzy<IntType>(text, subpattern, failure).begin(); |
-} |
- |
-// Returns the position just beyond the leftmost occurrence of |subpattern| in |
-// the |url|, such that it satisfies a SUBDOMAIN anchor. |
-template <typename IntType> |
-inline size_t FindSubdomainAnchored(base::StringPiece url, |
- url::Component host, |
- base::StringPiece subpattern, |
- const IntType* failure) { |
- auto occurrences = AllOccurrences<IntType>(url, subpattern, failure); |
- return *std::find_if(occurrences.begin(), occurrences.end(), |
- [url, host, subpattern](size_t match_end_position) { |
- DCHECK_GE(match_end_position, subpattern.size()); |
- return IsSubdomainAnchored( |
- url, host, match_end_position - subpattern.size()); |
- }); |
-} |
- |
-// Returns the position just beyond the leftmost fuzzy occurrence of |
-// |subpattern| in the |url|, such that it satisfies a SUBDOMAIN anchor. |
-template <typename IntType> |
-inline size_t FindSubdomainAnchoredFuzzy(base::StringPiece url, |
- url::Component host, |
- base::StringPiece subpattern, |
- const IntType* failure) { |
- auto occurrences = AllOccurrencesFuzzy<IntType>(url, subpattern, failure); |
- return *std::find_if(occurrences.begin(), occurrences.end(), |
- [url, host, subpattern](size_t match_end_position) { |
- DCHECK_GE(match_end_position, subpattern.size()); |
- return IsSubdomainAnchored( |
- url, host, match_end_position - subpattern.size()); |
- }); |
-} |
- |
-} // namespace impl |
- |
-template <typename IntType> |
-void BuildFailureFunction(const UrlPattern& pattern, |
- std::vector<IntType>* failure) { |
- StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); |
- auto subpattern_it = subpatterns.begin(); |
- auto subpattern_end = subpatterns.end(); |
- |
- if (subpattern_it == subpattern_end) |
- return; |
- if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) |
- ++subpattern_it; |
- |
- while (subpattern_it != subpattern_end) { |
- const base::StringPiece part = *subpattern_it++; |
- DCHECK(!part.empty()); |
- if (pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY && |
- subpattern_it == subpattern_end) { |
- break; |
- } |
- |
- if (part.find(kSeparatorPlaceholder) == base::StringPiece::npos) { |
- BuildFailureFunction(part, failure); |
- } else { |
- // Prepend the value '1' to the failure function to let matchers |
- // distinguish between the subpatterns with separator placeholders and |
- // those without. |
- failure->push_back(1); |
- BuildFailureFunctionFuzzy(part, failure); |
- } |
- } |
-} |
- |
-template <typename FailureIter> |
-bool IsUrlPatternMatch(const GURL& url, |
- const UrlPattern& pattern, |
- FailureIter failure_begin, |
- FailureIter failure_end) { |
- DCHECK(url.is_valid()); |
- |
- StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); |
- auto subpattern_it = subpatterns.begin(); |
- auto subpattern_end = subpatterns.end(); |
- |
- if (subpattern_it == subpattern_end) { |
- return pattern.anchor_left != proto::ANCHOR_TYPE_BOUNDARY || |
- pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || |
- url.is_empty(); |
- } |
- |
- const base::StringPiece spec = url.possibly_invalid_spec(); |
- const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; |
- |
- base::StringPiece subpattern = *subpattern_it++; |
- if (subpattern_it == subpattern_end && |
- pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { |
- if (!EndsWithFuzzy(spec, subpattern)) |
- return false; |
- if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) |
- return spec.size() == subpattern.size(); |
- if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { |
- DCHECK_LE(subpattern.size(), spec.size()); |
- return url.has_host() && |
- impl::IsSubdomainAnchored(spec, host_part, |
- spec.size() - subpattern.size()); |
- } |
- return true; |
- } |
- |
- base::StringPiece text = spec; |
- if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) { |
- if (!StartsWithFuzzy(spec, subpattern)) |
- return false; |
- if (subpattern_it == subpattern_end) |
- return true; |
- text.remove_prefix(subpattern.size()); |
- } else if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { |
- if (!url.has_host()) |
- return false; |
- |
- const bool has_separator_placeholders = (*failure_begin != 0); |
- if (has_separator_placeholders) |
- ++failure_begin; |
- |
- const size_t position = |
- has_separator_placeholders |
- ? impl::FindSubdomainAnchoredFuzzy(spec, host_part, subpattern, |
- &*failure_begin) |
- : impl::FindSubdomainAnchored(spec, host_part, subpattern, |
- &*failure_begin); |
- if (position == base::StringPiece::npos) |
- return false; |
- if (subpattern_it == subpattern_end) |
- return true; |
- text.remove_prefix(position); |
- } else { |
- DCHECK_EQ(pattern.anchor_left, proto::ANCHOR_TYPE_NONE); |
- // Get back to the initial subpattern, process it in the loop below. |
- subpattern_it = subpatterns.begin(); |
- } |
- |
- while (subpattern_it != subpattern_end) { |
- subpattern = *subpattern_it++; |
- DCHECK(!subpattern.empty()); |
- |
- if (subpattern_it == subpattern_end && |
- pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { |
- break; |
- } |
- |
- // The subpatterns with separator placeholders were indexed differently and |
- // should be matched accordingly. Their failure functions were prepended by |
- // a non-zero value, and we need to skip it. If the value turned to be zero, |
- // it is the initial value of a failure function of a regular substring. |
- CHECK(failure_begin < failure_end); |
- const bool has_separator_placeholders = (*failure_begin != 0); |
- if (has_separator_placeholders) |
- ++failure_begin; |
- CHECK(static_cast<size_t>(std::distance(failure_begin, failure_end)) >= |
- subpattern.size()); |
- |
- // If the subpattern has separator placeholders, it should be matched with |
- // FindFirstOccurrenceOfSubpattern, otherwise it can be found as a regular |
- // substring. |
- const size_t match_end = |
- (has_separator_placeholders |
- ? impl::FindFirstOccurrenceFuzzy(text, subpattern, &*failure_begin) |
- : FindFirstOccurrence(text, subpattern, &*failure_begin)); |
- if (match_end == base::StringPiece::npos) |
- return false; |
- text.remove_prefix(match_end); |
- failure_begin += subpattern.size(); |
- } |
- |
- return pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || |
- EndsWithFuzzy(text, subpattern); |
-} |
- |
-} // namespace subresource_filter |
- |
-#endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ |