Index: components/subresource_filter/core/common/url_pattern.cc |
diff --git a/components/subresource_filter/core/common/url_pattern.cc b/components/subresource_filter/core/common/url_pattern.cc |
index 1cfa663762abf9a0698c184c708005d6ee1f6c87..d601f41e6d55bfc2a1521cab49fdcf06c8f33688 100644 |
--- a/components/subresource_filter/core/common/url_pattern.cc |
+++ b/components/subresource_filter/core/common/url_pattern.cc |
@@ -2,14 +2,39 @@ |
// Use of this source code is governed by a BSD-style license that can be |
// found in the LICENSE file. |
+// The matching logic distinguishes between the terms URL pattern and |
+// subpattern. A URL pattern usually stands for the full thing, e.g. |
+// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring |
+// of a pattern not containing the wildcard '*' character. For the example above |
+// the subpatterns are: "example.com^", "path" and "par=val^". |
+// |
+// The separator placeholder '^' symbol is used in subpatterns to match any |
+// separator character, which is any ASCII symbol except letters, digits, and |
+// the following: '_', '-', '.', '%'. Note that the separator placeholder |
+// character '^' is itself a separator, as well as '\0'. |
+ |
#include "components/subresource_filter/core/common/url_pattern.h" |
+#include <stddef.h> |
+ |
+#include <ostream> |
+ |
+#include "base/logging.h" |
#include "components/subresource_filter/core/common/flat/rules_generated.h" |
+#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" |
+#include "components/subresource_filter/core/common/string_splitter.h" |
+#include "url/gurl.h" |
+#include "url/third_party/mozilla/url_parse.h" |
namespace subresource_filter { |
namespace { |
+class IsWildcard { |
+ public: |
+ bool operator()(char c) const { return c == '*'; } |
+}; |
+ |
proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { |
switch (type) { |
case flat::UrlPatternType_SUBSTRING: |
@@ -41,36 +66,191 @@ base::StringPiece ConvertString(const flatbuffers::String* string) { |
: base::StringPiece(); |
} |
+// Returns whether |position| within the |url| belongs to its |host| component |
+// and corresponds to the beginning of a (sub-)domain. |
+inline bool IsSubdomainAnchored(base::StringPiece url, |
+ url::Component host, |
+ size_t position) { |
+ DCHECK_LE(position, url.size()); |
+ const size_t host_begin = static_cast<size_t>(host.begin); |
+ const size_t host_end = static_cast<size_t>(host.end()); |
+ DCHECK_LE(host_end, url.size()); |
+ |
+ return position == host_begin || |
+ (position > host_begin && position <= host_end && |
+ url[position - 1] == '.'); |
+} |
+ |
+// Returns the position of the leftmost occurrence of a |subpattern| in the |
+// |text| starting no earlier than |from| the specified position. If the |
+// |subpattern| has separator placeholders, searches for a fuzzy occurrence. |
+size_t FindSubpattern(base::StringPiece text, |
+ base::StringPiece subpattern, |
+ size_t from = 0) { |
+ const bool is_fuzzy = |
+ (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
+ return is_fuzzy ? FindFuzzy(text, subpattern, from) |
+ : text.find(subpattern, from); |
+} |
+ |
+// Same as FindSubpattern(url, subpattern), but searches for an occurrence that |
+// starts at the beginning of a (sub-)domain within the url's |host| component. |
+size_t FindSubdomainAnchoredSubpattern(base::StringPiece url, |
+ url::Component host, |
+ base::StringPiece subpattern) { |
+ const bool is_fuzzy = |
+ (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
+ |
+ for (size_t position = 0; position <= url.size(); ++position) { |
+ position = is_fuzzy ? FindFuzzy(url, subpattern, position) |
+ : url.find(subpattern, position); |
+ if (position == base::StringPiece::npos || |
+ IsSubdomainAnchored(url, host, position)) { |
+ return position; |
+ } |
+ } |
+ return base::StringPiece::npos; |
+} |
+ |
} // namespace |
UrlPattern::UrlPattern() = default; |
UrlPattern::UrlPattern(base::StringPiece url_pattern, |
proto::UrlPatternType type) |
- : type(type), url_pattern(url_pattern) {} |
+ : type_(type), url_pattern_(url_pattern) {} |
UrlPattern::UrlPattern(base::StringPiece url_pattern, |
proto::AnchorType anchor_left, |
proto::AnchorType anchor_right) |
- : type(proto::URL_PATTERN_TYPE_WILDCARDED), |
- url_pattern(url_pattern), |
- anchor_left(anchor_left), |
- anchor_right(anchor_right) {} |
+ : type_(proto::URL_PATTERN_TYPE_WILDCARDED), |
+ url_pattern_(url_pattern), |
+ anchor_left_(anchor_left), |
+ anchor_right_(anchor_right) {} |
UrlPattern::UrlPattern(const flat::UrlRule& rule) |
- : type(ConvertUrlPatternType(rule.url_pattern_type())), |
- url_pattern(ConvertString(rule.url_pattern())), |
- anchor_left(ConvertAnchorType(rule.anchor_left())), |
- anchor_right(ConvertAnchorType(rule.anchor_right())), |
- match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
+ : type_(ConvertUrlPatternType(rule.url_pattern_type())), |
+ url_pattern_(ConvertString(rule.url_pattern())), |
+ anchor_left_(ConvertAnchorType(rule.anchor_left())), |
+ anchor_right_(ConvertAnchorType(rule.anchor_right())), |
+ match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
UrlPattern::UrlPattern(const proto::UrlRule& rule) |
- : type(rule.url_pattern_type()), |
- url_pattern(rule.url_pattern()), |
- anchor_left(rule.anchor_left()), |
- anchor_right(rule.anchor_right()), |
- match_case(rule.match_case()) {} |
+ : type_(rule.url_pattern_type()), |
+ url_pattern_(rule.url_pattern()), |
+ anchor_left_(rule.anchor_left()), |
+ anchor_right_(rule.anchor_right()), |
+ match_case_(rule.match_case()) {} |
UrlPattern::~UrlPattern() = default; |
+bool UrlPattern::MatchesUrl(const GURL& url) const { |
+ // Note: Empty domains are also invalid. |
+ DCHECK(url.is_valid()); |
+ DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING || |
+ type_ == proto::URL_PATTERN_TYPE_WILDCARDED); |
+ |
+ StringSplitter<IsWildcard> subpatterns(url_pattern_); |
+ auto subpattern_it = subpatterns.begin(); |
+ auto subpattern_end = subpatterns.end(); |
+ |
+ if (subpattern_it == subpattern_end) { |
+ return anchor_left_ == proto::ANCHOR_TYPE_NONE || |
+ anchor_right_ == proto::ANCHOR_TYPE_NONE; |
+ } |
+ |
+ const base::StringPiece spec = url.possibly_invalid_spec(); |
+ const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; |
+ DCHECK(!spec.empty()); |
+ |
+ base::StringPiece subpattern = *subpattern_it; |
+ ++subpattern_it; |
+ |
+ // If there is only one |subpattern|, and it has a right anchor, then simply |
+ // check that it is a suffix of the |spec|, and the left anchor is fulfilled. |
+ if (subpattern_it == subpattern_end && |
+ anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
+ if (!EndsWithFuzzy(spec, subpattern)) |
+ return false; |
+ if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) |
+ return spec.size() == subpattern.size(); |
+ if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
+ DCHECK_LE(subpattern.size(), spec.size()); |
+ return url.has_host() && |
+ IsSubdomainAnchored(spec, host_part, |
+ spec.size() - subpattern.size()); |
+ } |
+ return true; |
+ } |
+ |
+ // Otherwise, the first |subpattern| does not have to be a suffix. But it |
+ // still can have a left anchor. Check and handle that. |
+ base::StringPiece text = spec; |
+ if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) { |
+ if (!StartsWithFuzzy(spec, subpattern)) |
+ return false; |
+ if (subpattern_it == subpattern_end) { |
+ DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
+ return true; |
+ } |
+ text.remove_prefix(subpattern.size()); |
+ } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
+ if (!url.has_host()) |
+ return false; |
+ const size_t match_begin = |
+ FindSubdomainAnchoredSubpattern(spec, host_part, subpattern); |
+ if (match_begin == base::StringPiece::npos) |
+ return false; |
+ if (subpattern_it == subpattern_end) { |
+ DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
+ return true; |
+ } |
+ text.remove_prefix(match_begin + subpattern.size()); |
+ } else { |
+ DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE); |
+ // Get back to the initial |subpattern|, process it in the loop below. |
+ subpattern_it = subpatterns.begin(); |
+ } |
+ |
+ // Consecutively find all the remaining subpatterns in the |text|. If the |
+ // pattern has a right anchor, don't search for the last subpattern, but |
+ // instead check that it is a suffix of the |text|. |
+ while (subpattern_it != subpattern_end) { |
+ subpattern = *subpattern_it; |
+ DCHECK(!subpattern.empty()); |
+ |
+ if (++subpattern_it == subpattern_end && |
+ anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
+ break; |
+ } |
+ |
+ const size_t match_position = FindSubpattern(text, subpattern); |
+ if (match_position == base::StringPiece::npos) |
+ return false; |
+ text.remove_prefix(match_position + subpattern.size()); |
+ } |
+ |
+ return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY || |
+ EndsWithFuzzy(text, subpattern); |
+} |
+ |
+std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) { |
+ // Note: Each fall-through in this switch is intentional. |
+ switch (pattern.anchor_left()) { |
+ case proto::ANCHOR_TYPE_SUBDOMAIN: |
+ out << '|'; |
+ case proto::ANCHOR_TYPE_BOUNDARY: |
+ out << '|'; |
+ default: |
+ break; |
+ } |
+ out << pattern.url_pattern(); |
+ if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY) |
+ out << '|'; |
+ if (pattern.match_case()) |
+ out << "$match-case"; |
+ |
+ return out; |
+} |
+ |
} // namespace subresource_filter |