Chromium Code Reviews| Index: components/subresource_filter/core/common/url_pattern.cc |
| diff --git a/components/subresource_filter/core/common/url_pattern.cc b/components/subresource_filter/core/common/url_pattern.cc |
| index 1cfa663762abf9a0698c184c708005d6ee1f6c87..a8b018a106bfaa710b5564392ff3cc4405da1a52 100644 |
| --- a/components/subresource_filter/core/common/url_pattern.cc |
| +++ b/components/subresource_filter/core/common/url_pattern.cc |
| @@ -2,14 +2,40 @@ |
| // Use of this source code is governed by a BSD-style license that can be |
| // found in the LICENSE file. |
| +// The matching logic distinguishes between the terms URL pattern and |
| +// subpattern. A URL pattern usually stands for the full thing, e.g. |
| +// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring |
| +// of a pattern not containing the wildcard '*' character. For the example above |
| +// the subpatterns are: "example.com^", "path" and "par=val^". |
| +// |
| +// The separator placeholder '^' symbol is used in subpatterns to match any |
| +// separator character, which is any ASCII symbol except letters, digits, and |
| +// the following: '_', '-', '.', '%'. Note that the separator placeholder |
| +// character '^' is itself a separator, as well as '\0'. |
| + |
| #include "components/subresource_filter/core/common/url_pattern.h" |
| +#include <stddef.h> |
| + |
| +#include <algorithm> |
| +#include <ostream> |
| + |
| +#include "base/logging.h" |
| #include "components/subresource_filter/core/common/flat/rules_generated.h" |
| +#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" |
| +#include "components/subresource_filter/core/common/string_splitter.h" |
| +#include "url/gurl.h" |
| +#include "url/third_party/mozilla/url_parse.h" |
| namespace subresource_filter { |
| namespace { |
| +class IsWildcard { |
| + public: |
| + bool operator()(char c) const { return c == '*'; } |
| +}; |
| + |
| proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { |
| switch (type) { |
| case flat::UrlPatternType_SUBSTRING: |
| @@ -41,36 +67,206 @@ base::StringPiece ConvertString(const flatbuffers::String* string) { |
| : base::StringPiece(); |
| } |
| +// Returns whether |position| within the |url| belongs to its |host| component |
| +// and corresponds to the beginning of a (sub-)domain. |
| +inline bool IsSubdomainAnchored(base::StringPiece url, |
| + url::Component host, |
| + size_t position) { |
| + DCHECK_LE(position, url.size()); |
| + const size_t host_begin = static_cast<size_t>(host.begin); |
| + const size_t host_end = static_cast<size_t>(host.end()); |
| + DCHECK_LE(host_end, url.size()); |
| + |
| + return position == host_begin || |
| + (position > host_begin && position <= host_end && |
| + url[position - 1] == '.'); |
| +} |
| + |
| +// Returns the position of the leftmost occurrence of |subpattern| in the |
| +// |text|, starting |from| a certain position. If |is_fuzzy| then |
|
engedy
2017/04/05 10:55:11
nit: s/a certain/the specified/g
pkalinnikov
2017/04/05 13:29:21
Done.
|
| +// searches for a fuzzy occurrence. |
| +size_t FindSubpatternImpl(base::StringPiece text, |
| + base::StringPiece subpattern, |
| + size_t from, |
| + bool is_fuzzy) { |
| + if (!is_fuzzy) |
| + return text.find(subpattern, from); |
| + |
| + auto* found = |
| + std::search(text.begin() + from, text.end(), subpattern.begin(), |
|
engedy
2017/04/05 10:55:11
nit: Have you considered moving this to fuzzy_patt
pkalinnikov
2017/04/05 13:29:21
Done. Also, reused the old AllOccurrences test.
|
| + subpattern.end(), [](char text_char, char subpattern_char) { |
|
Charlie Harrison
2017/04/04 16:49:39
nit: Could you hoist the lambda to a member var? I
pkalinnikov
2017/04/05 13:29:21
Done.
|
| + return text_char == subpattern_char || |
| + (subpattern_char == kSeparatorPlaceholder && |
| + IsSeparator(text_char)); |
| + }); |
| + return found == text.end() ? base::StringPiece::npos : found - text.begin(); |
| +} |
| + |
| +// Returns the position of the leftmost occurrence of |subpattern| in the |
| +// |text|, starting |from| a certain position. If the |subpattern| has separator |
| +// placeholders, searches for a fuzzy occurrence. |
| +size_t FindSubpattern(base::StringPiece text, |
| + base::StringPiece subpattern, |
| + size_t from = 0) { |
| + const bool is_fuzzy = |
| + (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
| + return FindSubpatternImpl(text, subpattern, from, is_fuzzy); |
| +} |
| + |
| +// Same as FindSubpattern(url, subpattern), but searches for an occurrence that |
| +// starts in the beginning of a (sub-)domain within the url's |host| component. |
|
engedy
2017/04/05 10:55:11
nit: ... at the beginning ...
pkalinnikov
2017/04/05 13:29:21
Done.
|
| +size_t FindSubdomainAnchoredSubpattern(base::StringPiece url, |
| + url::Component host, |
| + base::StringPiece subpattern) { |
| + const bool is_fuzzy = |
|
engedy
2017/04/05 10:55:11
Do we need to check subpattern for being empty her
pkalinnikov
2017/04/05 13:29:21
Done.
|
| + (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
| + for (size_t position = FindSubpatternImpl(url, subpattern, 0, is_fuzzy); |
| + position != base::StringPiece::npos; |
| + position = FindSubpatternImpl(url, subpattern, ++position, is_fuzzy)) { |
| + if (IsSubdomainAnchored(url, host, position)) |
| + return position; |
| + } |
| + return base::StringPiece::npos; |
| +} |
| + |
| } // namespace |
| UrlPattern::UrlPattern() = default; |
| UrlPattern::UrlPattern(base::StringPiece url_pattern, |
| proto::UrlPatternType type) |
| - : type(type), url_pattern(url_pattern) {} |
| + : type_(type), url_pattern_(url_pattern) {} |
| UrlPattern::UrlPattern(base::StringPiece url_pattern, |
| proto::AnchorType anchor_left, |
| proto::AnchorType anchor_right) |
| - : type(proto::URL_PATTERN_TYPE_WILDCARDED), |
| - url_pattern(url_pattern), |
| - anchor_left(anchor_left), |
| - anchor_right(anchor_right) {} |
| + : type_(proto::URL_PATTERN_TYPE_WILDCARDED), |
| + url_pattern_(url_pattern), |
| + anchor_left_(anchor_left), |
| + anchor_right_(anchor_right) {} |
| UrlPattern::UrlPattern(const flat::UrlRule& rule) |
| - : type(ConvertUrlPatternType(rule.url_pattern_type())), |
| - url_pattern(ConvertString(rule.url_pattern())), |
| - anchor_left(ConvertAnchorType(rule.anchor_left())), |
| - anchor_right(ConvertAnchorType(rule.anchor_right())), |
| - match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
| + : type_(ConvertUrlPatternType(rule.url_pattern_type())), |
| + url_pattern_(ConvertString(rule.url_pattern())), |
| + anchor_left_(ConvertAnchorType(rule.anchor_left())), |
| + anchor_right_(ConvertAnchorType(rule.anchor_right())), |
| + match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
| UrlPattern::UrlPattern(const proto::UrlRule& rule) |
| - : type(rule.url_pattern_type()), |
| - url_pattern(rule.url_pattern()), |
| - anchor_left(rule.anchor_left()), |
| - anchor_right(rule.anchor_right()), |
| - match_case(rule.match_case()) {} |
| + : type_(rule.url_pattern_type()), |
| + url_pattern_(rule.url_pattern()), |
| + anchor_left_(rule.anchor_left()), |
| + anchor_right_(rule.anchor_right()), |
| + match_case_(rule.match_case()) {} |
| UrlPattern::~UrlPattern() = default; |
| +bool UrlPattern::MatchesUrl(const GURL& url) const { |
| + DCHECK(url.is_valid()); |
|
engedy
2017/04/05 10:55:11
Could you please add a comment here that empty URL
pkalinnikov
2017/04/05 13:29:21
Done.
|
| + DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING || |
| + proto::URL_PATTERN_TYPE_WILDCARDED); |
| + |
| + StringSplitter<IsWildcard> subpatterns(url_pattern_); |
| + auto subpattern_it = subpatterns.begin(); |
| + auto subpattern_end = subpatterns.end(); |
| + |
| + if (subpattern_it == subpattern_end) { |
| + return anchor_left_ == proto::ANCHOR_TYPE_NONE || |
| + anchor_right_ == proto::ANCHOR_TYPE_NONE; |
| + } |
| + |
| + const base::StringPiece spec = url.possibly_invalid_spec(); |
| + const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; |
| + DCHECK(!spec.empty()); |
| + |
| + base::StringPiece subpattern = *subpattern_it; |
|
engedy
2017/04/05 10:55:11
nit: Could you please add a comment to explain why
pkalinnikov
2017/04/05 13:29:21
I splitted *subpattern_it++ in two lines, because
engedy
2017/04/05 13:41:33
Acknowledged.
|
| + ++subpattern_it; |
| + |
| + // If there is only one |subpattern|, and it has a right anchor, then simply |
| + // check that it is a suffix of the |spec|, and the left anchor is fulfilled. |
| + if (subpattern_it == subpattern_end && |
| + anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| + if (!EndsWithFuzzy(spec, subpattern)) |
| + return false; |
| + if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) |
| + return spec.size() == subpattern.size(); |
| + if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
| + DCHECK_LE(subpattern.size(), spec.size()); |
| + return url.has_host() && |
| + IsSubdomainAnchored(spec, host_part, |
| + spec.size() - subpattern.size()); |
| + } |
| + return true; |
| + } |
| + |
| + // Otherwise, the first |subpattern| does not have to be a suffix. But it |
| + // still can have a left anchor. Check and handle that. |
| + base::StringPiece text = spec; |
| + if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| + if (!StartsWithFuzzy(spec, subpattern)) |
| + return false; |
| + if (subpattern_it == subpattern_end) { |
| + DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
| + return true; |
| + } |
| + text.remove_prefix(subpattern.size()); |
| + } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
| + if (!url.has_host()) |
| + return false; |
| + const size_t match_begin = |
| + FindSubdomainAnchoredSubpattern(spec, host_part, subpattern); |
| + if (match_begin == base::StringPiece::npos) |
| + return false; |
| + if (subpattern_it == subpattern_end) { |
| + DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
| + return true; |
| + } |
| + text.remove_prefix(match_begin + subpattern.size()); |
| + } else { |
| + DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE); |
| + // Get back to the initial |subpattern|, process it in the loop below. |
| + subpattern_it = subpatterns.begin(); |
| + } |
| + |
| + // Consecutively find all the remaining subpatterns in the |text|. If the |
| + // pattern has a right anchor, don't search for the last subpattern, but |
| + // instead check that it is a suffix of the |text|. |
| + while (subpattern_it != subpattern_end) { |
| + subpattern = *subpattern_it; |
| + DCHECK(!subpattern.empty()); |
| + |
| + if (++subpattern_it == subpattern_end && |
| + anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| + break; |
| + } |
| + |
| + const size_t match_position = FindSubpattern(text, subpattern); |
| + if (match_position == base::StringPiece::npos) |
| + return false; |
| + text.remove_prefix(match_position + subpattern.size()); |
| + } |
| + |
| + return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY || |
| + EndsWithFuzzy(text, subpattern); |
| +} |
| + |
| +std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) { |
| + // Note: Each fall-through in this switch is intentional. |
| + switch (pattern.anchor_left()) { |
| + case proto::ANCHOR_TYPE_SUBDOMAIN: |
| + out << '|'; |
| + case proto::ANCHOR_TYPE_BOUNDARY: |
| + out << '|'; |
| + default: |
| + break; |
| + } |
| + out << pattern.url_pattern(); |
| + if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY) |
| + out << '|'; |
| + if (pattern.match_case()) |
| + out << "$match-case"; |
| + |
| + return out; |
| +} |
| + |
| } // namespace subresource_filter |