| Index: components/subresource_filter/core/common/url_pattern.cc
|
| diff --git a/components/subresource_filter/core/common/url_pattern.cc b/components/subresource_filter/core/common/url_pattern.cc
|
| index 1cfa663762abf9a0698c184c708005d6ee1f6c87..d601f41e6d55bfc2a1521cab49fdcf06c8f33688 100644
|
| --- a/components/subresource_filter/core/common/url_pattern.cc
|
| +++ b/components/subresource_filter/core/common/url_pattern.cc
|
| @@ -2,14 +2,39 @@
|
| // Use of this source code is governed by a BSD-style license that can be
|
| // found in the LICENSE file.
|
|
|
| +// The matching logic distinguishes between the terms URL pattern and
|
| +// subpattern. A URL pattern usually stands for the full thing, e.g.
|
| +// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
|
| +// of a pattern not containing the wildcard '*' character. For the example above
|
| +// the subpatterns are: "example.com^", "path" and "par=val^".
|
| +//
|
| +// The separator placeholder '^' symbol is used in subpatterns to match any
|
| +// separator character, which is any ASCII symbol except letters, digits, and
|
| +// the following: '_', '-', '.', '%'. Note that the separator placeholder
|
| +// character '^' is itself a separator, as well as '\0'.
|
| +
|
| #include "components/subresource_filter/core/common/url_pattern.h"
|
|
|
| +#include <stddef.h>
|
| +
|
| +#include <ostream>
|
| +
|
| +#include "base/logging.h"
|
| #include "components/subresource_filter/core/common/flat/rules_generated.h"
|
| +#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"
|
| +#include "components/subresource_filter/core/common/string_splitter.h"
|
| +#include "url/gurl.h"
|
| +#include "url/third_party/mozilla/url_parse.h"
|
|
|
| namespace subresource_filter {
|
|
|
| namespace {
|
|
|
| +class IsWildcard {
|
| + public:
|
| + bool operator()(char c) const { return c == '*'; }
|
| +};
|
| +
|
| proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
|
| switch (type) {
|
| case flat::UrlPatternType_SUBSTRING:
|
| @@ -41,36 +66,191 @@ base::StringPiece ConvertString(const flatbuffers::String* string) {
|
| : base::StringPiece();
|
| }
|
|
|
| +// Returns whether |position| within the |url| belongs to its |host| component
|
| +// and corresponds to the beginning of a (sub-)domain.
|
| +inline bool IsSubdomainAnchored(base::StringPiece url,
|
| + url::Component host,
|
| + size_t position) {
|
| + DCHECK_LE(position, url.size());
|
| + const size_t host_begin = static_cast<size_t>(host.begin);
|
| + const size_t host_end = static_cast<size_t>(host.end());
|
| + DCHECK_LE(host_end, url.size());
|
| +
|
| + return position == host_begin ||
|
| + (position > host_begin && position <= host_end &&
|
| + url[position - 1] == '.');
|
| +}
|
| +
|
| +// Returns the position of the leftmost occurrence of a |subpattern| in the
|
| +// |text| starting no earlier than |from| the specified position. If the
|
| +// |subpattern| has separator placeholders, searches for a fuzzy occurrence.
|
| +size_t FindSubpattern(base::StringPiece text,
|
| + base::StringPiece subpattern,
|
| + size_t from = 0) {
|
| + const bool is_fuzzy =
|
| + (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
|
| + return is_fuzzy ? FindFuzzy(text, subpattern, from)
|
| + : text.find(subpattern, from);
|
| +}
|
| +
|
| +// Same as FindSubpattern(url, subpattern), but searches for an occurrence that
|
| +// starts at the beginning of a (sub-)domain within the url's |host| component.
|
| +size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
|
| + url::Component host,
|
| + base::StringPiece subpattern) {
|
| + const bool is_fuzzy =
|
| + (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
|
| +
|
| + for (size_t position = 0; position <= url.size(); ++position) {
|
| + position = is_fuzzy ? FindFuzzy(url, subpattern, position)
|
| + : url.find(subpattern, position);
|
| + if (position == base::StringPiece::npos ||
|
| + IsSubdomainAnchored(url, host, position)) {
|
| + return position;
|
| + }
|
| + }
|
| + return base::StringPiece::npos;
|
| +}
|
| +
|
| } // namespace
|
|
|
| UrlPattern::UrlPattern() = default;
|
|
|
| UrlPattern::UrlPattern(base::StringPiece url_pattern,
|
| proto::UrlPatternType type)
|
| - : type(type), url_pattern(url_pattern) {}
|
| + : type_(type), url_pattern_(url_pattern) {}
|
|
|
| UrlPattern::UrlPattern(base::StringPiece url_pattern,
|
| proto::AnchorType anchor_left,
|
| proto::AnchorType anchor_right)
|
| - : type(proto::URL_PATTERN_TYPE_WILDCARDED),
|
| - url_pattern(url_pattern),
|
| - anchor_left(anchor_left),
|
| - anchor_right(anchor_right) {}
|
| + : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
|
| + url_pattern_(url_pattern),
|
| + anchor_left_(anchor_left),
|
| + anchor_right_(anchor_right) {}
|
|
|
| UrlPattern::UrlPattern(const flat::UrlRule& rule)
|
| - : type(ConvertUrlPatternType(rule.url_pattern_type())),
|
| - url_pattern(ConvertString(rule.url_pattern())),
|
| - anchor_left(ConvertAnchorType(rule.anchor_left())),
|
| - anchor_right(ConvertAnchorType(rule.anchor_right())),
|
| - match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
|
| + : type_(ConvertUrlPatternType(rule.url_pattern_type())),
|
| + url_pattern_(ConvertString(rule.url_pattern())),
|
| + anchor_left_(ConvertAnchorType(rule.anchor_left())),
|
| + anchor_right_(ConvertAnchorType(rule.anchor_right())),
|
| + match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
|
|
|
| UrlPattern::UrlPattern(const proto::UrlRule& rule)
|
| - : type(rule.url_pattern_type()),
|
| - url_pattern(rule.url_pattern()),
|
| - anchor_left(rule.anchor_left()),
|
| - anchor_right(rule.anchor_right()),
|
| - match_case(rule.match_case()) {}
|
| + : type_(rule.url_pattern_type()),
|
| + url_pattern_(rule.url_pattern()),
|
| + anchor_left_(rule.anchor_left()),
|
| + anchor_right_(rule.anchor_right()),
|
| + match_case_(rule.match_case()) {}
|
|
|
| UrlPattern::~UrlPattern() = default;
|
|
|
| +bool UrlPattern::MatchesUrl(const GURL& url) const {
|
| + // Note: Empty domains are also invalid.
|
| + DCHECK(url.is_valid());
|
| + DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
|
| + type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
|
| +
|
| + StringSplitter<IsWildcard> subpatterns(url_pattern_);
|
| + auto subpattern_it = subpatterns.begin();
|
| + auto subpattern_end = subpatterns.end();
|
| +
|
| + if (subpattern_it == subpattern_end) {
|
| + return anchor_left_ == proto::ANCHOR_TYPE_NONE ||
|
| + anchor_right_ == proto::ANCHOR_TYPE_NONE;
|
| + }
|
| +
|
| + const base::StringPiece spec = url.possibly_invalid_spec();
|
| + const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
|
| + DCHECK(!spec.empty());
|
| +
|
| + base::StringPiece subpattern = *subpattern_it;
|
| + ++subpattern_it;
|
| +
|
| + // If there is only one |subpattern|, and it has a right anchor, then simply
|
| + // check that it is a suffix of the |spec|, and the left anchor is fulfilled.
|
| + if (subpattern_it == subpattern_end &&
|
| + anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
|
| + if (!EndsWithFuzzy(spec, subpattern))
|
| + return false;
|
| + if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)
|
| + return spec.size() == subpattern.size();
|
| + if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
|
| + DCHECK_LE(subpattern.size(), spec.size());
|
| + return url.has_host() &&
|
| + IsSubdomainAnchored(spec, host_part,
|
| + spec.size() - subpattern.size());
|
| + }
|
| + return true;
|
| + }
|
| +
|
| + // Otherwise, the first |subpattern| does not have to be a suffix. But it
|
| + // still can have a left anchor. Check and handle that.
|
| + base::StringPiece text = spec;
|
| + if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {
|
| + if (!StartsWithFuzzy(spec, subpattern))
|
| + return false;
|
| + if (subpattern_it == subpattern_end) {
|
| + DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
|
| + return true;
|
| + }
|
| + text.remove_prefix(subpattern.size());
|
| + } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
|
| + if (!url.has_host())
|
| + return false;
|
| + const size_t match_begin =
|
| + FindSubdomainAnchoredSubpattern(spec, host_part, subpattern);
|
| + if (match_begin == base::StringPiece::npos)
|
| + return false;
|
| + if (subpattern_it == subpattern_end) {
|
| + DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
|
| + return true;
|
| + }
|
| + text.remove_prefix(match_begin + subpattern.size());
|
| + } else {
|
| + DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);
|
| + // Get back to the initial |subpattern|, process it in the loop below.
|
| + subpattern_it = subpatterns.begin();
|
| + }
|
| +
|
| + // Consecutively find all the remaining subpatterns in the |text|. If the
|
| + // pattern has a right anchor, don't search for the last subpattern, but
|
| + // instead check that it is a suffix of the |text|.
|
| + while (subpattern_it != subpattern_end) {
|
| + subpattern = *subpattern_it;
|
| + DCHECK(!subpattern.empty());
|
| +
|
| + if (++subpattern_it == subpattern_end &&
|
| + anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
|
| + break;
|
| + }
|
| +
|
| + const size_t match_position = FindSubpattern(text, subpattern);
|
| + if (match_position == base::StringPiece::npos)
|
| + return false;
|
| + text.remove_prefix(match_position + subpattern.size());
|
| + }
|
| +
|
| + return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||
|
| + EndsWithFuzzy(text, subpattern);
|
| +}
|
| +
|
| +std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
|
| + // Note: Each fall-through in this switch is intentional.
|
| + switch (pattern.anchor_left()) {
|
| + case proto::ANCHOR_TYPE_SUBDOMAIN:
|
| + out << '|';
|
| + case proto::ANCHOR_TYPE_BOUNDARY:
|
| + out << '|';
|
| + default:
|
| + break;
|
| + }
|
| + out << pattern.url_pattern();
|
| + if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
|
| + out << '|';
|
| + if (pattern.match_case())
|
| + out << "$match-case";
|
| +
|
| + return out;
|
| +}
|
| +
|
| } // namespace subresource_filter
|
|
|