Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(580)

Unified Diff: components/subresource_filter/core/common/url_pattern.cc

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Fix tests. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/subresource_filter/core/common/url_pattern.cc
diff --git a/components/subresource_filter/core/common/url_pattern.cc b/components/subresource_filter/core/common/url_pattern.cc
index 1cfa663762abf9a0698c184c708005d6ee1f6c87..33a45d3d66e8a3af89bf0c050f9621ca48524cfa 100644
--- a/components/subresource_filter/core/common/url_pattern.cc
+++ b/components/subresource_filter/core/common/url_pattern.cc
@@ -2,14 +2,40 @@
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+// The matching logic distinguishes between the terms URL pattern and
+// subpattern. A URL pattern usually stands for the full thing, e.g.
+// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
+// of a pattern not containing the wildcard '*' character. For the example above
+// the subpatterns are: "example.com^", "path" and "par=val^".
+//
+// The separator placeholder '^' symbol is used in subpatterns to match any
+// separator character, which is any ASCII symbol except letters, digits, and
+// the following: '_', '-', '.', '%'. Note that the separator placeholder
+// character '^' is itself a separator, as well as '\0'.
+
#include "components/subresource_filter/core/common/url_pattern.h"
+#include <stddef.h>
+
+#include <algorithm>
+#include <ostream>
+
+#include "base/logging.h"
#include "components/subresource_filter/core/common/flat/rules_generated.h"
+#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"
+#include "components/subresource_filter/core/common/string_splitter.h"
+#include "url/gurl.h"
+#include "url/third_party/mozilla/url_parse.h"
namespace subresource_filter {
namespace {
+// Predicate that recognizes the wildcard character '*'. It is used as the
+// separator predicate of StringSplitter to cut a URL pattern into subpatterns.
+struct IsWildcard {
+  bool operator()(char c) const { return '*' == c; }
+};
+
proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
switch (type) {
case flat::UrlPatternType_SUBSTRING:
@@ -41,36 +67,187 @@ base::StringPiece ConvertString(const flatbuffers::String* string) {
: base::StringPiece();
}
+// Tells whether |position| in |url| is the start of a (sub-)domain of the
+// |host| component: either the very first character of the host, or a
+// character inside the host range (its end included) that directly follows a
+// '.'.
+inline bool IsSubdomainAnchored(base::StringPiece url,
+                                url::Component host,
+                                size_t position) {
+  DCHECK_LE(position, url.size());
+  const size_t first = static_cast<size_t>(host.begin);
+  const size_t last = static_cast<size_t>(host.end());
+  DCHECK_LE(last, url.size());
+
+  // The beginning of the host is always a domain start.
+  if (position == first)
+    return true;
+  // Anything outside of (first, last] cannot be a sub-domain start.
+  if (position < first || position > last)
+    return false;
+  // Here position > first >= 0, so looking one character back is safe.
+  return url[position - 1] == '.';
+}
+
+// A single wildcard-free piece of a URL pattern, bundled with the precomputed
+// knowledge of whether it contains separator placeholders. The class stores a
+// base::StringPiece, i.e. it does not own the characters: the string behind
+// |subpattern| has to outlive the Subpattern object.
+class Subpattern {
Charlie Harrison 2017/04/04 13:58:26 It looks like this class is only used for generati
pkalinnikov 2017/04/04 16:42:59 Done.
+ public:
+  // Explicit, so that a string is never silently converted into a Subpattern.
+  explicit Subpattern(base::StringPiece subpattern)
+      : subpattern_(subpattern),
+        has_separator_placeholders_(subpattern.find(kSeparatorPlaceholder) !=
+                                    base::StringPiece::npos) {}
+
+  // Returns the position of the leftmost occurrence of |subpattern| in the
+  // |text|, starting |from| a certain position, or base::StringPiece::npos if
+  // there is none. If the |subpattern| has separator placeholders, searches for
+  // a fuzzy occurrence, i.e. one where each placeholder may stand for any
+  // character accepted by IsSeparator.
+  size_t FindIn(base::StringPiece text, size_t from = 0) const {
+    // Without placeholders an exact substring search is sufficient.
+    if (!has_separator_placeholders_)
+      return text.find(subpattern_, from);
+
+    // |from| becomes an iterator offset below, so it must not point past the
+    // end of |text|.
+    DCHECK_LE(from, text.size());
+    const auto found = std::search(
+        text.begin() + from, text.end(), subpattern_.begin(), subpattern_.end(),
+        [](char text_char, char subpattern_char) {
+          return text_char == subpattern_char ||
+                 (subpattern_char == kSeparatorPlaceholder &&
+                  IsSeparator(text_char));
+        });
+    if (found == text.end())
+      return base::StringPiece::npos;
+    return static_cast<size_t>(found - text.begin());
+  }
+
+  // Same as FindIn(url, 0), but searches for an occurrence that starts in the
+  // beginning of a (sub-)domain within the url's |host| component. Returns
+  // base::StringPiece::npos if no occurrence is anchored that way.
+  size_t FindSubdomainAnchoredIn(base::StringPiece url,
+                                 url::Component host) const {
+    // Walk through all occurrences from left to right; after a rejected one
+    // the search resumes one character further.
+    for (size_t position = FindIn(url); position != base::StringPiece::npos;
+         position = FindIn(url, position + 1)) {
+      if (IsSubdomainAnchored(url, host, position))
+        return position;
+    }
+    return base::StringPiece::npos;
+  }
+
+ private:
+  const base::StringPiece subpattern_;
+  const bool has_separator_placeholders_;
+};
Charlie Harrison 2017/04/04 13:58:26 does it need to be copyable?
pkalinnikov 2017/04/04 16:42:59 Not relevant anymore.
+
} // namespace
// Default constructor: every member keeps the initializer given in the class
// definition (declared in url_pattern.h, not visible here).
UrlPattern::UrlPattern() = default;
// Builds a pattern of the given |type| from |url_pattern|. Members not listed
// in the initializer list (anchors, match-case flag) keep their in-class
// defaults.
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::UrlPatternType type)
- : type(type), url_pattern(url_pattern) {}
+ : type_(type), url_pattern_(url_pattern) {}
// Builds a pattern with explicit left/right anchors; the type is always
// URL_PATTERN_TYPE_WILDCARDED.
UrlPattern::UrlPattern(base::StringPiece url_pattern,
proto::AnchorType anchor_left,
proto::AnchorType anchor_right)
- : type(proto::URL_PATTERN_TYPE_WILDCARDED),
- url_pattern(url_pattern),
- anchor_left(anchor_left),
- anchor_right(anchor_right) {}
+ : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
+ url_pattern_(url_pattern),
+ anchor_left_(anchor_left),
+ anchor_right_(anchor_right) {}
// Builds a pattern from a FlatBuffers rule, converting its enums and string
// with the Convert* helpers above. The match-case flag is taken from the
// OptionFlag_IS_MATCH_CASE bit of the rule's options.
// NOTE(review): ConvertString yields a base::StringPiece; if |url_pattern_| is
// a non-owning view too, |rule| must outlive this object. Confirm in the header.
UrlPattern::UrlPattern(const flat::UrlRule& rule)
- : type(ConvertUrlPatternType(rule.url_pattern_type())),
- url_pattern(ConvertString(rule.url_pattern())),
- anchor_left(ConvertAnchorType(rule.anchor_left())),
- anchor_right(ConvertAnchorType(rule.anchor_right())),
- match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
+ : type_(ConvertUrlPatternType(rule.url_pattern_type())),
+ url_pattern_(ConvertString(rule.url_pattern())),
+ anchor_left_(ConvertAnchorType(rule.anchor_left())),
+ anchor_right_(ConvertAnchorType(rule.anchor_right())),
+ match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
// Builds a pattern from a protobuf rule; the fields are taken over as they
// are, without any conversion.
UrlPattern::UrlPattern(const proto::UrlRule& rule)
- : type(rule.url_pattern_type()),
- url_pattern(rule.url_pattern()),
- anchor_left(rule.anchor_left()),
- anchor_right(rule.anchor_right()),
- match_case(rule.match_case()) {}
+ : type_(rule.url_pattern_type()),
+ url_pattern_(rule.url_pattern()),
+ anchor_left_(rule.anchor_left()),
+ anchor_right_(rule.anchor_right()),
+ match_case_(rule.match_case()) {}
UrlPattern::~UrlPattern() = default;
+// Returns whether |url| matches this pattern.
+//
+// The pattern is split at '*' wildcards into subpatterns. The subpatterns have
+// to occur in the URL spec in order and without overlapping; the first one is
+// additionally constrained by |anchor_left_| and the last one by
+// |anchor_right_|. Comparisons go through StartsWithFuzzy / EndsWithFuzzy /
+// Subpattern::FindIn (the first two come from fuzzy_pattern_matching.h and
+// presumably apply the same '^' placeholder rule as Subpattern::FindIn).
+//
+// |url| must be valid. Note that |match_case_| is not consulted by this
+// function.
+bool UrlPattern::MatchesUrl(const GURL& url) const {
+  DCHECK(url.is_valid());
+  // Both alternatives have to be compared against |type_|; a bare enumerator
+  // on the right-hand side of || would not test |type_| at all.
+  DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
+         type_ == proto::URL_PATTERN_TYPE_WILDCARDED);
+
+  StringSplitter<IsWildcard> subpatterns(url_pattern_);
+  auto subpattern_it = subpatterns.begin();
+  auto subpattern_end = subpatterns.end();
+
+  // The pattern has no subpatterns at all: the outcome depends solely on the
+  // anchors.
+  if (subpattern_it == subpattern_end) {
+    return anchor_left_ == proto::ANCHOR_TYPE_NONE ||
+           anchor_right_ == proto::ANCHOR_TYPE_NONE;
+  }
+
+  const base::StringPiece spec = url.possibly_invalid_spec();
+  const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
+  DCHECK(!spec.empty());
+
+  // Take the first subpattern; the left anchor, if any, applies to it.
+  base::StringPiece subpattern = *subpattern_it;
+  ++subpattern_it;
Charlie Harrison 2017/04/04 13:58:26 Can you include comments above these code paragrap
Charlie Harrison 2017/04/04 13:58:26 nit: Can you break this into two lines for clarity
pkalinnikov 2017/04/04 16:42:59 Done.
pkalinnikov 2017/04/04 16:42:59 How about this?
Charlie Harrison 2017/04/04 16:49:39 looks good.
+  // Special case: a single subpattern that is anchored on the right. It has to
+  // sit exactly at the end of the spec, which fixes its position, so only the
+  // left anchor remains to be verified.
+  if (subpattern_it == subpattern_end &&
+      anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
+    if (!EndsWithFuzzy(spec, subpattern))
+      return false;
+    if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)
+      return spec.size() == subpattern.size();
+    if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
+      DCHECK_LE(subpattern.size(), spec.size());
+      return url.has_host() &&
+             IsSubdomainAnchored(spec, host_part,
+                                 spec.size() - subpattern.size());
+    }
+    return true;
+  }
+
+  // Consume the first subpattern as required by the left anchor. |text| is the
+  // part of the spec still available to the remaining subpatterns.
+  base::StringPiece text = spec;
+  if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {
+    // The first subpattern must be a prefix of the spec.
+    if (!StartsWithFuzzy(spec, subpattern))
+      return false;
+    if (subpattern_it == subpattern_end)
+      return true;
+    text.remove_prefix(subpattern.size());
+  } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
+    // The first subpattern must start at the beginning of a (sub-)domain.
+    if (!url.has_host())
+      return false;
+    const size_t match_begin =
+        Subpattern(subpattern).FindSubdomainAnchoredIn(spec, host_part);
+    if (match_begin == base::StringPiece::npos)
+      return false;
+    if (subpattern_it == subpattern_end)
+      return true;
+    text.remove_prefix(match_begin + subpattern.size());
+  } else {
+    DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);
+    // Get back to the initial subpattern, process it in the loop below.
+    subpattern_it = subpatterns.begin();
+  }
+
+  // Match the remaining subpatterns from left to right, each at its leftmost
+  // occurrence in what is left of |text|. A right-anchored last subpattern is
+  // not searched for here; it is compared against the end of |text| below.
+  while (subpattern_it != subpattern_end) {
+    subpattern = *subpattern_it;
+    ++subpattern_it;
+    DCHECK(!subpattern.empty());
+
+    if (subpattern_it == subpattern_end &&
+        anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
+      break;
+    }
+
+    const size_t match_position = Subpattern(subpattern).FindIn(text);
+    if (match_position == base::StringPiece::npos)
+      return false;
+    text.remove_prefix(match_position + subpattern.size());
+  }
+
+  // Without a right anchor all subpatterns have been found at this point.
+  // Otherwise |subpattern| holds the last one, which must end the URL.
+  return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||
+         EndsWithFuzzy(text, subpattern);
+}
+
+// Writes |pattern| to |out| in a textual form: "||" for a subdomain left
+// anchor or "|" for a boundary left anchor, then the pattern string, then "|"
+// for a boundary right anchor, and finally "$match-case" if match_case() is
+// set. Returns |out| to allow chaining.
+std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
+  const auto left_anchor = pattern.anchor_left();
+  // A subdomain anchor is rendered as two bars and a boundary anchor as one;
+  // the bars are inserted one character at a time.
+  if (left_anchor == proto::ANCHOR_TYPE_SUBDOMAIN)
+    out << '|';
+  if (left_anchor == proto::ANCHOR_TYPE_SUBDOMAIN ||
+      left_anchor == proto::ANCHOR_TYPE_BOUNDARY) {
+    out << '|';
+  }
+
+  out << pattern.url_pattern();
+
+  if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
+    out << '|';
+  if (pattern.match_case())
+    out << "$match-case";
+  return out;
+}
+
} // namespace subresource_filter

Powered by Google App Engine
This is Rietveld 408576698