components/subresource_filter/core/common/url_pattern.cc - Issue 2793993002: [subresource_filter] Replace KMP by std::search.

Unified Diff: components/subresource_filter/core/common/url_pattern.cc

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)

Patch Set: Address comments from csharrison@ Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« components/subresource_filter/core/common/url_pattern.h ('K') | « components/subresource_filter/core/common/url_pattern.h ('k') | components/subresource_filter/core/common/url_pattern_matching.h » ('j') | components/subresource_filter/core/common/url_pattern_unittest.cc » ('J')
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/subresource_filter/core/common/url_pattern.cc

diff --git a/components/subresource_filter/core/common/url_pattern.cc b/components/subresource_filter/core/common/url_pattern.cc

index 1cfa663762abf9a0698c184c708005d6ee1f6c87..a8b018a106bfaa710b5564392ff3cc4405da1a52 100644

--- a/components/subresource_filter/core/common/url_pattern.cc

+++ b/components/subresource_filter/core/common/url_pattern.cc

@@ -2,14 +2,40 @@

// Use of this source code is governed by a BSD-style license that can be

// found in the LICENSE file.

+// The matching logic distinguishes between the terms URL pattern and

+// subpattern. A URL pattern usually stands for the full thing, e.g.

+// "example.com^*path*par=val^", whereas subpattern denotes a maximal substring

+// of a pattern not containing the wildcard '*' character. For the example above

+// the subpatterns are: "example.com^", "path" and "par=val^".

+//

+// The separator placeholder '^' symbol is used in subpatterns to match any

+// separator character, which is any ASCII symbol except letters, digits, and

+// the following: '_', '-', '.', '%'. Note that the separator placeholder

+// character '^' is itself a separator, as well as '\0'.

#include "components/subresource_filter/core/common/url_pattern.h"

+#include <stddef.h>

+#include <algorithm>

+#include <ostream>

+#include "base/logging.h"

#include "components/subresource_filter/core/common/flat/rules_generated.h"

+#include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"

+#include "components/subresource_filter/core/common/string_splitter.h"

+#include "url/gurl.h"

+#include "url/third_party/mozilla/url_parse.h"

namespace subresource_filter {

namespace {

+class IsWildcard {

+ public:

+ bool operator()(char c) const { return c == '*'; }

+};

proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {

switch (type) {

case flat::UrlPatternType_SUBSTRING:

@@ -41,36 +67,206 @@ base::StringPiece ConvertString(const flatbuffers::String* string) {

: base::StringPiece();

}

+// and corresponds to the beginning of a (sub-)domain.

+inline bool IsSubdomainAnchored(base::StringPiece url,

+ url::Component host,

+ size_t position) {

+ DCHECK_LE(position, url.size());

+ const size_t host_begin = static_cast<size_t>(host.begin);

+ const size_t host_end = static_cast<size_t>(host.end());

+ DCHECK_LE(host_end, url.size());

+ return position == host_begin ||

+ (position > host_begin && position <= host_end &&

+ url[position - 1] == '.');

+// Returns the position of the leftmost occurrence of |subpattern| in the

engedy 2017/04/05 10:55:11 nit: s/a certain/the specified/g

pkalinnikov 2017/04/05 13:29:21 Done.

+// searches for a fuzzy occurrence.

+size_t FindSubpatternImpl(base::StringPiece text,

+ base::StringPiece subpattern,

+ size_t from,

+ bool is_fuzzy) {

+ if (!is_fuzzy)

+ return text.find(subpattern, from);

+ auto* found =

+ std::search(text.begin() + from, text.end(), subpattern.begin(),

engedy 2017/04/05 10:55:11 nit: Have you considered moving this to fuzzy_patt

pkalinnikov 2017/04/05 13:29:21 Done. Also, reused the old AllOccurrences test.

+ subpattern.end(), [](char text_char, char subpattern_char) {

Charlie Harrison 2017/04/04 16:49:39 nit: Could you hoist the lambda to a member var? I

pkalinnikov 2017/04/05 13:29:21 Done.

+ return text_char == subpattern_char ||

+ (subpattern_char == kSeparatorPlaceholder &&

+ IsSeparator(text_char));

+ });

+ return found == text.end() ? base::StringPiece::npos : found - text.begin();

+// Returns the position of the leftmost occurrence of |subpattern| in the

+// placeholders, searches for a fuzzy occurrence.

+size_t FindSubpattern(base::StringPiece text,

+ base::StringPiece subpattern,

+ size_t from = 0) {

+ const bool is_fuzzy =

+ (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);

+ return FindSubpatternImpl(text, subpattern, from, is_fuzzy);

+// Same as FindSubpattern(url, subpattern), but searches for an occurrence that

+// starts in the beginning of a (sub-)domain within the url's |host| component.

engedy 2017/04/05 10:55:11 nit: ... at the beginning ...

pkalinnikov 2017/04/05 13:29:21 Done.

+size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,

+ url::Component host,

+ base::StringPiece subpattern) {

+ const bool is_fuzzy =

engedy 2017/04/05 10:55:11 Do we need to check subpattern for being empty her

pkalinnikov 2017/04/05 13:29:21 Done.

+ (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);

+ for (size_t position = FindSubpatternImpl(url, subpattern, 0, is_fuzzy);

+ position != base::StringPiece::npos;

+ position = FindSubpatternImpl(url, subpattern, ++position, is_fuzzy)) {

+ if (IsSubdomainAnchored(url, host, position))

+ return position;

+ }

+ return base::StringPiece::npos;

} // namespace

UrlPattern::UrlPattern() = default;

UrlPattern::UrlPattern(base::StringPiece url_pattern,

proto::UrlPatternType type)

- : type(type), url_pattern(url_pattern) {}

+ : type_(type), url_pattern_(url_pattern) {}

UrlPattern::UrlPattern(base::StringPiece url_pattern,

proto::AnchorType anchor_left,

proto::AnchorType anchor_right)

- : type(proto::URL_PATTERN_TYPE_WILDCARDED),

- url_pattern(url_pattern),

- anchor_left(anchor_left),

- anchor_right(anchor_right) {}

+ : type_(proto::URL_PATTERN_TYPE_WILDCARDED),

+ url_pattern_(url_pattern),

+ anchor_left_(anchor_left),

+ anchor_right_(anchor_right) {}

UrlPattern::UrlPattern(const flat::UrlRule& rule)

- : type(ConvertUrlPatternType(rule.url_pattern_type())),

- url_pattern(ConvertString(rule.url_pattern())),

- anchor_left(ConvertAnchorType(rule.anchor_left())),

- anchor_right(ConvertAnchorType(rule.anchor_right())),

- match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}

+ : type_(ConvertUrlPatternType(rule.url_pattern_type())),

+ url_pattern_(ConvertString(rule.url_pattern())),

+ anchor_left_(ConvertAnchorType(rule.anchor_left())),

+ anchor_right_(ConvertAnchorType(rule.anchor_right())),

+ match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}

UrlPattern::UrlPattern(const proto::UrlRule& rule)

- : type(rule.url_pattern_type()),

- url_pattern(rule.url_pattern()),

- anchor_left(rule.anchor_left()),

- anchor_right(rule.anchor_right()),

- match_case(rule.match_case()) {}

+ : type_(rule.url_pattern_type()),

+ url_pattern_(rule.url_pattern()),

+ anchor_left_(rule.anchor_left()),

+ anchor_right_(rule.anchor_right()),

+ match_case_(rule.match_case()) {}

UrlPattern::~UrlPattern() = default;

+bool UrlPattern::MatchesUrl(const GURL& url) const {

+ DCHECK(url.is_valid());

engedy 2017/04/05 10:55:11 Could you please add a comment here that empty URL

pkalinnikov 2017/04/05 13:29:21 Done.

+ DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||

+ proto::URL_PATTERN_TYPE_WILDCARDED);

+ StringSplitter<IsWildcard> subpatterns(url_pattern_);

+ auto subpattern_it = subpatterns.begin();

+ auto subpattern_end = subpatterns.end();

+ if (subpattern_it == subpattern_end) {

+ return anchor_left_ == proto::ANCHOR_TYPE_NONE ||

+ anchor_right_ == proto::ANCHOR_TYPE_NONE;

+ }

+ const base::StringPiece spec = url.possibly_invalid_spec();

+ const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;

+ DCHECK(!spec.empty());

+ base::StringPiece subpattern = *subpattern_it;

engedy 2017/04/05 10:55:11 nit: Could you please add a comment to explain why

pkalinnikov 2017/04/05 13:29:21 I splitted *subpattern_it++ in two lines, because

engedy 2017/04/05 13:41:33 Acknowledged.

+ ++subpattern_it;

+ // If there is only one |subpattern|, and it has a right anchor, then simply

+ // check that it is a suffix of the |spec|, and the left anchor is fulfilled.

+ if (subpattern_it == subpattern_end &&

+ anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {

+ if (!EndsWithFuzzy(spec, subpattern))

+ return false;

+ if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)

+ return spec.size() == subpattern.size();

+ if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {

+ DCHECK_LE(subpattern.size(), spec.size());

+ return url.has_host() &&

+ IsSubdomainAnchored(spec, host_part,

+ spec.size() - subpattern.size());

+ }

+ return true;

+ }

+ // Otherwise, the first |subpattern| does not have to be a suffix. But it

+ // still can have a left anchor. Check and handle that.

+ base::StringPiece text = spec;

+ if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {

+ if (!StartsWithFuzzy(spec, subpattern))

+ return false;

+ if (subpattern_it == subpattern_end) {

+ DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);

+ return true;

+ }

+ text.remove_prefix(subpattern.size());

+ } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {

+ if (!url.has_host())

+ return false;

+ const size_t match_begin =

+ FindSubdomainAnchoredSubpattern(spec, host_part, subpattern);

+ if (match_begin == base::StringPiece::npos)

+ return false;

+ if (subpattern_it == subpattern_end) {

+ DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);

+ return true;

+ }

+ text.remove_prefix(match_begin + subpattern.size());

+ } else {

+ DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);

+ // Get back to the initial |subpattern|, process it in the loop below.

+ subpattern_it = subpatterns.begin();

+ }

+ // Consecutively find all the remaining subpatterns in the |text|. If the

+ // pattern has a right anchor, don't search for the last subpattern, but

+ // instead check that it is a suffix of the |text|.

+ while (subpattern_it != subpattern_end) {

+ subpattern = *subpattern_it;

+ DCHECK(!subpattern.empty());

+ if (++subpattern_it == subpattern_end &&

+ anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {

+ break;

+ }

+ const size_t match_position = FindSubpattern(text, subpattern);

+ if (match_position == base::StringPiece::npos)

+ return false;

+ text.remove_prefix(match_position + subpattern.size());

+ }

+ return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||

+ EndsWithFuzzy(text, subpattern);

+std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {

+ // Note: Each fall-through in this switch is intentional.

+ switch (pattern.anchor_left()) {

+ case proto::ANCHOR_TYPE_SUBDOMAIN:

+ out << '|';

+ case proto::ANCHOR_TYPE_BOUNDARY:

+ out << '|';

+ default:

+ break;

+ }

+ out << pattern.url_pattern();

+ if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)

+ out << '|';

+ if (pattern.match_case())

+ out << "$match-case";

+ return out;

} // namespace subresource_filter