components/subresource_filter/core/common/url_pattern.cc - Issue 2793993002: [subresource_filter] Replace KMP by std::search.

Side by Side Diff: components/subresource_filter/core/common/url_pattern.cc

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)

Patch Set: Address comments from csharrison@ Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

« components/subresource_filter/core/common/url_pattern.h ('K') | « components/subresource_filter/core/common/url_pattern.h ('k') | components/subresource_filter/core/common/url_pattern_matching.h » ('j') | components/subresource_filter/core/common/url_pattern_unittest.cc » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

OLD	NEW
1 // Copyright 2016 The Chromium Authors. All rights reserved.	1 // Copyright 2016 The Chromium Authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

	5 // The matching logic distinguishes between the terms URL pattern and

	6 // subpattern. A URL pattern usually stands for the full thing, e.g.

	7 // "example.com^pathpar=val^", whereas subpattern denotes a maximal substring

	8 // of a pattern not containing the wildcard '*' character. For the example above

	9 // the subpatterns are: "example.com^", "path" and "par=val^".

	10 //

	11 // The separator placeholder '^' symbol is used in subpatterns to match any

	12 // separator character, which is any ASCII symbol except letters, digits, and

	13 // the following: '_', '-', '.', '%'. Note that the separator placeholder

	14 // character '^' is itself a separator, as well as '\0'.

	15

5 #include "components/subresource_filter/core/common/url_pattern.h"	16 #include "components/subresource_filter/core/common/url_pattern.h"

6	17

	18 #include <stddef.h>

	19

	20 #include <algorithm>

	21 #include <ostream>

	22

	23 #include "base/logging.h"

7 #include "components/subresource_filter/core/common/flat/rules_generated.h"	24 #include "components/subresource_filter/core/common/flat/rules_generated.h"

	25 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"

	26 #include "components/subresource_filter/core/common/string_splitter.h"

	27 #include "url/gurl.h"

	28 #include "url/third_party/mozilla/url_parse.h"

8	29

9 namespace subresource_filter {	30 namespace subresource_filter {

10	31

11 namespace {	32 namespace {

12	33

	34 class IsWildcard {

	35 public:

	36 bool operator()(char c) const { return c == '*'; }

	37 };

	38

13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {	39 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {

14 switch (type) {	40 switch (type) {

15 case flat::UrlPatternType_SUBSTRING:	41 case flat::UrlPatternType_SUBSTRING:

16 return proto::URL_PATTERN_TYPE_SUBSTRING;	42 return proto::URL_PATTERN_TYPE_SUBSTRING;

17 case flat::UrlPatternType_WILDCARDED:	43 case flat::UrlPatternType_WILDCARDED:

18 return proto::URL_PATTERN_TYPE_WILDCARDED;	44 return proto::URL_PATTERN_TYPE_WILDCARDED;

19 case flat::UrlPatternType_REGEXP:	45 case flat::UrlPatternType_REGEXP:

20 return proto::URL_PATTERN_TYPE_REGEXP;	46 return proto::URL_PATTERN_TYPE_REGEXP;

21 default:	47 default:

22 return proto::URL_PATTERN_TYPE_UNSPECIFIED;	48 return proto::URL_PATTERN_TYPE_UNSPECIFIED;

(...skipping 11 matching lines...) Expand all Loading...
34 default:	60 default:

35 return proto::ANCHOR_TYPE_UNSPECIFIED;	61 return proto::ANCHOR_TYPE_UNSPECIFIED;

36 }	62 }

37 }	63 }

38	64

39 base::StringPiece ConvertString(const flatbuffers::String* string) {	65 base::StringPiece ConvertString(const flatbuffers::String* string) {

40 return string ? base::StringPiece(string->data(), string->size())	66 return string ? base::StringPiece(string->data(), string->size())

41 : base::StringPiece();	67 : base::StringPiece();

42 }	68 }

43	69

	70 // Returns whether \|position\| within the \|url\| belongs to its \|host\| component

	71 // and corresponds to the beginning of a (sub-)domain.

	72 inline bool IsSubdomainAnchored(base::StringPiece url,

	73 url::Component host,

	74 size_t position) {

	75 DCHECK_LE(position, url.size());

	76 const size_t host_begin = static_cast<size_t>(host.begin);

	77 const size_t host_end = static_cast<size_t>(host.end());

	78 DCHECK_LE(host_end, url.size());

	79

	80 return position == host_begin \|\|

	81 (position > host_begin && position <= host_end &&

	82 url[position - 1] == '.');

	83 }

	84

	85 // Returns the position of the leftmost occurrence of \|subpattern\| in the

	86 // \|text\|, starting \|from\| a certain position. If \|is_fuzzy\| then
	engedy 2017/04/05 10:55:11 nit: s/a certain/the specified/g nit: s/a certain/the specified/g pkalinnikov 2017/04/05 13:29:21 Done. Show quoted text On 2017/04/05 10:55:11, engedy wrote: > nit: s/a certain/the specified/g Done.
	87 // searches for a fuzzy occurrence.

	88 size_t FindSubpatternImpl(base::StringPiece text,

	89 base::StringPiece subpattern,

	90 size_t from,

	91 bool is_fuzzy) {

	92 if (!is_fuzzy)

	93 return text.find(subpattern, from);

	94

	95 auto* found =

	96 std::search(text.begin() + from, text.end(), subpattern.begin(),
	engedy 2017/04/05 10:55:11 nit: Have you considered moving this to fuzzy_patt nit: Have you considered moving this to fuzzy_pattern_matching.h? pkalinnikov 2017/04/05 13:29:21 Done. Also, reused the old AllOccurrences test. Show quoted text On 2017/04/05 10:55:11, engedy wrote: > nit: Have you considered moving this to fuzzy_pattern_matching.h? Done. Also, reused the old AllOccurrences test.
	97 subpattern.end(), [](char text_char, char subpattern_char) {
	Charlie Harrison 2017/04/04 16:49:39 nit: Could you hoist the lambda to a member var? I nit: Could you hoist the lambda to a member var? It is more readable to me as: auto searcher = [](char text_char, char subpattern_char) { return text_char == subpattern_char \|\| (subpattern_char == kSeparatorPlaceholder && IsSeparator(text_char)); }; auto* found = std::search(text.begin() + from, text.end(), subpattern.begin(), subpattern.end(), searcher); pkalinnikov 2017/04/05 13:29:21 Done. Show quoted text On 2017/04/04 16:49:39, Charlie Harrison wrote: > nit: Could you hoist the lambda to a member var? It is more readable to me as: > > auto searcher = [](char text_char, char subpattern_char) { > return text_char == subpattern_char \|\| > (subpattern_char == kSeparatorPlaceholder && > IsSeparator(text_char)); > }; > auto* found = std::search(text.begin() + from, text.end(), > subpattern.begin(), subpattern.end(), searcher); Done.
	98 return text_char == subpattern_char \|\|

	99 (subpattern_char == kSeparatorPlaceholder &&

	100 IsSeparator(text_char));

	101 });

	102 return found == text.end() ? base::StringPiece::npos : found - text.begin();

	103 }

	104

	105 // Returns the position of the leftmost occurrence of \|subpattern\| in the

	106 // \|text\|, starting \|from\| a certain position. If the \|subpattern\| has separator

	107 // placeholders, searches for a fuzzy occurrence.

	108 size_t FindSubpattern(base::StringPiece text,

	109 base::StringPiece subpattern,

	110 size_t from = 0) {

	111 const bool is_fuzzy =

	112 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);

	113 return FindSubpatternImpl(text, subpattern, from, is_fuzzy);

	114 }

	115

	116 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that

	117 // starts in the beginning of a (sub-)domain within the url's \|host\| component.
	engedy 2017/04/05 10:55:11 nit: ... at the beginning ... nit: ... at the beginning ... pkalinnikov 2017/04/05 13:29:21 Done. Show quoted text On 2017/04/05 10:55:11, engedy wrote: > nit: ... at the beginning ... Done.
	118 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,

	119 url::Component host,

	120 base::StringPiece subpattern) {

	121 const bool is_fuzzy =
	engedy 2017/04/05 10:55:11 Do we need to check subpattern for being empty her Do we need to check subpattern for being empty here? Looks like in that case, we might increment the iterator in line 96 beyond text.end(), which might not be allowed for some kind of things backing the StringPiece. Alternatively, we could do some bounds checking in FindSubpatternImpl or somehow enforce that this never happens. pkalinnikov 2017/04/05 13:29:21 Done. Show quoted text On 2017/04/05 10:55:11, engedy wrote: > Do we need to check subpattern for being empty here? Looks like in that case, we > might increment the iterator in line 96 beyond text.end(), which might not be > allowed for some kind of things backing the StringPiece. Alternatively, we could > do some bounds checking in FindSubpatternImpl or somehow enforce that this never > happens. Done.
	122 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);

	123 for (size_t position = FindSubpatternImpl(url, subpattern, 0, is_fuzzy);

	124 position != base::StringPiece::npos;

	125 position = FindSubpatternImpl(url, subpattern, ++position, is_fuzzy)) {

	126 if (IsSubdomainAnchored(url, host, position))

	127 return position;

	128 }

	129 return base::StringPiece::npos;

	130 }

	131

44 } // namespace	132 } // namespace

45	133

46 UrlPattern::UrlPattern() = default;	134 UrlPattern::UrlPattern() = default;

47	135

48 UrlPattern::UrlPattern(base::StringPiece url_pattern,	136 UrlPattern::UrlPattern(base::StringPiece url_pattern,

49 proto::UrlPatternType type)	137 proto::UrlPatternType type)

50 : type(type), url_pattern(url_pattern) {}	138 : type_(type), url_pattern_(url_pattern) {}

51	139

52 UrlPattern::UrlPattern(base::StringPiece url_pattern,	140 UrlPattern::UrlPattern(base::StringPiece url_pattern,

53 proto::AnchorType anchor_left,	141 proto::AnchorType anchor_left,

54 proto::AnchorType anchor_right)	142 proto::AnchorType anchor_right)

55 : type(proto::URL_PATTERN_TYPE_WILDCARDED),	143 : type_(proto::URL_PATTERN_TYPE_WILDCARDED),

56 url_pattern(url_pattern),	144 url_pattern_(url_pattern),

57 anchor_left(anchor_left),	145 anchor_left_(anchor_left),

58 anchor_right(anchor_right) {}	146 anchor_right_(anchor_right) {}

59	147

60 UrlPattern::UrlPattern(const flat::UrlRule& rule)	148 UrlPattern::UrlPattern(const flat::UrlRule& rule)

61 : type(ConvertUrlPatternType(rule.url_pattern_type())),	149 : type_(ConvertUrlPatternType(rule.url_pattern_type())),

62 url_pattern(ConvertString(rule.url_pattern())),	150 url_pattern_(ConvertString(rule.url_pattern())),

63 anchor_left(ConvertAnchorType(rule.anchor_left())),	151 anchor_left_(ConvertAnchorType(rule.anchor_left())),

64 anchor_right(ConvertAnchorType(rule.anchor_right())),	152 anchor_right_(ConvertAnchorType(rule.anchor_right())),

65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}	153 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}

66	154

67 UrlPattern::UrlPattern(const proto::UrlRule& rule)	155 UrlPattern::UrlPattern(const proto::UrlRule& rule)

68 : type(rule.url_pattern_type()),	156 : type_(rule.url_pattern_type()),

69 url_pattern(rule.url_pattern()),	157 url_pattern_(rule.url_pattern()),

70 anchor_left(rule.anchor_left()),	158 anchor_left_(rule.anchor_left()),

71 anchor_right(rule.anchor_right()),	159 anchor_right_(rule.anchor_right()),

72 match_case(rule.match_case()) {}	160 match_case_(rule.match_case()) {}

73	161

74 UrlPattern::~UrlPattern() = default;	162 UrlPattern::~UrlPattern() = default;

75	163

	164 bool UrlPattern::MatchesUrl(const GURL& url) const {

	165 DCHECK(url.is_valid());
	engedy 2017/04/05 10:55:11 Could you please add a comment here that empty URL Could you please add a comment here that empty URLs are also invalid? pkalinnikov 2017/04/05 13:29:21 Done. Show quoted text On 2017/04/05 10:55:11, engedy wrote: > Could you please add a comment here that empty URLs are also invalid? Done.
	166 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING \|\|

	167 proto::URL_PATTERN_TYPE_WILDCARDED);

	168

	169 StringSplitter<IsWildcard> subpatterns(url_pattern_);

	170 auto subpattern_it = subpatterns.begin();

	171 auto subpattern_end = subpatterns.end();

	172

	173 if (subpattern_it == subpattern_end) {

	174 return anchor_left_ == proto::ANCHOR_TYPE_NONE \|\|

	175 anchor_right_ == proto::ANCHOR_TYPE_NONE;

	176 }

	177

	178 const base::StringPiece spec = url.possibly_invalid_spec();

	179 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;

	180 DCHECK(!spec.empty());

	181

	182 base::StringPiece subpattern = *subpattern_it;
	engedy 2017/04/05 10:55:11 nit: Could you please add a comment to explain why nit: Could you please add a comment to explain why we expect this way is faster? (Does the compiler not optimize this anyway?) pkalinnikov 2017/04/05 13:29:21 I splitted subpattern_it++ in two lines, because Show quoted text On 2017/04/05 10:55:11, engedy wrote: > nit: Could you please add a comment to explain why we expect this way is faster? > (Does the compiler not optimize this anyway?) I splitted subpattern_it++ in two lines, because csharrison@ had a readability concern. engedy 2017/04/05 13:41:33 Acknowledged. Show quoted text On 2017/04/05 13:29:21, pkalinnikov wrote: > On 2017/04/05 10:55:11, engedy wrote: > > nit: Could you please add a comment to explain why we expect this way is > faster? > > (Does the compiler not optimize this anyway?) > > I splitted *subpattern_it++ in two lines, because csharrison@ had a readability > concern. Acknowledged.
	183 ++subpattern_it;

	184

	185 // If there is only one \|subpattern\|, and it has a right anchor, then simply

	186 // check that it is a suffix of the \|spec\|, and the left anchor is fulfilled.

	187 if (subpattern_it == subpattern_end &&

	188 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {

	189 if (!EndsWithFuzzy(spec, subpattern))

	190 return false;

	191 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)

	192 return spec.size() == subpattern.size();

	193 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {

	194 DCHECK_LE(subpattern.size(), spec.size());

	195 return url.has_host() &&

	196 IsSubdomainAnchored(spec, host_part,

	197 spec.size() - subpattern.size());

	198 }

	199 return true;

	200 }

	201

	202 // Otherwise, the first \|subpattern\| does not have to be a suffix. But it

	203 // still can have a left anchor. Check and handle that.

	204 base::StringPiece text = spec;

	205 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {

	206 if (!StartsWithFuzzy(spec, subpattern))

	207 return false;

	208 if (subpattern_it == subpattern_end) {

	209 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);

	210 return true;

	211 }

	212 text.remove_prefix(subpattern.size());

	213 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {

	214 if (!url.has_host())

	215 return false;

	216 const size_t match_begin =

	217 FindSubdomainAnchoredSubpattern(spec, host_part, subpattern);

	218 if (match_begin == base::StringPiece::npos)

	219 return false;

	220 if (subpattern_it == subpattern_end) {

	221 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);

	222 return true;

	223 }

	224 text.remove_prefix(match_begin + subpattern.size());

	225 } else {

	226 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);

	227 // Get back to the initial \|subpattern\|, process it in the loop below.

	228 subpattern_it = subpatterns.begin();

	229 }

	230

	231 // Consecutively find all the remaining subpatterns in the \|text\|. If the

	232 // pattern has a right anchor, don't search for the last subpattern, but

	233 // instead check that it is a suffix of the \|text\|.

	234 while (subpattern_it != subpattern_end) {

	235 subpattern = *subpattern_it;

	236 DCHECK(!subpattern.empty());

	237

	238 if (++subpattern_it == subpattern_end &&

	239 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {

	240 break;

	241 }

	242

	243 const size_t match_position = FindSubpattern(text, subpattern);

	244 if (match_position == base::StringPiece::npos)

	245 return false;

	246 text.remove_prefix(match_position + subpattern.size());

	247 }

	248

	249 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY \|\|

	250 EndsWithFuzzy(text, subpattern);

	251 }

	252

	253 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {

	254 // Note: Each fall-through in this switch is intentional.

	255 switch (pattern.anchor_left()) {

	256 case proto::ANCHOR_TYPE_SUBDOMAIN:

	257 out << '\|';

	258 case proto::ANCHOR_TYPE_BOUNDARY:

	259 out << '\|';

	260 default:

	261 break;

	262 }

	263 out << pattern.url_pattern();

	264 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)

	265 out << '\|';

	266 if (pattern.match_case())

	267 out << "$match-case";

	268

	269 return out;

	270 }

	271

76 } // namespace subresource_filter	272 } // namespace subresource_filter

OLD	NEW