Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(46)

Side by Side Diff: components/subresource_filter/core/common/url_pattern.cc

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Clean up. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // The matching logic distinguishes between the terms URL pattern and
6 // subpattern. A URL pattern usually stands for the full thing, e.g.
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8 // of a pattern not containing the wildcard '*' character. For the example above
9 // the subpatterns are: "example.com^", "path" and "par=val^".
10 //
11 // The separator placeholder '^' symbol is used in subpatterns to match any
12 // separator character, which is any ASCII symbol except letters, digits, and
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder
14 // character '^' is itself a separator, as well as '\0'.
15
5 #include "components/subresource_filter/core/common/url_pattern.h" 16 #include "components/subresource_filter/core/common/url_pattern.h"
6 17
18 #include <stddef.h>
19
20 #include <ostream>
21
22 #include "base/logging.h"
7 #include "components/subresource_filter/core/common/flat/rules_generated.h" 23 #include "components/subresource_filter/core/common/flat/rules_generated.h"
24 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"
25 #include "components/subresource_filter/core/common/string_splitter.h"
26 #include "url/gurl.h"
27 #include "url/third_party/mozilla/url_parse.h"
8 28
9 namespace subresource_filter { 29 namespace subresource_filter {
10 30
11 namespace { 31 namespace {
12 32
33 class IsWildcard {
34 public:
35 bool operator()(char c) const { return c == '*'; }
36 };
37
13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { 38 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
14 switch (type) { 39 switch (type) {
15 case flat::UrlPatternType_SUBSTRING: 40 case flat::UrlPatternType_SUBSTRING:
16 return proto::URL_PATTERN_TYPE_SUBSTRING; 41 return proto::URL_PATTERN_TYPE_SUBSTRING;
17 case flat::UrlPatternType_WILDCARDED: 42 case flat::UrlPatternType_WILDCARDED:
18 return proto::URL_PATTERN_TYPE_WILDCARDED; 43 return proto::URL_PATTERN_TYPE_WILDCARDED;
19 case flat::UrlPatternType_REGEXP: 44 case flat::UrlPatternType_REGEXP:
20 return proto::URL_PATTERN_TYPE_REGEXP; 45 return proto::URL_PATTERN_TYPE_REGEXP;
21 default: 46 default:
22 return proto::URL_PATTERN_TYPE_UNSPECIFIED; 47 return proto::URL_PATTERN_TYPE_UNSPECIFIED;
(...skipping 11 matching lines...) Expand all
34 default: 59 default:
35 return proto::ANCHOR_TYPE_UNSPECIFIED; 60 return proto::ANCHOR_TYPE_UNSPECIFIED;
36 } 61 }
37 } 62 }
38 63
39 base::StringPiece ConvertString(const flatbuffers::String* string) { 64 base::StringPiece ConvertString(const flatbuffers::String* string) {
40 return string ? base::StringPiece(string->data(), string->size()) 65 return string ? base::StringPiece(string->data(), string->size())
41 : base::StringPiece(); 66 : base::StringPiece();
42 } 67 }
43 68
69 // Returns whether |position| within the |url| belongs to its |host| component
70 // and corresponds to the beginning of a (sub-)domain.
71 inline bool IsSubdomainAnchored(base::StringPiece url,
72 url::Component host,
73 size_t position) {
74 DCHECK_LE(position, url.size());
75 const size_t host_begin = static_cast<size_t>(host.begin);
76 const size_t host_end = static_cast<size_t>(host.end());
77 DCHECK_LE(host_end, url.size());
78
79 return position == host_begin ||
80 (position > host_begin && position <= host_end &&
81 url[position - 1] == '.');
82 }
83
84 // Returns the position of the leftmost occurrence of a |subpattern| in the
85 // |text|, starting |from| a certain position. If the |subpattern| has separator
86 // placeholders, searches for a fuzzy occurrence.
87 size_t FindSubpattern(base::StringPiece text,
88 base::StringPiece subpattern,
89 size_t from = 0) {
90 const bool is_fuzzy =
91 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
92 return is_fuzzy ? FindFuzzy(text, subpattern, from)
93 : text.find(subpattern, from);
94 }
95
96 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that
97 // starts at the beginning of a (sub-)domain within the url's |host| component.
98 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url,
99 url::Component host,
100 base::StringPiece subpattern) {
101 const bool is_fuzzy =
102 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos);
103
104 for (size_t position = 0; position <= url.size(); ++position) {
105 position = is_fuzzy ? FindFuzzy(url, subpattern, position)
106 : url.find(subpattern, position);
107 if (position == base::StringPiece::npos ||
engedy 2017/04/05 13:41:33 Shouldn't we return npos in case the first term is
pkalinnikov 2017/04/05 13:48:47 If the first term is true, then "return position"
engedy 2017/04/05 13:55:40 Oh, of course. :)
108 IsSubdomainAnchored(url, host, position)) {
109 return position;
110 }
111 }
112 return base::StringPiece::npos;
113 }
114
44 } // namespace 115 } // namespace
45 116
46 UrlPattern::UrlPattern() = default; 117 UrlPattern::UrlPattern() = default;
47 118
48 UrlPattern::UrlPattern(base::StringPiece url_pattern, 119 UrlPattern::UrlPattern(base::StringPiece url_pattern,
49 proto::UrlPatternType type) 120 proto::UrlPatternType type)
50 : type(type), url_pattern(url_pattern) {} 121 : type_(type), url_pattern_(url_pattern) {}
51 122
52 UrlPattern::UrlPattern(base::StringPiece url_pattern, 123 UrlPattern::UrlPattern(base::StringPiece url_pattern,
53 proto::AnchorType anchor_left, 124 proto::AnchorType anchor_left,
54 proto::AnchorType anchor_right) 125 proto::AnchorType anchor_right)
55 : type(proto::URL_PATTERN_TYPE_WILDCARDED), 126 : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
56 url_pattern(url_pattern), 127 url_pattern_(url_pattern),
57 anchor_left(anchor_left), 128 anchor_left_(anchor_left),
58 anchor_right(anchor_right) {} 129 anchor_right_(anchor_right) {}
59 130
60 UrlPattern::UrlPattern(const flat::UrlRule& rule) 131 UrlPattern::UrlPattern(const flat::UrlRule& rule)
61 : type(ConvertUrlPatternType(rule.url_pattern_type())), 132 : type_(ConvertUrlPatternType(rule.url_pattern_type())),
62 url_pattern(ConvertString(rule.url_pattern())), 133 url_pattern_(ConvertString(rule.url_pattern())),
63 anchor_left(ConvertAnchorType(rule.anchor_left())), 134 anchor_left_(ConvertAnchorType(rule.anchor_left())),
64 anchor_right(ConvertAnchorType(rule.anchor_right())), 135 anchor_right_(ConvertAnchorType(rule.anchor_right())),
65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} 136 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
66 137
67 UrlPattern::UrlPattern(const proto::UrlRule& rule) 138 UrlPattern::UrlPattern(const proto::UrlRule& rule)
68 : type(rule.url_pattern_type()), 139 : type_(rule.url_pattern_type()),
69 url_pattern(rule.url_pattern()), 140 url_pattern_(rule.url_pattern()),
70 anchor_left(rule.anchor_left()), 141 anchor_left_(rule.anchor_left()),
71 anchor_right(rule.anchor_right()), 142 anchor_right_(rule.anchor_right()),
72 match_case(rule.match_case()) {} 143 match_case_(rule.match_case()) {}
73 144
74 UrlPattern::~UrlPattern() = default; 145 UrlPattern::~UrlPattern() = default;
75 146
147 bool UrlPattern::MatchesUrl(const GURL& url) const {
148 // Note: Empty domains are also invalid.
149 DCHECK(url.is_valid());
150 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
151 proto::URL_PATTERN_TYPE_WILDCARDED);
152
153 StringSplitter<IsWildcard> subpatterns(url_pattern_);
154 auto subpattern_it = subpatterns.begin();
155 auto subpattern_end = subpatterns.end();
156
157 if (subpattern_it == subpattern_end) {
158 return anchor_left_ == proto::ANCHOR_TYPE_NONE ||
159 anchor_right_ == proto::ANCHOR_TYPE_NONE;
160 }
161
162 const base::StringPiece spec = url.possibly_invalid_spec();
163 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
164 DCHECK(!spec.empty());
165
166 base::StringPiece subpattern = *subpattern_it;
167 ++subpattern_it;
168
169 // If there is only one |subpattern|, and it has a right anchor, then simply
170 // check that it is a suffix of the |spec|, and the left anchor is fulfilled.
171 if (subpattern_it == subpattern_end &&
172 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
173 if (!EndsWithFuzzy(spec, subpattern))
174 return false;
175 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)
176 return spec.size() == subpattern.size();
177 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
178 DCHECK_LE(subpattern.size(), spec.size());
179 return url.has_host() &&
180 IsSubdomainAnchored(spec, host_part,
181 spec.size() - subpattern.size());
182 }
183 return true;
184 }
185
186 // Otherwise, the first |subpattern| does not have to be a suffix. But it
187 // still can have a left anchor. Check and handle that.
188 base::StringPiece text = spec;
189 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {
190 if (!StartsWithFuzzy(spec, subpattern))
191 return false;
192 if (subpattern_it == subpattern_end) {
193 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
194 return true;
195 }
196 text.remove_prefix(subpattern.size());
197 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
198 if (!url.has_host())
199 return false;
200 const size_t match_begin =
201 FindSubdomainAnchoredSubpattern(spec, host_part, subpattern);
202 if (match_begin == base::StringPiece::npos)
203 return false;
204 if (subpattern_it == subpattern_end) {
205 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE);
206 return true;
207 }
208 text.remove_prefix(match_begin + subpattern.size());
209 } else {
210 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);
211 // Get back to the initial |subpattern|, process it in the loop below.
212 subpattern_it = subpatterns.begin();
213 }
214
215 // Consecutively find all the remaining subpatterns in the |text|. If the
216 // pattern has a right anchor, don't search for the last subpattern, but
217 // instead check that it is a suffix of the |text|.
218 while (subpattern_it != subpattern_end) {
219 subpattern = *subpattern_it;
220 DCHECK(!subpattern.empty());
221
222 if (++subpattern_it == subpattern_end &&
223 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
224 break;
225 }
226
227 const size_t match_position = FindSubpattern(text, subpattern);
228 if (match_position == base::StringPiece::npos)
229 return false;
230 text.remove_prefix(match_position + subpattern.size());
231 }
232
233 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||
234 EndsWithFuzzy(text, subpattern);
235 }
236
237 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
238 // Note: Each fall-through in this switch is intentional.
239 switch (pattern.anchor_left()) {
240 case proto::ANCHOR_TYPE_SUBDOMAIN:
241 out << '|';
242 case proto::ANCHOR_TYPE_BOUNDARY:
243 out << '|';
244 default:
245 break;
246 }
247 out << pattern.url_pattern();
248 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
249 out << '|';
250 if (pattern.match_case())
251 out << "$match-case";
252
253 return out;
254 }
255
76 } // namespace subresource_filter 256 } // namespace subresource_filter
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698