Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(302)

Side by Side Diff: components/subresource_filter/core/common/url_pattern.cc

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Fix tests. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 The Chromium Authors. All rights reserved. 1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // The matching logic distinguishes between the terms URL pattern and
6 // subpattern. A URL pattern usually stands for the full thing, e.g.
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8 // of a pattern not containing the wildcard '*' character. For the example above
9 // the subpatterns are: "example.com^", "path" and "par=val^".
10 //
11 // The separator placeholder '^' symbol is used in subpatterns to match any
12 // separator character, which is any ASCII symbol except letters, digits, and
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder
14 // character '^' is itself a separator, as well as '\0'.
15
5 #include "components/subresource_filter/core/common/url_pattern.h" 16 #include "components/subresource_filter/core/common/url_pattern.h"
6 17
18 #include <stddef.h>
19
20 #include <algorithm>
21 #include <ostream>
22
23 #include "base/logging.h"
7 #include "components/subresource_filter/core/common/flat/rules_generated.h" 24 #include "components/subresource_filter/core/common/flat/rules_generated.h"
25 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"
26 #include "components/subresource_filter/core/common/string_splitter.h"
27 #include "url/gurl.h"
28 #include "url/third_party/mozilla/url_parse.h"
8 29
9 namespace subresource_filter { 30 namespace subresource_filter {
10 31
11 namespace { 32 namespace {
12 33
34 class IsWildcard {
35 public:
36 bool operator()(char c) const { return c == '*'; }
37 };
38
13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { 39 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) {
14 switch (type) { 40 switch (type) {
15 case flat::UrlPatternType_SUBSTRING: 41 case flat::UrlPatternType_SUBSTRING:
16 return proto::URL_PATTERN_TYPE_SUBSTRING; 42 return proto::URL_PATTERN_TYPE_SUBSTRING;
17 case flat::UrlPatternType_WILDCARDED: 43 case flat::UrlPatternType_WILDCARDED:
18 return proto::URL_PATTERN_TYPE_WILDCARDED; 44 return proto::URL_PATTERN_TYPE_WILDCARDED;
19 case flat::UrlPatternType_REGEXP: 45 case flat::UrlPatternType_REGEXP:
20 return proto::URL_PATTERN_TYPE_REGEXP; 46 return proto::URL_PATTERN_TYPE_REGEXP;
21 default: 47 default:
22 return proto::URL_PATTERN_TYPE_UNSPECIFIED; 48 return proto::URL_PATTERN_TYPE_UNSPECIFIED;
(...skipping 11 matching lines...) Expand all
34 default: 60 default:
35 return proto::ANCHOR_TYPE_UNSPECIFIED; 61 return proto::ANCHOR_TYPE_UNSPECIFIED;
36 } 62 }
37 } 63 }
38 64
39 base::StringPiece ConvertString(const flatbuffers::String* string) { 65 base::StringPiece ConvertString(const flatbuffers::String* string) {
40 return string ? base::StringPiece(string->data(), string->size()) 66 return string ? base::StringPiece(string->data(), string->size())
41 : base::StringPiece(); 67 : base::StringPiece();
42 } 68 }
43 69
70 // Returns whether |position| within the |url| belongs to its |host| component
71 // and corresponds to the beginning of a (sub-)domain.
72 inline bool IsSubdomainAnchored(base::StringPiece url,
73 url::Component host,
74 size_t position) {
75 DCHECK_LE(position, url.size());
76 const size_t host_begin = static_cast<size_t>(host.begin);
77 const size_t host_end = static_cast<size_t>(host.end());
78 DCHECK_LE(host_end, url.size());
79
80 return position == host_begin ||
81 (position > host_begin && position <= host_end &&
82 url[position - 1] == '.');
83 }
84
85 class Subpattern {
Charlie Harrison 2017/04/04 13:58:26 It looks like this class is only used for generati
pkalinnikov 2017/04/04 16:42:59 Done.
86 public:
87 Subpattern(base::StringPiece subpattern)
88 : subpattern_(subpattern),
89 has_separator_placeholders_(subpattern.find(kSeparatorPlaceholder) !=
90 base::StringPiece::npos) {}
91
92 // Returns the position of the leftmost occurrence of |subpattern| in the
93 // |text|, starting |from| a certain position. If the |subpattern| has
94 // separator placeholders, searches for a fuzzy occurrence.
95 size_t FindIn(base::StringPiece text, size_t from = 0) const {
96 if (!has_separator_placeholders_)
97 return text.find(subpattern_, from);
98
99 auto* found = std::search(
100 text.begin() + from, text.end(), subpattern_.begin(), subpattern_.end(),
101 [](char text_char, char subpattern_char) {
102 return text_char == subpattern_char ||
103 (subpattern_char == kSeparatorPlaceholder &&
104 IsSeparator(text_char));
105 });
106 return found == text.end() ? base::StringPiece::npos : found - text.begin();
107 }
108
109 // Same as FindIn(url, 0), but searches for an occurrence that starts in the
110 // beginning of a (sub-)domain within the url's |host| component.
111 size_t FindSubdomainAnchoredIn(base::StringPiece url,
112 url::Component host) const {
113 for (size_t position = FindIn(url); position != base::StringPiece::npos;
114 position = FindIn(url, ++position)) {
115 if (IsSubdomainAnchored(url, host, position))
116 return position;
117 }
118 return base::StringPiece::npos;
119 }
120
121 private:
122 const base::StringPiece subpattern_;
123 const bool has_separator_placeholders_;
124 };
Charlie Harrison 2017/04/04 13:58:26 does it need to be copyable?
pkalinnikov 2017/04/04 16:42:59 Not relevant anymore.
125
44 } // namespace 126 } // namespace
45 127
46 UrlPattern::UrlPattern() = default; 128 UrlPattern::UrlPattern() = default;
47 129
48 UrlPattern::UrlPattern(base::StringPiece url_pattern, 130 UrlPattern::UrlPattern(base::StringPiece url_pattern,
49 proto::UrlPatternType type) 131 proto::UrlPatternType type)
50 : type(type), url_pattern(url_pattern) {} 132 : type_(type), url_pattern_(url_pattern) {}
51 133
52 UrlPattern::UrlPattern(base::StringPiece url_pattern, 134 UrlPattern::UrlPattern(base::StringPiece url_pattern,
53 proto::AnchorType anchor_left, 135 proto::AnchorType anchor_left,
54 proto::AnchorType anchor_right) 136 proto::AnchorType anchor_right)
55 : type(proto::URL_PATTERN_TYPE_WILDCARDED), 137 : type_(proto::URL_PATTERN_TYPE_WILDCARDED),
56 url_pattern(url_pattern), 138 url_pattern_(url_pattern),
57 anchor_left(anchor_left), 139 anchor_left_(anchor_left),
58 anchor_right(anchor_right) {} 140 anchor_right_(anchor_right) {}
59 141
60 UrlPattern::UrlPattern(const flat::UrlRule& rule) 142 UrlPattern::UrlPattern(const flat::UrlRule& rule)
61 : type(ConvertUrlPatternType(rule.url_pattern_type())), 143 : type_(ConvertUrlPatternType(rule.url_pattern_type())),
62 url_pattern(ConvertString(rule.url_pattern())), 144 url_pattern_(ConvertString(rule.url_pattern())),
63 anchor_left(ConvertAnchorType(rule.anchor_left())), 145 anchor_left_(ConvertAnchorType(rule.anchor_left())),
64 anchor_right(ConvertAnchorType(rule.anchor_right())), 146 anchor_right_(ConvertAnchorType(rule.anchor_right())),
65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} 147 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {}
66 148
67 UrlPattern::UrlPattern(const proto::UrlRule& rule) 149 UrlPattern::UrlPattern(const proto::UrlRule& rule)
68 : type(rule.url_pattern_type()), 150 : type_(rule.url_pattern_type()),
69 url_pattern(rule.url_pattern()), 151 url_pattern_(rule.url_pattern()),
70 anchor_left(rule.anchor_left()), 152 anchor_left_(rule.anchor_left()),
71 anchor_right(rule.anchor_right()), 153 anchor_right_(rule.anchor_right()),
72 match_case(rule.match_case()) {} 154 match_case_(rule.match_case()) {}
73 155
74 UrlPattern::~UrlPattern() = default; 156 UrlPattern::~UrlPattern() = default;
75 157
158 bool UrlPattern::MatchesUrl(const GURL& url) const {
159 DCHECK(url.is_valid());
160 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING ||
161 proto::URL_PATTERN_TYPE_WILDCARDED);
162
163 StringSplitter<IsWildcard> subpatterns(url_pattern_);
164 auto subpattern_it = subpatterns.begin();
165 auto subpattern_end = subpatterns.end();
166
167 if (subpattern_it == subpattern_end) {
168 return anchor_left_ == proto::ANCHOR_TYPE_NONE ||
169 anchor_right_ == proto::ANCHOR_TYPE_NONE;
170 }
171
172 const base::StringPiece spec = url.possibly_invalid_spec();
173 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
174 DCHECK(!spec.empty());
175
176 base::StringPiece subpattern = *subpattern_it++;
Charlie Harrison 2017/04/04 13:58:26 Can you include comments above these code paragrap
Charlie Harrison 2017/04/04 13:58:26 nit: Can you break this into two lines for clarity
pkalinnikov 2017/04/04 16:42:59 Done.
pkalinnikov 2017/04/04 16:42:59 How about this?
Charlie Harrison 2017/04/04 16:49:39 looks good.
177 if (subpattern_it == subpattern_end &&
178 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
179 if (!EndsWithFuzzy(spec, subpattern))
180 return false;
181 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY)
182 return spec.size() == subpattern.size();
183 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
184 DCHECK_LE(subpattern.size(), spec.size());
185 return url.has_host() &&
186 IsSubdomainAnchored(spec, host_part,
187 spec.size() - subpattern.size());
188 }
189 return true;
190 }
191
192 base::StringPiece text = spec;
193 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) {
194 if (!StartsWithFuzzy(spec, subpattern))
195 return false;
196 if (subpattern_it == subpattern_end)
197 return true;
198 text.remove_prefix(subpattern.size());
199 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) {
200 if (!url.has_host())
201 return false;
202 const size_t match_begin =
203 Subpattern(subpattern).FindSubdomainAnchoredIn(spec, host_part);
204 if (match_begin == base::StringPiece::npos)
205 return false;
206 if (subpattern_it == subpattern_end)
207 return true;
208 text.remove_prefix(match_begin + subpattern.size());
209 } else {
210 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE);
211 // Get back to the initial subpattern, process it in the loop below.
212 subpattern_it = subpatterns.begin();
213 }
214
215 while (subpattern_it != subpattern_end) {
216 subpattern = *subpattern_it++;
217 DCHECK(!subpattern.empty());
218
219 if (subpattern_it == subpattern_end &&
220 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) {
221 break;
222 }
223
224 const size_t match_position = Subpattern(subpattern).FindIn(text);
225 if (match_position == base::StringPiece::npos)
226 return false;
227 text.remove_prefix(match_position + subpattern.size());
228 }
229
230 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY ||
231 EndsWithFuzzy(text, subpattern);
232 }
233
234 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) {
235 // Note: Each fall-through in this switch is intentional.
236 switch (pattern.anchor_left()) {
237 case proto::ANCHOR_TYPE_SUBDOMAIN:
238 out << '|';
239 case proto::ANCHOR_TYPE_BOUNDARY:
240 out << '|';
241 default:
242 break;
243 }
244 out << pattern.url_pattern();
245 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY)
246 out << '|';
247 if (pattern.match_case())
248 out << "$match-case";
249
250 return out;
251 }
252
76 } // namespace subresource_filter 253 } // namespace subresource_filter
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698