OLD | NEW |
---|---|
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // The matching logic distinguishes between the terms URL pattern and | |
6 // subpattern. A URL pattern usually stands for the full thing, e.g. | |
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring | |
8 // of a pattern not containing the wildcard '*' character. For the example above | |
9 // the subpatterns are: "example.com^", "path" and "par=val^". | |
10 // | |
11 // The separator placeholder '^' symbol is used in subpatterns to match any | |
12 // separator character, which is any ASCII symbol except letters, digits, and | |
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder | |
14 // character '^' is itself a separator, as well as '\0'. | |
15 | |
5 #include "components/subresource_filter/core/common/url_pattern.h" | 16 #include "components/subresource_filter/core/common/url_pattern.h" |
6 | 17 |
18 #include <stddef.h> | |
19 | |
20 #include <algorithm> | |
21 #include <ostream> | |
22 | |
23 #include "base/logging.h" | |
7 #include "components/subresource_filter/core/common/flat/rules_generated.h" | 24 #include "components/subresource_filter/core/common/flat/rules_generated.h" |
25 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" | |
26 #include "components/subresource_filter/core/common/string_splitter.h" | |
27 #include "url/gurl.h" | |
28 #include "url/third_party/mozilla/url_parse.h" | |
8 | 29 |
9 namespace subresource_filter { | 30 namespace subresource_filter { |
10 | 31 |
11 namespace { | 32 namespace { |
12 | 33 |
34 class IsWildcard { | |
35 public: | |
36 bool operator()(char c) const { return c == '*'; } | |
37 }; | |
38 | |
13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { | 39 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { |
14 switch (type) { | 40 switch (type) { |
15 case flat::UrlPatternType_SUBSTRING: | 41 case flat::UrlPatternType_SUBSTRING: |
16 return proto::URL_PATTERN_TYPE_SUBSTRING; | 42 return proto::URL_PATTERN_TYPE_SUBSTRING; |
17 case flat::UrlPatternType_WILDCARDED: | 43 case flat::UrlPatternType_WILDCARDED: |
18 return proto::URL_PATTERN_TYPE_WILDCARDED; | 44 return proto::URL_PATTERN_TYPE_WILDCARDED; |
19 case flat::UrlPatternType_REGEXP: | 45 case flat::UrlPatternType_REGEXP: |
20 return proto::URL_PATTERN_TYPE_REGEXP; | 46 return proto::URL_PATTERN_TYPE_REGEXP; |
21 default: | 47 default: |
22 return proto::URL_PATTERN_TYPE_UNSPECIFIED; | 48 return proto::URL_PATTERN_TYPE_UNSPECIFIED; |
(...skipping 11 matching lines...) Expand all Loading... | |
34 default: | 60 default: |
35 return proto::ANCHOR_TYPE_UNSPECIFIED; | 61 return proto::ANCHOR_TYPE_UNSPECIFIED; |
36 } | 62 } |
37 } | 63 } |
38 | 64 |
39 base::StringPiece ConvertString(const flatbuffers::String* string) { | 65 base::StringPiece ConvertString(const flatbuffers::String* string) { |
40 return string ? base::StringPiece(string->data(), string->size()) | 66 return string ? base::StringPiece(string->data(), string->size()) |
41 : base::StringPiece(); | 67 : base::StringPiece(); |
42 } | 68 } |
43 | 69 |
70 // Returns whether |position| within the |url| belongs to its |host| component | |
71 // and corresponds to the beginning of a (sub-)domain. | |
72 inline bool IsSubdomainAnchored(base::StringPiece url, | |
73 url::Component host, | |
74 size_t position) { | |
75 DCHECK_LE(position, url.size()); | |
76 const size_t host_begin = static_cast<size_t>(host.begin); | |
77 const size_t host_end = static_cast<size_t>(host.end()); | |
78 DCHECK_LE(host_end, url.size()); | |
79 | |
80 return position == host_begin || | |
81 (position > host_begin && position <= host_end && | |
82 url[position - 1] == '.'); | |
83 } | |
84 | |
85 class Subpattern { | |
Charlie Harrison
2017/04/04 13:58:26
It looks like this class is only used for generati
pkalinnikov
2017/04/04 16:42:59
Done.
| |
86 public: | |
87 Subpattern(base::StringPiece subpattern) | |
88 : subpattern_(subpattern), | |
89 has_separator_placeholders_(subpattern.find(kSeparatorPlaceholder) != | |
90 base::StringPiece::npos) {} | |
91 | |
92 // Returns the position of the leftmost occurrence of |subpattern| in the | |
93 // |text|, starting |from| a certain position. If the |subpattern| has | |
94 // separator placeholders, searches for a fuzzy occurrence. | |
95 size_t FindIn(base::StringPiece text, size_t from = 0) const { | |
96 if (!has_separator_placeholders_) | |
97 return text.find(subpattern_, from); | |
98 | |
99 auto* found = std::search( | |
100 text.begin() + from, text.end(), subpattern_.begin(), subpattern_.end(), | |
101 [](char text_char, char subpattern_char) { | |
102 return text_char == subpattern_char || | |
103 (subpattern_char == kSeparatorPlaceholder && | |
104 IsSeparator(text_char)); | |
105 }); | |
106 return found == text.end() ? base::StringPiece::npos : found - text.begin(); | |
107 } | |
108 | |
109 // Same as FindIn(url, 0), but searches for an occurrence that starts in the | |
110 // beginning of a (sub-)domain within the url's |host| component. | |
111 size_t FindSubdomainAnchoredIn(base::StringPiece url, | |
112 url::Component host) const { | |
113 for (size_t position = FindIn(url); position != base::StringPiece::npos; | |
114 position = FindIn(url, ++position)) { | |
115 if (IsSubdomainAnchored(url, host, position)) | |
116 return position; | |
117 } | |
118 return base::StringPiece::npos; | |
119 } | |
120 | |
121 private: | |
122 const base::StringPiece subpattern_; | |
123 const bool has_separator_placeholders_; | |
124 }; | |
Charlie Harrison
2017/04/04 13:58:26
does it need to be copyable?
pkalinnikov
2017/04/04 16:42:59
Not relevant anymore.
| |
125 | |
44 } // namespace | 126 } // namespace |
45 | 127 |
46 UrlPattern::UrlPattern() = default; | 128 UrlPattern::UrlPattern() = default; |
47 | 129 |
48 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 130 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
49 proto::UrlPatternType type) | 131 proto::UrlPatternType type) |
50 : type(type), url_pattern(url_pattern) {} | 132 : type_(type), url_pattern_(url_pattern) {} |
51 | 133 |
52 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 134 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
53 proto::AnchorType anchor_left, | 135 proto::AnchorType anchor_left, |
54 proto::AnchorType anchor_right) | 136 proto::AnchorType anchor_right) |
55 : type(proto::URL_PATTERN_TYPE_WILDCARDED), | 137 : type_(proto::URL_PATTERN_TYPE_WILDCARDED), |
56 url_pattern(url_pattern), | 138 url_pattern_(url_pattern), |
57 anchor_left(anchor_left), | 139 anchor_left_(anchor_left), |
58 anchor_right(anchor_right) {} | 140 anchor_right_(anchor_right) {} |
59 | 141 |
60 UrlPattern::UrlPattern(const flat::UrlRule& rule) | 142 UrlPattern::UrlPattern(const flat::UrlRule& rule) |
61 : type(ConvertUrlPatternType(rule.url_pattern_type())), | 143 : type_(ConvertUrlPatternType(rule.url_pattern_type())), |
62 url_pattern(ConvertString(rule.url_pattern())), | 144 url_pattern_(ConvertString(rule.url_pattern())), |
63 anchor_left(ConvertAnchorType(rule.anchor_left())), | 145 anchor_left_(ConvertAnchorType(rule.anchor_left())), |
64 anchor_right(ConvertAnchorType(rule.anchor_right())), | 146 anchor_right_(ConvertAnchorType(rule.anchor_right())), |
65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} | 147 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
66 | 148 |
67 UrlPattern::UrlPattern(const proto::UrlRule& rule) | 149 UrlPattern::UrlPattern(const proto::UrlRule& rule) |
68 : type(rule.url_pattern_type()), | 150 : type_(rule.url_pattern_type()), |
69 url_pattern(rule.url_pattern()), | 151 url_pattern_(rule.url_pattern()), |
70 anchor_left(rule.anchor_left()), | 152 anchor_left_(rule.anchor_left()), |
71 anchor_right(rule.anchor_right()), | 153 anchor_right_(rule.anchor_right()), |
72 match_case(rule.match_case()) {} | 154 match_case_(rule.match_case()) {} |
73 | 155 |
74 UrlPattern::~UrlPattern() = default; | 156 UrlPattern::~UrlPattern() = default; |
75 | 157 |
158 bool UrlPattern::MatchesUrl(const GURL& url) const { | |
159 DCHECK(url.is_valid()); | |
160 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING || | |
161 proto::URL_PATTERN_TYPE_WILDCARDED); | |
162 | |
163 StringSplitter<IsWildcard> subpatterns(url_pattern_); | |
164 auto subpattern_it = subpatterns.begin(); | |
165 auto subpattern_end = subpatterns.end(); | |
166 | |
167 if (subpattern_it == subpattern_end) { | |
168 return anchor_left_ == proto::ANCHOR_TYPE_NONE || | |
169 anchor_right_ == proto::ANCHOR_TYPE_NONE; | |
170 } | |
171 | |
172 const base::StringPiece spec = url.possibly_invalid_spec(); | |
173 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; | |
174 DCHECK(!spec.empty()); | |
175 | |
176 base::StringPiece subpattern = *subpattern_it++; | |
Charlie Harrison
2017/04/04 13:58:26
Can you include comments above these code paragrap
Charlie Harrison
2017/04/04 13:58:26
nit: Can you break this into two lines for clarity
pkalinnikov
2017/04/04 16:42:59
Done.
pkalinnikov
2017/04/04 16:42:59
How about this?
Charlie Harrison
2017/04/04 16:49:39
looks good.
| |
177 if (subpattern_it == subpattern_end && | |
178 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
179 if (!EndsWithFuzzy(spec, subpattern)) | |
180 return false; | |
181 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) | |
182 return spec.size() == subpattern.size(); | |
183 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
184 DCHECK_LE(subpattern.size(), spec.size()); | |
185 return url.has_host() && | |
186 IsSubdomainAnchored(spec, host_part, | |
187 spec.size() - subpattern.size()); | |
188 } | |
189 return true; | |
190 } | |
191 | |
192 base::StringPiece text = spec; | |
193 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
194 if (!StartsWithFuzzy(spec, subpattern)) | |
195 return false; | |
196 if (subpattern_it == subpattern_end) | |
197 return true; | |
198 text.remove_prefix(subpattern.size()); | |
199 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
200 if (!url.has_host()) | |
201 return false; | |
202 const size_t match_begin = | |
203 Subpattern(subpattern).FindSubdomainAnchoredIn(spec, host_part); | |
204 if (match_begin == base::StringPiece::npos) | |
205 return false; | |
206 if (subpattern_it == subpattern_end) | |
207 return true; | |
208 text.remove_prefix(match_begin + subpattern.size()); | |
209 } else { | |
210 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE); | |
211 // Get back to the initial subpattern, process it in the loop below. | |
212 subpattern_it = subpatterns.begin(); | |
213 } | |
214 | |
215 while (subpattern_it != subpattern_end) { | |
216 subpattern = *subpattern_it++; | |
217 DCHECK(!subpattern.empty()); | |
218 | |
219 if (subpattern_it == subpattern_end && | |
220 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
221 break; | |
222 } | |
223 | |
224 const size_t match_position = Subpattern(subpattern).FindIn(text); | |
225 if (match_position == base::StringPiece::npos) | |
226 return false; | |
227 text.remove_prefix(match_position + subpattern.size()); | |
228 } | |
229 | |
230 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY || | |
231 EndsWithFuzzy(text, subpattern); | |
232 } | |
233 | |
234 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) { | |
235 // Note: Each fall-through in this switch is intentional. | |
236 switch (pattern.anchor_left()) { | |
237 case proto::ANCHOR_TYPE_SUBDOMAIN: | |
238 out << '|'; | |
239 case proto::ANCHOR_TYPE_BOUNDARY: | |
240 out << '|'; | |
241 default: | |
242 break; | |
243 } | |
244 out << pattern.url_pattern(); | |
245 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY) | |
246 out << '|'; | |
247 if (pattern.match_case()) | |
248 out << "$match-case"; | |
249 | |
250 return out; | |
251 } | |
252 | |
76 } // namespace subresource_filter | 253 } // namespace subresource_filter |
OLD | NEW |