OLD | NEW |
| (Empty) |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
2 // Use of this source code is governed by a BSD-style license that can be | |
3 // found in the LICENSE file. | |
4 | |
5 // The matching logic distinguishes between the terms URL pattern and | |
6 // subpattern. A URL pattern usually stands for the full thing, e.g. | |
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring | |
8 // of a pattern not containing the wildcard '*' character. For the example above | |
9 // the subpatterns are: "example.com^", "path" and "par=val^". | |
10 // | |
11 // The separator placeholder '^' symbol is used in subpatterns to match any | |
12 // separator character, which is any ASCII symbol except letters, digits, and | |
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder | |
14 // character '^' is itself a separator, as well as '\0'. | |
15 | |
16 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
17 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
18 | |
19 #include <stddef.h> | |
20 | |
21 #include <algorithm> | |
22 #include <iterator> | |
23 #include <vector> | |
24 | |
25 #include "base/logging.h" | |
26 #include "base/strings/string_piece.h" | |
27 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" | |
28 #include "components/subresource_filter/core/common/knuth_morris_pratt.h" | |
29 #include "components/subresource_filter/core/common/string_splitter.h" | |
30 #include "components/subresource_filter/core/common/url_pattern.h" | |
31 #include "url/gurl.h" | |
32 #include "url/third_party/mozilla/url_parse.h" | |
33 | |
34 namespace subresource_filter { | |
35 | |
36 // Public interface declaration ------------------------------------------------ | |
37 | |
38 // Builds a compound Knuth-Morris-Pratt failure function used to match URLs | |
39 // against the |pattern|. | |
40 // | |
41 // The |pattern| is split on the '*' wildcard symbol and then a failure function | |
42 // is built for each subpattern by BuildFailureFunctionFuzzy or | |
43 // BuildFailureFunction (depending on whether the subpattern contains separator | |
44 // placeholders), and appended to the returned vector. Some of the subpatterns | |
45 // can be exempted from being indexed. E.g., if the |pattern| has a BOUNDARY | |
46 // left anchor, the first subpattern can be matched by checking if it's a prefix | |
47 // of a URL. | |
48 // | |
49 // Each subpattern indexed with BuildFailureFunctionFuzzy is prepended with a | |
50 // value 1 (to distinguish them from the subpatterns indexed with | |
51 // BuildFailureFunction, their failure functions always start with 0). | |
52 // | |
53 // The URL |pattern| must be normalized. Namely, it must not have the following | |
54 // substrings: **, *<END>, <BEGIN>*. If the the |pattern| has a BOUNDARY anchor, | |
55 // the corresponding side of its string must not end with a '*' wildcard. | |
56 template <typename IntType> | |
57 void BuildFailureFunction(const UrlPattern& pattern, | |
58 std::vector<IntType>* failure); | |
59 | |
60 // Returns whether the |url| matches the URL |pattern|. The |failure| function | |
61 // between |failure_begin| and |failure_end| must be the output of | |
62 // BuildFailureFunction() called with the same |pattern|. | |
63 // | |
64 // TODO(pkalinnikov): Outline algorithms implemented in this function. | |
65 template <typename FailureIter> | |
66 bool IsUrlPatternMatch(const GURL& url, | |
67 const UrlPattern& pattern, | |
68 FailureIter failure_begin, | |
69 FailureIter failure_end); | |
70 | |
71 // Implementation -------------------------------------------------------------- | |
72 | |
73 namespace impl { | |
74 | |
75 class IsWildcard { | |
76 public: | |
77 bool operator()(char c) const { return c == '*'; } | |
78 }; | |
79 | |
80 // Returns whether |position| within the |url| belongs to its |host| component | |
81 // and corresponds to the beginning of a (sub-)domain. | |
82 inline bool IsSubdomainAnchored(base::StringPiece url, | |
83 url::Component host, | |
84 size_t position) { | |
85 DCHECK_LE(position, url.size()); | |
86 const size_t host_begin = static_cast<size_t>(host.begin); | |
87 const size_t host_end = static_cast<size_t>(host.end()); | |
88 DCHECK_LE(host_end, url.size()); | |
89 | |
90 return position == host_begin || | |
91 (position > host_begin && position <= host_end && | |
92 url[position - 1] == '.'); | |
93 } | |
94 | |
95 // Returns the position just beyond the leftmost fuzzy occurrence of | |
96 // |subpattern| in the |text|. | |
97 template <typename IntType> | |
98 inline size_t FindFirstOccurrenceFuzzy(base::StringPiece text, | |
99 base::StringPiece subpattern, | |
100 const IntType* failure) { | |
101 return *AllOccurrencesFuzzy<IntType>(text, subpattern, failure).begin(); | |
102 } | |
103 | |
104 // Returns the position just beyond the leftmost occurrence of |subpattern| in | |
105 // the |url|, such that it satisfies a SUBDOMAIN anchor. | |
106 template <typename IntType> | |
107 inline size_t FindSubdomainAnchored(base::StringPiece url, | |
108 url::Component host, | |
109 base::StringPiece subpattern, | |
110 const IntType* failure) { | |
111 auto occurrences = AllOccurrences<IntType>(url, subpattern, failure); | |
112 return *std::find_if(occurrences.begin(), occurrences.end(), | |
113 [url, host, subpattern](size_t match_end_position) { | |
114 DCHECK_GE(match_end_position, subpattern.size()); | |
115 return IsSubdomainAnchored( | |
116 url, host, match_end_position - subpattern.size()); | |
117 }); | |
118 } | |
119 | |
120 // Returns the position just beyond the leftmost fuzzy occurrence of | |
121 // |subpattern| in the |url|, such that it satisfies a SUBDOMAIN anchor. | |
122 template <typename IntType> | |
123 inline size_t FindSubdomainAnchoredFuzzy(base::StringPiece url, | |
124 url::Component host, | |
125 base::StringPiece subpattern, | |
126 const IntType* failure) { | |
127 auto occurrences = AllOccurrencesFuzzy<IntType>(url, subpattern, failure); | |
128 return *std::find_if(occurrences.begin(), occurrences.end(), | |
129 [url, host, subpattern](size_t match_end_position) { | |
130 DCHECK_GE(match_end_position, subpattern.size()); | |
131 return IsSubdomainAnchored( | |
132 url, host, match_end_position - subpattern.size()); | |
133 }); | |
134 } | |
135 | |
136 } // namespace impl | |
137 | |
138 template <typename IntType> | |
139 void BuildFailureFunction(const UrlPattern& pattern, | |
140 std::vector<IntType>* failure) { | |
141 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); | |
142 auto subpattern_it = subpatterns.begin(); | |
143 auto subpattern_end = subpatterns.end(); | |
144 | |
145 if (subpattern_it == subpattern_end) | |
146 return; | |
147 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) | |
148 ++subpattern_it; | |
149 | |
150 while (subpattern_it != subpattern_end) { | |
151 const base::StringPiece part = *subpattern_it++; | |
152 DCHECK(!part.empty()); | |
153 if (pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY && | |
154 subpattern_it == subpattern_end) { | |
155 break; | |
156 } | |
157 | |
158 if (part.find(kSeparatorPlaceholder) == base::StringPiece::npos) { | |
159 BuildFailureFunction(part, failure); | |
160 } else { | |
161 // Prepend the value '1' to the failure function to let matchers | |
162 // distinguish between the subpatterns with separator placeholders and | |
163 // those without. | |
164 failure->push_back(1); | |
165 BuildFailureFunctionFuzzy(part, failure); | |
166 } | |
167 } | |
168 } | |
169 | |
170 template <typename FailureIter> | |
171 bool IsUrlPatternMatch(const GURL& url, | |
172 const UrlPattern& pattern, | |
173 FailureIter failure_begin, | |
174 FailureIter failure_end) { | |
175 DCHECK(url.is_valid()); | |
176 | |
177 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); | |
178 auto subpattern_it = subpatterns.begin(); | |
179 auto subpattern_end = subpatterns.end(); | |
180 | |
181 if (subpattern_it == subpattern_end) { | |
182 return pattern.anchor_left != proto::ANCHOR_TYPE_BOUNDARY || | |
183 pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || | |
184 url.is_empty(); | |
185 } | |
186 | |
187 const base::StringPiece spec = url.possibly_invalid_spec(); | |
188 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; | |
189 | |
190 base::StringPiece subpattern = *subpattern_it++; | |
191 if (subpattern_it == subpattern_end && | |
192 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { | |
193 if (!EndsWithFuzzy(spec, subpattern)) | |
194 return false; | |
195 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) | |
196 return spec.size() == subpattern.size(); | |
197 if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
198 DCHECK_LE(subpattern.size(), spec.size()); | |
199 return url.has_host() && | |
200 impl::IsSubdomainAnchored(spec, host_part, | |
201 spec.size() - subpattern.size()); | |
202 } | |
203 return true; | |
204 } | |
205 | |
206 base::StringPiece text = spec; | |
207 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) { | |
208 if (!StartsWithFuzzy(spec, subpattern)) | |
209 return false; | |
210 if (subpattern_it == subpattern_end) | |
211 return true; | |
212 text.remove_prefix(subpattern.size()); | |
213 } else if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
214 if (!url.has_host()) | |
215 return false; | |
216 | |
217 const bool has_separator_placeholders = (*failure_begin != 0); | |
218 if (has_separator_placeholders) | |
219 ++failure_begin; | |
220 | |
221 const size_t position = | |
222 has_separator_placeholders | |
223 ? impl::FindSubdomainAnchoredFuzzy(spec, host_part, subpattern, | |
224 &*failure_begin) | |
225 : impl::FindSubdomainAnchored(spec, host_part, subpattern, | |
226 &*failure_begin); | |
227 if (position == base::StringPiece::npos) | |
228 return false; | |
229 if (subpattern_it == subpattern_end) | |
230 return true; | |
231 text.remove_prefix(position); | |
232 } else { | |
233 DCHECK_EQ(pattern.anchor_left, proto::ANCHOR_TYPE_NONE); | |
234 // Get back to the initial subpattern, process it in the loop below. | |
235 subpattern_it = subpatterns.begin(); | |
236 } | |
237 | |
238 while (subpattern_it != subpattern_end) { | |
239 subpattern = *subpattern_it++; | |
240 DCHECK(!subpattern.empty()); | |
241 | |
242 if (subpattern_it == subpattern_end && | |
243 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { | |
244 break; | |
245 } | |
246 | |
247 // The subpatterns with separator placeholders were indexed differently and | |
248 // should be matched accordingly. Their failure functions were prepended by | |
249 // a non-zero value, and we need to skip it. If the value turned to be zero, | |
250 // it is the initial value of a failure function of a regular substring. | |
251 CHECK(failure_begin < failure_end); | |
252 const bool has_separator_placeholders = (*failure_begin != 0); | |
253 if (has_separator_placeholders) | |
254 ++failure_begin; | |
255 CHECK(static_cast<size_t>(std::distance(failure_begin, failure_end)) >= | |
256 subpattern.size()); | |
257 | |
258 // If the subpattern has separator placeholders, it should be matched with | |
259 // FindFirstOccurrenceOfSubpattern, otherwise it can be found as a regular | |
260 // substring. | |
261 const size_t match_end = | |
262 (has_separator_placeholders | |
263 ? impl::FindFirstOccurrenceFuzzy(text, subpattern, &*failure_begin) | |
264 : FindFirstOccurrence(text, subpattern, &*failure_begin)); | |
265 if (match_end == base::StringPiece::npos) | |
266 return false; | |
267 text.remove_prefix(match_end); | |
268 failure_begin += subpattern.size(); | |
269 } | |
270 | |
271 return pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || | |
272 EndsWithFuzzy(text, subpattern); | |
273 } | |
274 | |
275 } // namespace subresource_filter | |
276 | |
277 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
OLD | NEW |