Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(644)

Side by Side Diff: components/subresource_filter/core/common/url_pattern_matching.h

Issue 2793993002: [subresource_filter] Replace KMP by std::search. (Closed)
Patch Set: Fix DCHECK. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
(Empty)
1 // Copyright 2016 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // The matching logic distinguishes between the terms URL pattern and
6 // subpattern. A URL pattern usually stands for the full thing, e.g.
7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring
8 // of a pattern not containing the wildcard '*' character. For the example above
9 // the subpatterns are: "example.com^", "path" and "par=val^".
10 //
11 // The separator placeholder '^' symbol is used in subpatterns to match any
12 // separator character, which is any ASCII symbol except letters, digits, and
13 // the following: '_', '-', '.', '%'. Note that the separator placeholder
14 // character '^' is itself a separator, as well as '\0'.
15
16 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_
17 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_
18
19 #include <stddef.h>
20
21 #include <algorithm>
22 #include <iterator>
23 #include <vector>
24
25 #include "base/logging.h"
26 #include "base/strings/string_piece.h"
27 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h"
28 #include "components/subresource_filter/core/common/knuth_morris_pratt.h"
29 #include "components/subresource_filter/core/common/string_splitter.h"
30 #include "components/subresource_filter/core/common/url_pattern.h"
31 #include "url/gurl.h"
32 #include "url/third_party/mozilla/url_parse.h"
33
34 namespace subresource_filter {
35
36 // Public interface declaration ------------------------------------------------
37
38 // Builds a compound Knuth-Morris-Pratt failure function used to match URLs
39 // against the |pattern|.
40 //
41 // The |pattern| is split on the '*' wildcard symbol and then a failure function
42 // is built for each subpattern by BuildFailureFunctionFuzzy or
43 // BuildFailureFunction (depending on whether the subpattern contains separator
44 // placeholders), and appended to the returned vector. Some of the subpatterns
45 // can be exempted from being indexed. E.g., if the |pattern| has a BOUNDARY
46 // left anchor, the first subpattern can be matched by checking if it's a prefix
47 // of a URL.
48 //
49 // Each subpattern indexed with BuildFailureFunctionFuzzy is prepended with a
50 // value 1 (to distinguish them from the subpatterns indexed with
51 // BuildFailureFunction, their failure functions always start with 0).
52 //
53 // The URL |pattern| must be normalized. Namely, it must not have the following
54 // substrings: **, *<END>, <BEGIN>*. If the the |pattern| has a BOUNDARY anchor,
55 // the corresponding side of its string must not end with a '*' wildcard.
56 template <typename IntType>
57 void BuildFailureFunction(const UrlPattern& pattern,
58 std::vector<IntType>* failure);
59
60 // Returns whether the |url| matches the URL |pattern|. The |failure| function
61 // between |failure_begin| and |failure_end| must be the output of
62 // BuildFailureFunction() called with the same |pattern|.
63 //
64 // TODO(pkalinnikov): Outline algorithms implemented in this function.
65 template <typename FailureIter>
66 bool IsUrlPatternMatch(const GURL& url,
67 const UrlPattern& pattern,
68 FailureIter failure_begin,
69 FailureIter failure_end);
70
71 // Implementation --------------------------------------------------------------
72
73 namespace impl {
74
75 class IsWildcard {
76 public:
77 bool operator()(char c) const { return c == '*'; }
78 };
79
80 // Returns whether |position| within the |url| belongs to its |host| component
81 // and corresponds to the beginning of a (sub-)domain.
82 inline bool IsSubdomainAnchored(base::StringPiece url,
83 url::Component host,
84 size_t position) {
85 DCHECK_LE(position, url.size());
86 const size_t host_begin = static_cast<size_t>(host.begin);
87 const size_t host_end = static_cast<size_t>(host.end());
88 DCHECK_LE(host_end, url.size());
89
90 return position == host_begin ||
91 (position > host_begin && position <= host_end &&
92 url[position - 1] == '.');
93 }
94
95 // Returns the position just beyond the leftmost fuzzy occurrence of
96 // |subpattern| in the |text|.
97 template <typename IntType>
98 inline size_t FindFirstOccurrenceFuzzy(base::StringPiece text,
99 base::StringPiece subpattern,
100 const IntType* failure) {
101 return *AllOccurrencesFuzzy<IntType>(text, subpattern, failure).begin();
102 }
103
104 // Returns the position just beyond the leftmost occurrence of |subpattern| in
105 // the |url|, such that it satisfies a SUBDOMAIN anchor.
106 template <typename IntType>
107 inline size_t FindSubdomainAnchored(base::StringPiece url,
108 url::Component host,
109 base::StringPiece subpattern,
110 const IntType* failure) {
111 auto occurrences = AllOccurrences<IntType>(url, subpattern, failure);
112 return *std::find_if(occurrences.begin(), occurrences.end(),
113 [url, host, subpattern](size_t match_end_position) {
114 DCHECK_GE(match_end_position, subpattern.size());
115 return IsSubdomainAnchored(
116 url, host, match_end_position - subpattern.size());
117 });
118 }
119
120 // Returns the position just beyond the leftmost fuzzy occurrence of
121 // |subpattern| in the |url|, such that it satisfies a SUBDOMAIN anchor.
122 template <typename IntType>
123 inline size_t FindSubdomainAnchoredFuzzy(base::StringPiece url,
124 url::Component host,
125 base::StringPiece subpattern,
126 const IntType* failure) {
127 auto occurrences = AllOccurrencesFuzzy<IntType>(url, subpattern, failure);
128 return *std::find_if(occurrences.begin(), occurrences.end(),
129 [url, host, subpattern](size_t match_end_position) {
130 DCHECK_GE(match_end_position, subpattern.size());
131 return IsSubdomainAnchored(
132 url, host, match_end_position - subpattern.size());
133 });
134 }
135
136 } // namespace impl
137
138 template <typename IntType>
139 void BuildFailureFunction(const UrlPattern& pattern,
140 std::vector<IntType>* failure) {
141 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern);
142 auto subpattern_it = subpatterns.begin();
143 auto subpattern_end = subpatterns.end();
144
145 if (subpattern_it == subpattern_end)
146 return;
147 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY)
148 ++subpattern_it;
149
150 while (subpattern_it != subpattern_end) {
151 const base::StringPiece part = *subpattern_it++;
152 DCHECK(!part.empty());
153 if (pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY &&
154 subpattern_it == subpattern_end) {
155 break;
156 }
157
158 if (part.find(kSeparatorPlaceholder) == base::StringPiece::npos) {
159 BuildFailureFunction(part, failure);
160 } else {
161 // Prepend the value '1' to the failure function to let matchers
162 // distinguish between the subpatterns with separator placeholders and
163 // those without.
164 failure->push_back(1);
165 BuildFailureFunctionFuzzy(part, failure);
166 }
167 }
168 }
169
170 template <typename FailureIter>
171 bool IsUrlPatternMatch(const GURL& url,
172 const UrlPattern& pattern,
173 FailureIter failure_begin,
174 FailureIter failure_end) {
175 DCHECK(url.is_valid());
176
177 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern);
178 auto subpattern_it = subpatterns.begin();
179 auto subpattern_end = subpatterns.end();
180
181 if (subpattern_it == subpattern_end) {
182 return pattern.anchor_left != proto::ANCHOR_TYPE_BOUNDARY ||
183 pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY ||
184 url.is_empty();
185 }
186
187 const base::StringPiece spec = url.possibly_invalid_spec();
188 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host;
189
190 base::StringPiece subpattern = *subpattern_it++;
191 if (subpattern_it == subpattern_end &&
192 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
193 if (!EndsWithFuzzy(spec, subpattern))
194 return false;
195 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY)
196 return spec.size() == subpattern.size();
197 if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
198 DCHECK_LE(subpattern.size(), spec.size());
199 return url.has_host() &&
200 impl::IsSubdomainAnchored(spec, host_part,
201 spec.size() - subpattern.size());
202 }
203 return true;
204 }
205
206 base::StringPiece text = spec;
207 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) {
208 if (!StartsWithFuzzy(spec, subpattern))
209 return false;
210 if (subpattern_it == subpattern_end)
211 return true;
212 text.remove_prefix(subpattern.size());
213 } else if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) {
214 if (!url.has_host())
215 return false;
216
217 const bool has_separator_placeholders = (*failure_begin != 0);
218 if (has_separator_placeholders)
219 ++failure_begin;
220
221 const size_t position =
222 has_separator_placeholders
223 ? impl::FindSubdomainAnchoredFuzzy(spec, host_part, subpattern,
224 &*failure_begin)
225 : impl::FindSubdomainAnchored(spec, host_part, subpattern,
226 &*failure_begin);
227 if (position == base::StringPiece::npos)
228 return false;
229 if (subpattern_it == subpattern_end)
230 return true;
231 text.remove_prefix(position);
232 } else {
233 DCHECK_EQ(pattern.anchor_left, proto::ANCHOR_TYPE_NONE);
234 // Get back to the initial subpattern, process it in the loop below.
235 subpattern_it = subpatterns.begin();
236 }
237
238 while (subpattern_it != subpattern_end) {
239 subpattern = *subpattern_it++;
240 DCHECK(!subpattern.empty());
241
242 if (subpattern_it == subpattern_end &&
243 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) {
244 break;
245 }
246
247 // The subpatterns with separator placeholders were indexed differently and
248 // should be matched accordingly. Their failure functions were prepended by
249 // a non-zero value, and we need to skip it. If the value turned to be zero,
250 // it is the initial value of a failure function of a regular substring.
251 CHECK(failure_begin < failure_end);
252 const bool has_separator_placeholders = (*failure_begin != 0);
253 if (has_separator_placeholders)
254 ++failure_begin;
255 CHECK(static_cast<size_t>(std::distance(failure_begin, failure_end)) >=
256 subpattern.size());
257
258 // If the subpattern has separator placeholders, it should be matched with
259 // FindFirstOccurrenceOfSubpattern, otherwise it can be found as a regular
260 // substring.
261 const size_t match_end =
262 (has_separator_placeholders
263 ? impl::FindFirstOccurrenceFuzzy(text, subpattern, &*failure_begin)
264 : FindFirstOccurrence(text, subpattern, &*failure_begin));
265 if (match_end == base::StringPiece::npos)
266 return false;
267 text.remove_prefix(match_end);
268 failure_begin += subpattern.size();
269 }
270
271 return pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY ||
272 EndsWithFuzzy(text, subpattern);
273 }
274
275 } // namespace subresource_filter
276
277 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698