| OLD | NEW |
| (Empty) |
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | |
| 2 // Use of this source code is governed by a BSD-style license that can be | |
| 3 // found in the LICENSE file. | |
| 4 | |
| 5 // The matching logic distinguishes between the terms URL pattern and | |
| 6 // subpattern. A URL pattern usually stands for the full thing, e.g. | |
| 7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring | |
| 8 // of a pattern not containing the wildcard '*' character. For the example above | |
| 9 // the subpatterns are: "example.com^", "path" and "par=val^". | |
| 10 // | |
| 11 // The separator placeholder '^' symbol is used in subpatterns to match any | |
| 12 // separator character, which is any ASCII symbol except letters, digits, and | |
| 13 // the following: '_', '-', '.', '%'. Note that the separator placeholder | |
| 14 // character '^' is itself a separator, as well as '\0'. | |
| 15 | |
| 16 #ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
| 17 #define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
| 18 | |
| 19 #include <stddef.h> | |
| 20 | |
| 21 #include <algorithm> | |
| 22 #include <iterator> | |
| 23 #include <vector> | |
| 24 | |
| 25 #include "base/logging.h" | |
| 26 #include "base/strings/string_piece.h" | |
| 27 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" | |
| 28 #include "components/subresource_filter/core/common/knuth_morris_pratt.h" | |
| 29 #include "components/subresource_filter/core/common/string_splitter.h" | |
| 30 #include "components/subresource_filter/core/common/url_pattern.h" | |
| 31 #include "url/gurl.h" | |
| 32 #include "url/third_party/mozilla/url_parse.h" | |
| 33 | |
| 34 namespace subresource_filter { | |
| 35 | |
| 36 // Public interface declaration ------------------------------------------------ | |
| 37 | |
| 38 // Builds a compound Knuth-Morris-Pratt failure function used to match URLs | |
| 39 // against the |pattern|. | |
| 40 // | |
| 41 // The |pattern| is split on the '*' wildcard symbol and then a failure function | |
| 42 // is built for each subpattern by BuildFailureFunctionFuzzy or | |
| 43 // BuildFailureFunction (depending on whether the subpattern contains separator | |
| 44 // placeholders), and appended to the returned vector. Some of the subpatterns | |
| 45 // can be exempted from being indexed. E.g., if the |pattern| has a BOUNDARY | |
| 46 // left anchor, the first subpattern can be matched by checking if it's a prefix | |
| 47 // of a URL. | |
| 48 // | |
| 49 // Each subpattern indexed with BuildFailureFunctionFuzzy is prepended with a | |
| 50 // value 1 (to distinguish them from the subpatterns indexed with | |
| 51 // BuildFailureFunction, their failure functions always start with 0). | |
| 52 // | |
| 53 // The URL |pattern| must be normalized. Namely, it must not have the following | |
| 54 // substrings: **, *<END>, <BEGIN>*. If the the |pattern| has a BOUNDARY anchor, | |
| 55 // the corresponding side of its string must not end with a '*' wildcard. | |
| 56 template <typename IntType> | |
| 57 void BuildFailureFunction(const UrlPattern& pattern, | |
| 58 std::vector<IntType>* failure); | |
| 59 | |
| 60 // Returns whether the |url| matches the URL |pattern|. The |failure| function | |
| 61 // between |failure_begin| and |failure_end| must be the output of | |
| 62 // BuildFailureFunction() called with the same |pattern|. | |
| 63 // | |
| 64 // TODO(pkalinnikov): Outline algorithms implemented in this function. | |
| 65 template <typename FailureIter> | |
| 66 bool IsUrlPatternMatch(const GURL& url, | |
| 67 const UrlPattern& pattern, | |
| 68 FailureIter failure_begin, | |
| 69 FailureIter failure_end); | |
| 70 | |
| 71 // Implementation -------------------------------------------------------------- | |
| 72 | |
| 73 namespace impl { | |
| 74 | |
| 75 class IsWildcard { | |
| 76 public: | |
| 77 bool operator()(char c) const { return c == '*'; } | |
| 78 }; | |
| 79 | |
| 80 // Returns whether |position| within the |url| belongs to its |host| component | |
| 81 // and corresponds to the beginning of a (sub-)domain. | |
| 82 inline bool IsSubdomainAnchored(base::StringPiece url, | |
| 83 url::Component host, | |
| 84 size_t position) { | |
| 85 DCHECK_LE(position, url.size()); | |
| 86 const size_t host_begin = static_cast<size_t>(host.begin); | |
| 87 const size_t host_end = static_cast<size_t>(host.end()); | |
| 88 DCHECK_LE(host_end, url.size()); | |
| 89 | |
| 90 return position == host_begin || | |
| 91 (position > host_begin && position <= host_end && | |
| 92 url[position - 1] == '.'); | |
| 93 } | |
| 94 | |
| 95 // Returns the position just beyond the leftmost fuzzy occurrence of | |
| 96 // |subpattern| in the |text|. | |
| 97 template <typename IntType> | |
| 98 inline size_t FindFirstOccurrenceFuzzy(base::StringPiece text, | |
| 99 base::StringPiece subpattern, | |
| 100 const IntType* failure) { | |
| 101 return *AllOccurrencesFuzzy<IntType>(text, subpattern, failure).begin(); | |
| 102 } | |
| 103 | |
| 104 // Returns the position just beyond the leftmost occurrence of |subpattern| in | |
| 105 // the |url|, such that it satisfies a SUBDOMAIN anchor. | |
| 106 template <typename IntType> | |
| 107 inline size_t FindSubdomainAnchored(base::StringPiece url, | |
| 108 url::Component host, | |
| 109 base::StringPiece subpattern, | |
| 110 const IntType* failure) { | |
| 111 auto occurrences = AllOccurrences<IntType>(url, subpattern, failure); | |
| 112 return *std::find_if(occurrences.begin(), occurrences.end(), | |
| 113 [url, host, subpattern](size_t match_end_position) { | |
| 114 DCHECK_GE(match_end_position, subpattern.size()); | |
| 115 return IsSubdomainAnchored( | |
| 116 url, host, match_end_position - subpattern.size()); | |
| 117 }); | |
| 118 } | |
| 119 | |
| 120 // Returns the position just beyond the leftmost fuzzy occurrence of | |
| 121 // |subpattern| in the |url|, such that it satisfies a SUBDOMAIN anchor. | |
| 122 template <typename IntType> | |
| 123 inline size_t FindSubdomainAnchoredFuzzy(base::StringPiece url, | |
| 124 url::Component host, | |
| 125 base::StringPiece subpattern, | |
| 126 const IntType* failure) { | |
| 127 auto occurrences = AllOccurrencesFuzzy<IntType>(url, subpattern, failure); | |
| 128 return *std::find_if(occurrences.begin(), occurrences.end(), | |
| 129 [url, host, subpattern](size_t match_end_position) { | |
| 130 DCHECK_GE(match_end_position, subpattern.size()); | |
| 131 return IsSubdomainAnchored( | |
| 132 url, host, match_end_position - subpattern.size()); | |
| 133 }); | |
| 134 } | |
| 135 | |
| 136 } // namespace impl | |
| 137 | |
| 138 template <typename IntType> | |
| 139 void BuildFailureFunction(const UrlPattern& pattern, | |
| 140 std::vector<IntType>* failure) { | |
| 141 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); | |
| 142 auto subpattern_it = subpatterns.begin(); | |
| 143 auto subpattern_end = subpatterns.end(); | |
| 144 | |
| 145 if (subpattern_it == subpattern_end) | |
| 146 return; | |
| 147 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) | |
| 148 ++subpattern_it; | |
| 149 | |
| 150 while (subpattern_it != subpattern_end) { | |
| 151 const base::StringPiece part = *subpattern_it++; | |
| 152 DCHECK(!part.empty()); | |
| 153 if (pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY && | |
| 154 subpattern_it == subpattern_end) { | |
| 155 break; | |
| 156 } | |
| 157 | |
| 158 if (part.find(kSeparatorPlaceholder) == base::StringPiece::npos) { | |
| 159 BuildFailureFunction(part, failure); | |
| 160 } else { | |
| 161 // Prepend the value '1' to the failure function to let matchers | |
| 162 // distinguish between the subpatterns with separator placeholders and | |
| 163 // those without. | |
| 164 failure->push_back(1); | |
| 165 BuildFailureFunctionFuzzy(part, failure); | |
| 166 } | |
| 167 } | |
| 168 } | |
| 169 | |
| 170 template <typename FailureIter> | |
| 171 bool IsUrlPatternMatch(const GURL& url, | |
| 172 const UrlPattern& pattern, | |
| 173 FailureIter failure_begin, | |
| 174 FailureIter failure_end) { | |
| 175 DCHECK(url.is_valid()); | |
| 176 | |
| 177 StringSplitter<impl::IsWildcard> subpatterns(pattern.url_pattern); | |
| 178 auto subpattern_it = subpatterns.begin(); | |
| 179 auto subpattern_end = subpatterns.end(); | |
| 180 | |
| 181 if (subpattern_it == subpattern_end) { | |
| 182 return pattern.anchor_left != proto::ANCHOR_TYPE_BOUNDARY || | |
| 183 pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || | |
| 184 url.is_empty(); | |
| 185 } | |
| 186 | |
| 187 const base::StringPiece spec = url.possibly_invalid_spec(); | |
| 188 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; | |
| 189 | |
| 190 base::StringPiece subpattern = *subpattern_it++; | |
| 191 if (subpattern_it == subpattern_end && | |
| 192 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 193 if (!EndsWithFuzzy(spec, subpattern)) | |
| 194 return false; | |
| 195 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) | |
| 196 return spec.size() == subpattern.size(); | |
| 197 if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
| 198 DCHECK_LE(subpattern.size(), spec.size()); | |
| 199 return url.has_host() && | |
| 200 impl::IsSubdomainAnchored(spec, host_part, | |
| 201 spec.size() - subpattern.size()); | |
| 202 } | |
| 203 return true; | |
| 204 } | |
| 205 | |
| 206 base::StringPiece text = spec; | |
| 207 if (pattern.anchor_left == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 208 if (!StartsWithFuzzy(spec, subpattern)) | |
| 209 return false; | |
| 210 if (subpattern_it == subpattern_end) | |
| 211 return true; | |
| 212 text.remove_prefix(subpattern.size()); | |
| 213 } else if (pattern.anchor_left == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
| 214 if (!url.has_host()) | |
| 215 return false; | |
| 216 | |
| 217 const bool has_separator_placeholders = (*failure_begin != 0); | |
| 218 if (has_separator_placeholders) | |
| 219 ++failure_begin; | |
| 220 | |
| 221 const size_t position = | |
| 222 has_separator_placeholders | |
| 223 ? impl::FindSubdomainAnchoredFuzzy(spec, host_part, subpattern, | |
| 224 &*failure_begin) | |
| 225 : impl::FindSubdomainAnchored(spec, host_part, subpattern, | |
| 226 &*failure_begin); | |
| 227 if (position == base::StringPiece::npos) | |
| 228 return false; | |
| 229 if (subpattern_it == subpattern_end) | |
| 230 return true; | |
| 231 text.remove_prefix(position); | |
| 232 } else { | |
| 233 DCHECK_EQ(pattern.anchor_left, proto::ANCHOR_TYPE_NONE); | |
| 234 // Get back to the initial subpattern, process it in the loop below. | |
| 235 subpattern_it = subpatterns.begin(); | |
| 236 } | |
| 237 | |
| 238 while (subpattern_it != subpattern_end) { | |
| 239 subpattern = *subpattern_it++; | |
| 240 DCHECK(!subpattern.empty()); | |
| 241 | |
| 242 if (subpattern_it == subpattern_end && | |
| 243 pattern.anchor_right == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 244 break; | |
| 245 } | |
| 246 | |
| 247 // The subpatterns with separator placeholders were indexed differently and | |
| 248 // should be matched accordingly. Their failure functions were prepended by | |
| 249 // a non-zero value, and we need to skip it. If the value turned to be zero, | |
| 250 // it is the initial value of a failure function of a regular substring. | |
| 251 CHECK(failure_begin < failure_end); | |
| 252 const bool has_separator_placeholders = (*failure_begin != 0); | |
| 253 if (has_separator_placeholders) | |
| 254 ++failure_begin; | |
| 255 CHECK(static_cast<size_t>(std::distance(failure_begin, failure_end)) >= | |
| 256 subpattern.size()); | |
| 257 | |
| 258 // If the subpattern has separator placeholders, it should be matched with | |
| 259 // FindFirstOccurrenceOfSubpattern, otherwise it can be found as a regular | |
| 260 // substring. | |
| 261 const size_t match_end = | |
| 262 (has_separator_placeholders | |
| 263 ? impl::FindFirstOccurrenceFuzzy(text, subpattern, &*failure_begin) | |
| 264 : FindFirstOccurrence(text, subpattern, &*failure_begin)); | |
| 265 if (match_end == base::StringPiece::npos) | |
| 266 return false; | |
| 267 text.remove_prefix(match_end); | |
| 268 failure_begin += subpattern.size(); | |
| 269 } | |
| 270 | |
| 271 return pattern.anchor_right != proto::ANCHOR_TYPE_BOUNDARY || | |
| 272 EndsWithFuzzy(text, subpattern); | |
| 273 } | |
| 274 | |
| 275 } // namespace subresource_filter | |
| 276 | |
| 277 #endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_MATCHING_H_ | |
| OLD | NEW |