Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 // The matching logic distinguishes between the terms URL pattern and | |
| 6 // subpattern. A URL pattern usually stands for the full thing, e.g. | |
| 7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring | |
| 8 // of a pattern not containing the wildcard '*' character. For the example above | |
| 9 // the subpatterns are: "example.com^", "path" and "par=val^". | |
| 10 // | |
| 11 // The separator placeholder '^' symbol is used in subpatterns to match any | |
| 12 // separator character, which is any ASCII symbol except letters, digits, and | |
| 13 // the following: '_', '-', '.', '%'. Note that the separator placeholder | |
| 14 // character '^' is itself a separator, as well as '\0'. | |
| 15 | |
| 5 #include "components/subresource_filter/core/common/url_pattern.h" | 16 #include "components/subresource_filter/core/common/url_pattern.h" |
| 6 | 17 |
| 18 #include <stddef.h> | |
| 19 | |
| 20 #include <algorithm> | |
| 21 #include <ostream> | |
| 22 | |
| 23 #include "base/logging.h" | |
| 7 #include "components/subresource_filter/core/common/flat/rules_generated.h" | 24 #include "components/subresource_filter/core/common/flat/rules_generated.h" |
| 25 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" | |
| 26 #include "components/subresource_filter/core/common/string_splitter.h" | |
| 27 #include "url/gurl.h" | |
| 28 #include "url/third_party/mozilla/url_parse.h" | |
| 8 | 29 |
| 9 namespace subresource_filter { | 30 namespace subresource_filter { |
| 10 | 31 |
| 11 namespace { | 32 namespace { |
| 12 | 33 |
| 34 class IsWildcard { | |
| 35 public: | |
| 36 bool operator()(char c) const { return c == '*'; } | |
| 37 }; | |
| 38 | |
| 13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { | 39 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { |
| 14 switch (type) { | 40 switch (type) { |
| 15 case flat::UrlPatternType_SUBSTRING: | 41 case flat::UrlPatternType_SUBSTRING: |
| 16 return proto::URL_PATTERN_TYPE_SUBSTRING; | 42 return proto::URL_PATTERN_TYPE_SUBSTRING; |
| 17 case flat::UrlPatternType_WILDCARDED: | 43 case flat::UrlPatternType_WILDCARDED: |
| 18 return proto::URL_PATTERN_TYPE_WILDCARDED; | 44 return proto::URL_PATTERN_TYPE_WILDCARDED; |
| 19 case flat::UrlPatternType_REGEXP: | 45 case flat::UrlPatternType_REGEXP: |
| 20 return proto::URL_PATTERN_TYPE_REGEXP; | 46 return proto::URL_PATTERN_TYPE_REGEXP; |
| 21 default: | 47 default: |
| 22 return proto::URL_PATTERN_TYPE_UNSPECIFIED; | 48 return proto::URL_PATTERN_TYPE_UNSPECIFIED; |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 34 default: | 60 default: |
| 35 return proto::ANCHOR_TYPE_UNSPECIFIED; | 61 return proto::ANCHOR_TYPE_UNSPECIFIED; |
| 36 } | 62 } |
| 37 } | 63 } |
| 38 | 64 |
| 39 base::StringPiece ConvertString(const flatbuffers::String* string) { | 65 base::StringPiece ConvertString(const flatbuffers::String* string) { |
| 40 return string ? base::StringPiece(string->data(), string->size()) | 66 return string ? base::StringPiece(string->data(), string->size()) |
| 41 : base::StringPiece(); | 67 : base::StringPiece(); |
| 42 } | 68 } |
| 43 | 69 |
| 70 // Returns whether |position| within the |url| belongs to its |host| component | |
| 71 // and corresponds to the beginning of a (sub-)domain. | |
| 72 inline bool IsSubdomainAnchored(base::StringPiece url, | |
| 73 url::Component host, | |
| 74 size_t position) { | |
| 75 DCHECK_LE(position, url.size()); | |
| 76 const size_t host_begin = static_cast<size_t>(host.begin); | |
| 77 const size_t host_end = static_cast<size_t>(host.end()); | |
| 78 DCHECK_LE(host_end, url.size()); | |
| 79 | |
| 80 return position == host_begin || | |
| 81 (position > host_begin && position <= host_end && | |
| 82 url[position - 1] == '.'); | |
| 83 } | |
| 84 | |
| 85 // Returns the position of the leftmost occurrence of |subpattern| in the | |
| 86 // |text|, starting |from| a certain position. If |is_fuzzy| then | |
|
engedy
2017/04/05 10:55:11
nit: s/a certain/the specified/g
pkalinnikov
2017/04/05 13:29:21
Done.
| |
| 87 // searches for a fuzzy occurrence. | |
| 88 size_t FindSubpatternImpl(base::StringPiece text, | |
| 89 base::StringPiece subpattern, | |
| 90 size_t from, | |
| 91 bool is_fuzzy) { | |
| 92 if (!is_fuzzy) | |
| 93 return text.find(subpattern, from); | |
| 94 | |
| 95 auto* found = | |
| 96 std::search(text.begin() + from, text.end(), subpattern.begin(), | |
|
engedy
2017/04/05 10:55:11
nit: Have you considered moving this to fuzzy_patt
pkalinnikov
2017/04/05 13:29:21
Done. Also, reused the old AllOccurrences test.
| |
| 97 subpattern.end(), [](char text_char, char subpattern_char) { | |
|
Charlie Harrison
2017/04/04 16:49:39
nit: Could you hoist the lambda to a member var? I
pkalinnikov
2017/04/05 13:29:21
Done.
| |
| 98 return text_char == subpattern_char || | |
| 99 (subpattern_char == kSeparatorPlaceholder && | |
| 100 IsSeparator(text_char)); | |
| 101 }); | |
| 102 return found == text.end() ? base::StringPiece::npos : found - text.begin(); | |
| 103 } | |
| 104 | |
| 105 // Returns the position of the leftmost occurrence of |subpattern| in the | |
| 106 // |text|, starting |from| a certain position. If the |subpattern| has separator | |
| 107 // placeholders, searches for a fuzzy occurrence. | |
| 108 size_t FindSubpattern(base::StringPiece text, | |
| 109 base::StringPiece subpattern, | |
| 110 size_t from = 0) { | |
| 111 const bool is_fuzzy = | |
| 112 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); | |
| 113 return FindSubpatternImpl(text, subpattern, from, is_fuzzy); | |
| 114 } | |
| 115 | |
| 116 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that | |
| 117 // starts in the beginning of a (sub-)domain within the url's |host| component. | |
|
engedy
2017/04/05 10:55:11
nit: ... at the beginning ...
pkalinnikov
2017/04/05 13:29:21
Done.
| |
| 118 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url, | |
| 119 url::Component host, | |
| 120 base::StringPiece subpattern) { | |
| 121 const bool is_fuzzy = | |
|
engedy
2017/04/05 10:55:11
Do we need to check subpattern for being empty her
pkalinnikov
2017/04/05 13:29:21
Done.
| |
| 122 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); | |
| 123 for (size_t position = FindSubpatternImpl(url, subpattern, 0, is_fuzzy); | |
| 124 position != base::StringPiece::npos; | |
| 125 position = FindSubpatternImpl(url, subpattern, ++position, is_fuzzy)) { | |
| 126 if (IsSubdomainAnchored(url, host, position)) | |
| 127 return position; | |
| 128 } | |
| 129 return base::StringPiece::npos; | |
| 130 } | |
| 131 | |
| 44 } // namespace | 132 } // namespace |
| 45 | 133 |
| 46 UrlPattern::UrlPattern() = default; | 134 UrlPattern::UrlPattern() = default; |
| 47 | 135 |
| 48 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 136 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
| 49 proto::UrlPatternType type) | 137 proto::UrlPatternType type) |
| 50 : type(type), url_pattern(url_pattern) {} | 138 : type_(type), url_pattern_(url_pattern) {} |
| 51 | 139 |
| 52 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 140 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
| 53 proto::AnchorType anchor_left, | 141 proto::AnchorType anchor_left, |
| 54 proto::AnchorType anchor_right) | 142 proto::AnchorType anchor_right) |
| 55 : type(proto::URL_PATTERN_TYPE_WILDCARDED), | 143 : type_(proto::URL_PATTERN_TYPE_WILDCARDED), |
| 56 url_pattern(url_pattern), | 144 url_pattern_(url_pattern), |
| 57 anchor_left(anchor_left), | 145 anchor_left_(anchor_left), |
| 58 anchor_right(anchor_right) {} | 146 anchor_right_(anchor_right) {} |
| 59 | 147 |
| 60 UrlPattern::UrlPattern(const flat::UrlRule& rule) | 148 UrlPattern::UrlPattern(const flat::UrlRule& rule) |
| 61 : type(ConvertUrlPatternType(rule.url_pattern_type())), | 149 : type_(ConvertUrlPatternType(rule.url_pattern_type())), |
| 62 url_pattern(ConvertString(rule.url_pattern())), | 150 url_pattern_(ConvertString(rule.url_pattern())), |
| 63 anchor_left(ConvertAnchorType(rule.anchor_left())), | 151 anchor_left_(ConvertAnchorType(rule.anchor_left())), |
| 64 anchor_right(ConvertAnchorType(rule.anchor_right())), | 152 anchor_right_(ConvertAnchorType(rule.anchor_right())), |
| 65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} | 153 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
| 66 | 154 |
| 67 UrlPattern::UrlPattern(const proto::UrlRule& rule) | 155 UrlPattern::UrlPattern(const proto::UrlRule& rule) |
| 68 : type(rule.url_pattern_type()), | 156 : type_(rule.url_pattern_type()), |
| 69 url_pattern(rule.url_pattern()), | 157 url_pattern_(rule.url_pattern()), |
| 70 anchor_left(rule.anchor_left()), | 158 anchor_left_(rule.anchor_left()), |
| 71 anchor_right(rule.anchor_right()), | 159 anchor_right_(rule.anchor_right()), |
| 72 match_case(rule.match_case()) {} | 160 match_case_(rule.match_case()) {} |
| 73 | 161 |
| 74 UrlPattern::~UrlPattern() = default; | 162 UrlPattern::~UrlPattern() = default; |
| 75 | 163 |
| 164 bool UrlPattern::MatchesUrl(const GURL& url) const { | |
| 165 DCHECK(url.is_valid()); | |
|
engedy
2017/04/05 10:55:11
Could you please add a comment here that empty URL
pkalinnikov
2017/04/05 13:29:21
Done.
| |
| 166 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING || | |
| 167 proto::URL_PATTERN_TYPE_WILDCARDED); | |
| 168 | |
| 169 StringSplitter<IsWildcard> subpatterns(url_pattern_); | |
| 170 auto subpattern_it = subpatterns.begin(); | |
| 171 auto subpattern_end = subpatterns.end(); | |
| 172 | |
| 173 if (subpattern_it == subpattern_end) { | |
| 174 return anchor_left_ == proto::ANCHOR_TYPE_NONE || | |
| 175 anchor_right_ == proto::ANCHOR_TYPE_NONE; | |
| 176 } | |
| 177 | |
| 178 const base::StringPiece spec = url.possibly_invalid_spec(); | |
| 179 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; | |
| 180 DCHECK(!spec.empty()); | |
| 181 | |
| 182 base::StringPiece subpattern = *subpattern_it; | |
|
engedy
2017/04/05 10:55:11
nit: Could you please add a comment to explain why
pkalinnikov
2017/04/05 13:29:21
I splitted *subpattern_it++ in two lines, because
engedy
2017/04/05 13:41:33
Acknowledged.
| |
| 183 ++subpattern_it; | |
| 184 | |
| 185 // If there is only one |subpattern|, and it has a right anchor, then simply | |
| 186 // check that it is a suffix of the |spec|, and the left anchor is fulfilled. | |
| 187 if (subpattern_it == subpattern_end && | |
| 188 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 189 if (!EndsWithFuzzy(spec, subpattern)) | |
| 190 return false; | |
| 191 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) | |
| 192 return spec.size() == subpattern.size(); | |
| 193 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
| 194 DCHECK_LE(subpattern.size(), spec.size()); | |
| 195 return url.has_host() && | |
| 196 IsSubdomainAnchored(spec, host_part, | |
| 197 spec.size() - subpattern.size()); | |
| 198 } | |
| 199 return true; | |
| 200 } | |
| 201 | |
| 202 // Otherwise, the first |subpattern| does not have to be a suffix. But it | |
| 203 // still can have a left anchor. Check and handle that. | |
| 204 base::StringPiece text = spec; | |
| 205 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 206 if (!StartsWithFuzzy(spec, subpattern)) | |
| 207 return false; | |
| 208 if (subpattern_it == subpattern_end) { | |
| 209 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); | |
| 210 return true; | |
| 211 } | |
| 212 text.remove_prefix(subpattern.size()); | |
| 213 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { | |
| 214 if (!url.has_host()) | |
| 215 return false; | |
| 216 const size_t match_begin = | |
| 217 FindSubdomainAnchoredSubpattern(spec, host_part, subpattern); | |
| 218 if (match_begin == base::StringPiece::npos) | |
| 219 return false; | |
| 220 if (subpattern_it == subpattern_end) { | |
| 221 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); | |
| 222 return true; | |
| 223 } | |
| 224 text.remove_prefix(match_begin + subpattern.size()); | |
| 225 } else { | |
| 226 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE); | |
| 227 // Get back to the initial |subpattern|, process it in the loop below. | |
| 228 subpattern_it = subpatterns.begin(); | |
| 229 } | |
| 230 | |
| 231 // Consecutively find all the remaining subpatterns in the |text|. If the | |
| 232 // pattern has a right anchor, don't search for the last subpattern, but | |
| 233 // instead check that it is a suffix of the |text|. | |
| 234 while (subpattern_it != subpattern_end) { | |
| 235 subpattern = *subpattern_it; | |
| 236 DCHECK(!subpattern.empty()); | |
| 237 | |
| 238 if (++subpattern_it == subpattern_end && | |
| 239 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { | |
| 240 break; | |
| 241 } | |
| 242 | |
| 243 const size_t match_position = FindSubpattern(text, subpattern); | |
| 244 if (match_position == base::StringPiece::npos) | |
| 245 return false; | |
| 246 text.remove_prefix(match_position + subpattern.size()); | |
| 247 } | |
| 248 | |
| 249 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY || | |
| 250 EndsWithFuzzy(text, subpattern); | |
| 251 } | |
| 252 | |
| 253 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) { | |
| 254 // Note: Each fall-through in this switch is intentional. | |
| 255 switch (pattern.anchor_left()) { | |
| 256 case proto::ANCHOR_TYPE_SUBDOMAIN: | |
| 257 out << '|'; | |
| 258 case proto::ANCHOR_TYPE_BOUNDARY: | |
| 259 out << '|'; | |
| 260 default: | |
| 261 break; | |
| 262 } | |
| 263 out << pattern.url_pattern(); | |
| 264 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY) | |
| 265 out << '|'; | |
| 266 if (pattern.match_case()) | |
| 267 out << "$match-case"; | |
| 268 | |
| 269 return out; | |
| 270 } | |
| 271 | |
| 76 } // namespace subresource_filter | 272 } // namespace subresource_filter |
| OLD | NEW |