OLD | NEW |
1 // Copyright 2016 The Chromium Authors. All rights reserved. | 1 // Copyright 2016 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
| 5 // The matching logic distinguishes between the terms URL pattern and |
| 6 // subpattern. A URL pattern usually stands for the full thing, e.g. |
| 7 // "example.com^*path*par=val^", whereas subpattern denotes a maximal substring |
| 8 // of a pattern not containing the wildcard '*' character. For the example above |
| 9 // the subpatterns are: "example.com^", "path" and "par=val^". |
| 10 // |
| 11 // The separator placeholder '^' symbol is used in subpatterns to match any |
| 12 // separator character, which is any ASCII symbol except letters, digits, and |
| 13 // the following: '_', '-', '.', '%'. Note that the separator placeholder |
| 14 // character '^' is itself a separator, as well as '\0'. |
| 15 |
5 #include "components/subresource_filter/core/common/url_pattern.h" | 16 #include "components/subresource_filter/core/common/url_pattern.h" |
6 | 17 |
| 18 #include <stddef.h> |
| 19 |
| 20 #include <ostream> |
| 21 |
| 22 #include "base/logging.h" |
7 #include "components/subresource_filter/core/common/flat/rules_generated.h" | 23 #include "components/subresource_filter/core/common/flat/rules_generated.h" |
| 24 #include "components/subresource_filter/core/common/fuzzy_pattern_matching.h" |
| 25 #include "components/subresource_filter/core/common/string_splitter.h" |
| 26 #include "url/gurl.h" |
| 27 #include "url/third_party/mozilla/url_parse.h" |
8 | 28 |
9 namespace subresource_filter { | 29 namespace subresource_filter { |
10 | 30 |
11 namespace { | 31 namespace { |
12 | 32 |
| 33 class IsWildcard { |
| 34 public: |
| 35 bool operator()(char c) const { return c == '*'; } |
| 36 }; |
| 37 |
13 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { | 38 proto::UrlPatternType ConvertUrlPatternType(flat::UrlPatternType type) { |
14 switch (type) { | 39 switch (type) { |
15 case flat::UrlPatternType_SUBSTRING: | 40 case flat::UrlPatternType_SUBSTRING: |
16 return proto::URL_PATTERN_TYPE_SUBSTRING; | 41 return proto::URL_PATTERN_TYPE_SUBSTRING; |
17 case flat::UrlPatternType_WILDCARDED: | 42 case flat::UrlPatternType_WILDCARDED: |
18 return proto::URL_PATTERN_TYPE_WILDCARDED; | 43 return proto::URL_PATTERN_TYPE_WILDCARDED; |
19 case flat::UrlPatternType_REGEXP: | 44 case flat::UrlPatternType_REGEXP: |
20 return proto::URL_PATTERN_TYPE_REGEXP; | 45 return proto::URL_PATTERN_TYPE_REGEXP; |
21 default: | 46 default: |
22 return proto::URL_PATTERN_TYPE_UNSPECIFIED; | 47 return proto::URL_PATTERN_TYPE_UNSPECIFIED; |
(...skipping 11 matching lines...) Expand all Loading... |
34 default: | 59 default: |
35 return proto::ANCHOR_TYPE_UNSPECIFIED; | 60 return proto::ANCHOR_TYPE_UNSPECIFIED; |
36 } | 61 } |
37 } | 62 } |
38 | 63 |
39 base::StringPiece ConvertString(const flatbuffers::String* string) { | 64 base::StringPiece ConvertString(const flatbuffers::String* string) { |
40 return string ? base::StringPiece(string->data(), string->size()) | 65 return string ? base::StringPiece(string->data(), string->size()) |
41 : base::StringPiece(); | 66 : base::StringPiece(); |
42 } | 67 } |
43 | 68 |
| 69 // Returns whether |position| within the |url| belongs to its |host| component |
| 70 // and corresponds to the beginning of a (sub-)domain. |
| 71 inline bool IsSubdomainAnchored(base::StringPiece url, |
| 72 url::Component host, |
| 73 size_t position) { |
| 74 DCHECK_LE(position, url.size()); |
| 75 const size_t host_begin = static_cast<size_t>(host.begin); |
| 76 const size_t host_end = static_cast<size_t>(host.end()); |
| 77 DCHECK_LE(host_end, url.size()); |
| 78 |
| 79 return position == host_begin || |
| 80 (position > host_begin && position <= host_end && |
| 81 url[position - 1] == '.'); |
| 82 } |
| 83 |
| 84 // Returns the position of the leftmost occurrence of a |subpattern| in the |
| 85 // |text| starting no earlier than |from| the specified position. If the |
| 86 // |subpattern| has separator placeholders, searches for a fuzzy occurrence. |
| 87 size_t FindSubpattern(base::StringPiece text, |
| 88 base::StringPiece subpattern, |
| 89 size_t from = 0) { |
| 90 const bool is_fuzzy = |
| 91 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
| 92 return is_fuzzy ? FindFuzzy(text, subpattern, from) |
| 93 : text.find(subpattern, from); |
| 94 } |
| 95 |
| 96 // Same as FindSubpattern(url, subpattern), but searches for an occurrence that |
| 97 // starts at the beginning of a (sub-)domain within the url's |host| component. |
| 98 size_t FindSubdomainAnchoredSubpattern(base::StringPiece url, |
| 99 url::Component host, |
| 100 base::StringPiece subpattern) { |
| 101 const bool is_fuzzy = |
| 102 (subpattern.find(kSeparatorPlaceholder) != base::StringPiece::npos); |
| 103 |
| 104 for (size_t position = 0; position <= url.size(); ++position) { |
| 105 position = is_fuzzy ? FindFuzzy(url, subpattern, position) |
| 106 : url.find(subpattern, position); |
| 107 if (position == base::StringPiece::npos || |
| 108 IsSubdomainAnchored(url, host, position)) { |
| 109 return position; |
| 110 } |
| 111 } |
| 112 return base::StringPiece::npos; |
| 113 } |
| 114 |
44 } // namespace | 115 } // namespace |
45 | 116 |
46 UrlPattern::UrlPattern() = default; | 117 UrlPattern::UrlPattern() = default; |
47 | 118 |
48 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 119 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
49 proto::UrlPatternType type) | 120 proto::UrlPatternType type) |
50 : type(type), url_pattern(url_pattern) {} | 121 : type_(type), url_pattern_(url_pattern) {} |
51 | 122 |
52 UrlPattern::UrlPattern(base::StringPiece url_pattern, | 123 UrlPattern::UrlPattern(base::StringPiece url_pattern, |
53 proto::AnchorType anchor_left, | 124 proto::AnchorType anchor_left, |
54 proto::AnchorType anchor_right) | 125 proto::AnchorType anchor_right) |
55 : type(proto::URL_PATTERN_TYPE_WILDCARDED), | 126 : type_(proto::URL_PATTERN_TYPE_WILDCARDED), |
56 url_pattern(url_pattern), | 127 url_pattern_(url_pattern), |
57 anchor_left(anchor_left), | 128 anchor_left_(anchor_left), |
58 anchor_right(anchor_right) {} | 129 anchor_right_(anchor_right) {} |
59 | 130 |
60 UrlPattern::UrlPattern(const flat::UrlRule& rule) | 131 UrlPattern::UrlPattern(const flat::UrlRule& rule) |
61 : type(ConvertUrlPatternType(rule.url_pattern_type())), | 132 : type_(ConvertUrlPatternType(rule.url_pattern_type())), |
62 url_pattern(ConvertString(rule.url_pattern())), | 133 url_pattern_(ConvertString(rule.url_pattern())), |
63 anchor_left(ConvertAnchorType(rule.anchor_left())), | 134 anchor_left_(ConvertAnchorType(rule.anchor_left())), |
64 anchor_right(ConvertAnchorType(rule.anchor_right())), | 135 anchor_right_(ConvertAnchorType(rule.anchor_right())), |
65 match_case(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} | 136 match_case_(!!(rule.options() & flat::OptionFlag_IS_MATCH_CASE)) {} |
66 | 137 |
67 UrlPattern::UrlPattern(const proto::UrlRule& rule) | 138 UrlPattern::UrlPattern(const proto::UrlRule& rule) |
68 : type(rule.url_pattern_type()), | 139 : type_(rule.url_pattern_type()), |
69 url_pattern(rule.url_pattern()), | 140 url_pattern_(rule.url_pattern()), |
70 anchor_left(rule.anchor_left()), | 141 anchor_left_(rule.anchor_left()), |
71 anchor_right(rule.anchor_right()), | 142 anchor_right_(rule.anchor_right()), |
72 match_case(rule.match_case()) {} | 143 match_case_(rule.match_case()) {} |
73 | 144 |
74 UrlPattern::~UrlPattern() = default; | 145 UrlPattern::~UrlPattern() = default; |
75 | 146 |
| 147 bool UrlPattern::MatchesUrl(const GURL& url) const { |
| 148 // Note: Empty domains are also invalid. |
| 149 DCHECK(url.is_valid()); |
| 150 DCHECK(type_ == proto::URL_PATTERN_TYPE_SUBSTRING || |
| 151 type_ == proto::URL_PATTERN_TYPE_WILDCARDED); |
| 152 |
| 153 StringSplitter<IsWildcard> subpatterns(url_pattern_); |
| 154 auto subpattern_it = subpatterns.begin(); |
| 155 auto subpattern_end = subpatterns.end(); |
| 156 |
| 157 if (subpattern_it == subpattern_end) { |
| 158 return anchor_left_ == proto::ANCHOR_TYPE_NONE || |
| 159 anchor_right_ == proto::ANCHOR_TYPE_NONE; |
| 160 } |
| 161 |
| 162 const base::StringPiece spec = url.possibly_invalid_spec(); |
| 163 const url::Component host_part = url.parsed_for_possibly_invalid_spec().host; |
| 164 DCHECK(!spec.empty()); |
| 165 |
| 166 base::StringPiece subpattern = *subpattern_it; |
| 167 ++subpattern_it; |
| 168 |
| 169 // If there is only one |subpattern|, and it has a right anchor, then simply |
| 170 // check that it is a suffix of the |spec|, and the left anchor is fulfilled. |
| 171 if (subpattern_it == subpattern_end && |
| 172 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| 173 if (!EndsWithFuzzy(spec, subpattern)) |
| 174 return false; |
| 175 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) |
| 176 return spec.size() == subpattern.size(); |
| 177 if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
| 178 DCHECK_LE(subpattern.size(), spec.size()); |
| 179 return url.has_host() && |
| 180 IsSubdomainAnchored(spec, host_part, |
| 181 spec.size() - subpattern.size()); |
| 182 } |
| 183 return true; |
| 184 } |
| 185 |
| 186 // Otherwise, the first |subpattern| does not have to be a suffix. But it |
| 187 // still can have a left anchor. Check and handle that. |
| 188 base::StringPiece text = spec; |
| 189 if (anchor_left_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| 190 if (!StartsWithFuzzy(spec, subpattern)) |
| 191 return false; |
| 192 if (subpattern_it == subpattern_end) { |
| 193 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
| 194 return true; |
| 195 } |
| 196 text.remove_prefix(subpattern.size()); |
| 197 } else if (anchor_left_ == proto::ANCHOR_TYPE_SUBDOMAIN) { |
| 198 if (!url.has_host()) |
| 199 return false; |
| 200 const size_t match_begin = |
| 201 FindSubdomainAnchoredSubpattern(spec, host_part, subpattern); |
| 202 if (match_begin == base::StringPiece::npos) |
| 203 return false; |
| 204 if (subpattern_it == subpattern_end) { |
| 205 DCHECK_EQ(anchor_right_, proto::ANCHOR_TYPE_NONE); |
| 206 return true; |
| 207 } |
| 208 text.remove_prefix(match_begin + subpattern.size()); |
| 209 } else { |
| 210 DCHECK_EQ(anchor_left_, proto::ANCHOR_TYPE_NONE); |
| 211 // Get back to the initial |subpattern|, process it in the loop below. |
| 212 subpattern_it = subpatterns.begin(); |
| 213 } |
| 214 |
| 215 // Consecutively find all the remaining subpatterns in the |text|. If the |
| 216 // pattern has a right anchor, don't search for the last subpattern, but |
| 217 // instead check that it is a suffix of the |text|. |
| 218 while (subpattern_it != subpattern_end) { |
| 219 subpattern = *subpattern_it; |
| 220 DCHECK(!subpattern.empty()); |
| 221 |
| 222 if (++subpattern_it == subpattern_end && |
| 223 anchor_right_ == proto::ANCHOR_TYPE_BOUNDARY) { |
| 224 break; |
| 225 } |
| 226 |
| 227 const size_t match_position = FindSubpattern(text, subpattern); |
| 228 if (match_position == base::StringPiece::npos) |
| 229 return false; |
| 230 text.remove_prefix(match_position + subpattern.size()); |
| 231 } |
| 232 |
| 233 return anchor_right_ != proto::ANCHOR_TYPE_BOUNDARY || |
| 234 EndsWithFuzzy(text, subpattern); |
| 235 } |
| 236 |
| 237 std::ostream& operator<<(std::ostream& out, const UrlPattern& pattern) { |
| 238 // Note: Each fall-through in this switch is intentional. |
| 239 switch (pattern.anchor_left()) { |
| 240 case proto::ANCHOR_TYPE_SUBDOMAIN: |
| 241 out << '|'; |
| 242 case proto::ANCHOR_TYPE_BOUNDARY: |
| 243 out << '|'; |
| 244 default: |
| 245 break; |
| 246 } |
| 247 out << pattern.url_pattern(); |
| 248 if (pattern.anchor_right() == proto::ANCHOR_TYPE_BOUNDARY) |
| 249 out << '|'; |
| 250 if (pattern.match_case()) |
| 251 out << "$match-case"; |
| 252 |
| 253 return out; |
| 254 } |
| 255 |
76 } // namespace subresource_filter | 256 } // namespace subresource_filter |
OLD | NEW |