OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene, | 5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene, |
6 // later modified by others), but almost entirely rewritten for Chrome. | 6 // later modified by others), but almost entirely rewritten for Chrome. |
7 // (netwerk/dns/src/nsEffectiveTLDService.cpp) | 7 // (netwerk/dns/src/nsEffectiveTLDService.cpp) |
8 /* ***** BEGIN LICENSE BLOCK ***** | 8 /* ***** BEGIN LICENSE BLOCK ***** |
9 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 | 9 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
10 * | 10 * |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
68 size_t g_graph_length = sizeof(kDafsa); | 68 size_t g_graph_length = sizeof(kDafsa); |
69 | 69 |
70 struct MappedHostComponent { | 70 struct MappedHostComponent { |
71 size_t original_begin; | 71 size_t original_begin; |
72 size_t original_end; | 72 size_t original_end; |
73 | 73 |
74 size_t canonical_begin; | 74 size_t canonical_begin; |
75 size_t canonical_end; | 75 size_t canonical_end; |
76 }; | 76 }; |
77 | 77 |
78 // This function follows the specification at https://publicsuffix.org/list/. | |
79 // |host| is assumed to be canonicalized. | |
78 size_t GetRegistryLengthImpl(base::StringPiece host, | 80 size_t GetRegistryLengthImpl(base::StringPiece host, |
79 UnknownRegistryFilter unknown_filter, | 81 UnknownRegistryFilter unknown_filter, |
80 PrivateRegistryFilter private_filter) { | 82 PrivateRegistryFilter private_filter) { |
81 if (host.empty()) | 83 if (host.empty()) |
82 return std::string::npos; | 84 return std::string::npos; |
83 | 85 |
84 // Skip leading dots. | 86 // A single trailing dot (which fully qualifies the domain name) isn't |
85 const size_t host_check_begin = host.find_first_not_of('.'); | 87 // relevant in this determination, but does need to be included in the final |
86 if (host_check_begin == std::string::npos) | 88 // returned length. Multiple trailing dots are disallowed. |
89 const size_t host_check_rbegin = host.find_last_not_of('.'); | |
90 if (host_check_rbegin == std::string::npos) | |
87 return 0; // Host is only dots. | 91 return 0; // Host is only dots. |
92 if (host_check_rbegin < host.length() - 2) | |
93 return 0; // Host has more than one trailing dot. | |
88 | 94 |
89 // A single trailing dot isn't relevant in this determination, but does need | 95 // Skip any number of leading dots. |host| is known to have at least one non- |
90 // to be included in the final returned length. | 96 // dot character, so find_first_not_of() always succeeds here. |
91 size_t host_check_len = host.length(); | 97 const size_t host_check_rend = host.find_first_not_of('.') - 1; |
92 if (host[host_check_len - 1] == '.') { | 98 |
93 --host_check_len; | 99 // "If no rules match, the prevailing rule is '*'" |
94 DCHECK(host_check_len > 0); // If this weren't true, the host would be ".", | 100 // -- publicsuffix.org, Algorithm 2 |
95 // and we'd have already returned above. | 101 // |
96 if (host[host_check_len - 1] == '.') | 102 // The default-wildcard behavior works by starting in a wildcard state. Omit |
97 return 0; // Multiple trailing dots. | 103 // this step if the caller passes in EXCLUDE_UNKNOWN_REGISTRIES. |
104 size_t in_wildcard = (unknown_filter == INCLUDE_UNKNOWN_REGISTRIES); | |
Ryan Sleevi
2017/03/06 17:20:04
Based on your usage (155/162/172), this should be
| |
105 | |
106 // "Match domain against all rules and take note of the matching ones." | |
107 // -- publicsuffix.org, Algorithm 1 | |
108 // | |
109 // Feed |host| to the |suffix_lookup| DAFSA in reverse character order, | |
110 // checking for rule matches at each label boundary. | |
111 size_t prevailing_rule_pos = host.length(); | |
112 FixedSetIncrementalLookup suffix_lookup(g_graph, g_graph_length); | |
113 for (size_t pos = host_check_rbegin; pos != host_check_rend; --pos) { | |
114 // Feed a character into the DAFSA. | |
115 if (!suffix_lookup.Advance(host[pos]) && !in_wildcard) { | |
116 // The DAFSA is exhausted, and there's no active wildcard rule, so it's | |
117 // possible to stop early. | |
118 break; | |
119 } | |
120 | |
121 // At label boundaries, check the return value of the DAFSA state. This | |
122 // indicates whether there is a matching rule in the public suffix list. | |
123 bool is_last_char_in_label = | |
124 ((pos - 1) == host_check_rend || host[pos - 1] == '.'); | |
125 if (is_last_char_in_label) { | |
Ryan Sleevi
2017/03/06 17:20:04
Does it make more sense to do
if (!is_last_char_i
| |
126 int dafsa_result = suffix_lookup.GetResultForCurrentSequence(); | |
127 if (dafsa_result != kDafsaNotFound && | |
128 ((dafsa_result & kDafsaPrivateRule) == 0 || | |
129 private_filter == INCLUDE_PRIVATE_REGISTRIES)) { | |
130 if (dafsa_result & kDafsaExceptionRule) { | |
131 // "If more than one rule matches, the prevailing rule is the one | |
132 // which is an exception rule." -- publicsuffix.org, Algorithm 3 | |
133 // | |
134 // There can only be at most one exception rule match for a given | |
135 // string. Thus, the first matching exception rule always wins. | |
136 size_t previous_dot = host.find('.', pos); | |
137 if (previous_dot == std::string::npos) { | |
138 // Getting here implies an exception rule with no dots (e.g. | |
139 // "!foo"). But exception rules are only allowed when there is a | |
140 // corresponding wildcard rule. The corresponding wildcard rule for | |
141 // this case would have to be '*', which is explicitly disallowed. | |
Ryan Sleevi
2017/03/06 17:20:04
Maybe it's because I haven't had my morning coffee
| |
142 NOTREACHED() << "Invalid exception rule"; | |
143 return 0; | |
144 } | |
145 DCHECK(in_wildcard); | |
146 | |
147 // "If the prevailing rule is a exception rule, modify it by removing | |
148 // the leftmost label." -- publicsuffix.org, Algorithm 5 | |
149 return host.length() - previous_dot - 1; | |
150 } | |
151 | |
152 if (dafsa_result & kDafsaWildcardRule) { | |
153 // When a wildcard rule is encountered, any sequence of characters in | |
154 // the next label will be treated as a match. | |
155 in_wildcard = true; | |
156 } else { | |
157 // "If there is no matching exception rule, the prevailing rule is the | |
158 // one with the most labels." -- publicsuffix.org, Algorithm 4 | |
159 // | |
160 // Because of the loop structure, the currently-matched rule must have | |
161 // more labels than any previous match, so the match at |pos| wins. | |
162 in_wildcard = false; | |
163 prevailing_rule_pos = pos; | |
164 } | |
165 } else if (in_wildcard) { | |
166 // The wildcard rule encountered at the end of the previous label | |
167 // matches the hostname up to |pos|. This becomes the prevailing match. | |
168 // | |
169 // TODO(nick): Currently an empty label may match a wildcard rule. This | |
170 // would yield a match on the rule "*.b.c" for the malformed host | |
171 // "a..b.c". This is unnecessarily permissive. | |
Ryan Sleevi
2017/03/06 17:20:04
Can you clarify the 'currently' - do you mean in t
| |
172 in_wildcard = false; | |
173 prevailing_rule_pos = pos; | |
174 } | |
175 } | |
98 } | 176 } |
99 | 177 |
100 // Walk up the domain tree, most specific to least specific, | 178 // "The registered or registrable domain is the public suffix plus one |
101 // looking for matches at each level. | 179 // additional label." -- publicsuffix.org, Algorithm 7 |
102 size_t prev_start = std::string::npos; | 180 // |
103 size_t curr_start = host_check_begin; | 181 // If the hostname has no labels beyond its public suffix, fail. |
104 size_t next_dot = host.find('.', curr_start); | 182 if ((prevailing_rule_pos - 1) == host_check_rend) |
105 if (next_dot >= host_check_len) // Catches std::string::npos as well. | 183 return 0; |
106 return 0; // This can't have a registry + domain. | |
107 while (1) { | |
108 const char* domain_str = host.data() + curr_start; | |
109 size_t domain_length = host_check_len - curr_start; | |
110 int type = LookupStringInFixedSet(g_graph, g_graph_length, domain_str, | |
111 domain_length); | |
112 bool do_check = type != kDafsaNotFound && | |
113 (!(type & kDafsaPrivateRule) || | |
114 private_filter == INCLUDE_PRIVATE_REGISTRIES); | |
115 | 184 |
116 // If the apparent match is a private registry and we're not including | 185 // If the end of the string is reached with an active wildcard rule, fail. |
117 // those, it can't be an actual match. | 186 // This corresponds to https://crbug.com/459802, 'the platform.sh problem', |
118 if (do_check) { | 187 // asking: What is the public suffix of "y.z" when the rule set is ["*.y.z", |
119 // Exception rules override wildcard rules when the domain is an exact | 188 // "z"]. Removing the next line would cause the behavior to match the |
120 // match, but wildcards take precedence when there's a subdomain. | 189 // algorithm documented at publicsuffix.org, which treats "z" as the |
121 if (type & kDafsaWildcardRule && (prev_start != std::string::npos)) { | 190 // prevailing rule in this case. |
122 // If prev_start == host_check_begin, then the host is the registry | 191 if (in_wildcard) |
123 // itself, so return 0. | 192 return 0; |
124 return (prev_start == host_check_begin) ? 0 | |
125 : (host.length() - prev_start); | |
126 } | |
127 | 193 |
128 if (type & kDafsaExceptionRule) { | 194 // "The public suffix is the set of labels from the domain which match the |
129 if (next_dot == std::string::npos) { | 195 // labels of the prevailing rule, using the matching algorithm above." |
130 // If we get here, we had an exception rule with no dots (e.g. | 196 // -- publicsuffix.org, Algorithm 6 |
131 // "!foo"). This would only be valid if we had a corresponding | 197 return host.length() - prevailing_rule_pos; |
132 // wildcard rule, which would have to be "*". But we explicitly | |
133 // disallow that case, so this kind of rule is invalid. | |
134 NOTREACHED() << "Invalid exception rule"; | |
135 return 0; | |
136 } | |
137 return host.length() - next_dot - 1; | |
138 } | |
139 | |
140 // If curr_start == host_check_begin, then the host is the registry | |
141 // itself, so return 0. | |
142 return (curr_start == host_check_begin) ? 0 | |
143 : (host.length() - curr_start); | |
144 } | |
145 | |
146 if (next_dot >= host_check_len) // Catches std::string::npos as well. | |
147 break; | |
148 | |
149 prev_start = curr_start; | |
150 curr_start = next_dot + 1; | |
151 next_dot = host.find('.', curr_start); | |
152 } | |
153 | |
154 // No rule found in the registry. curr_start now points to the first | |
155 // character of the last subcomponent of the host, so if we allow unknown | |
156 // registries, return the length of this subcomponent. | |
157 return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ? | |
158 (host.length() - curr_start) : 0; | |
159 } | 198 } |
160 | 199 |
161 base::StringPiece GetDomainAndRegistryImpl( | 200 base::StringPiece GetDomainAndRegistryImpl( |
162 base::StringPiece host, | 201 base::StringPiece host, |
163 PrivateRegistryFilter private_filter) { | 202 PrivateRegistryFilter private_filter) { |
164 DCHECK(!host.empty()); | 203 DCHECK(!host.empty()); |
165 | 204 |
166 // Find the length of the registry for this host. | 205 // Find the length of the registry for this host. |
167 const size_t registry_length = | 206 const size_t registry_length = |
168 GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter); | 207 GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter); |
169 if ((registry_length == std::string::npos) || (registry_length == 0)) | 208 if ((registry_length == std::string::npos) || (registry_length == 0)) |
170 return base::StringPiece(); // No registry. | 209 return base::StringPiece(); // No registry. |
171 // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding | 210 // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding |
172 // subcomponent length. | 211 // subcomponent length. |
173 DCHECK(host.length() >= 2); | 212 DCHECK(host.length() >= 2); |
174 if (registry_length > (host.length() - 2)) { | 213 if (registry_length > (host.length() - 2)) { |
175 NOTREACHED() << | 214 NOTREACHED() << |
176 "Host does not have at least one subcomponent before registry!"; | 215 "Host does not have at least one subcomponent before registry!"; |
177 return base::StringPiece(); | 216 return base::StringPiece(); |
178 } | 217 } |
179 | 218 |
180 // Move past the dot preceding the registry, and search for the next previous | 219 // Move past the dot preceding the registry, and search for the dot before |
181 // dot. Return the host from after that dot, or the whole host when there is | 220 // that. Return the host from after that dot, or the whole host when there is |
182 // no dot. | 221 // no dot. |
183 const size_t dot = host.rfind('.', host.length() - registry_length - 2); | 222 const size_t dot = host.rfind('.', host.length() - registry_length - 2); |
184 if (dot == std::string::npos) | 223 if (dot == std::string::npos) |
185 return host; | 224 return host; |
186 return host.substr(dot + 1); | 225 return host.substr(dot + 1); |
187 } | 226 } |
188 | 227 |
189 // Same as GetDomainAndRegistry, but returns the domain and registry as a | 228 // Same as GetDomainAndRegistry, but returns the domain and registry as a |
190 // StringPiece that references the underlying string of the passed-in |gurl|. | 229 // StringPiece that references the underlying string of the passed-in |gurl|. |
191 // TODO(pkalinnikov): Eliminate this helper by exposing StringPiece as the | 230 // TODO(pkalinnikov): Eliminate this helper by exposing StringPiece as the |
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
252 canon_output.Complete(); | 291 canon_output.Complete(); |
253 | 292 |
254 size_t canonical_rcd_len = | 293 size_t canonical_rcd_len = |
255 GetRegistryLengthImpl(canonical_host, unknown_filter, private_filter); | 294 GetRegistryLengthImpl(canonical_host, unknown_filter, private_filter); |
256 if (canonical_rcd_len == 0 || canonical_rcd_len == std::string::npos) | 295 if (canonical_rcd_len == 0 || canonical_rcd_len == std::string::npos) |
257 return canonical_rcd_len; // Error or no registry controlled domain. | 296 return canonical_rcd_len; // Error or no registry controlled domain. |
258 | 297 |
259 // Find which host component the result started in. | 298 // Find which host component the result started in. |
260 size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len; | 299 size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len; |
261 for (const auto& mapping : components) { | 300 for (const auto& mapping : components) { |
262 // In the common case, GetRegistryLengthImpl will identify the beginning | 301 // In the common case, GetRegistryLengthImpl will identify the beginning of |
263 // of a component and we can just return where that component was in the | 302 // a component. Just return where that component was in the original string. |
264 // original string. | |
265 if (canonical_rcd_begin == mapping.canonical_begin) | 303 if (canonical_rcd_begin == mapping.canonical_begin) |
266 return host.length() - mapping.original_begin; | 304 return host.length() - mapping.original_begin; |
267 | 305 |
268 if (canonical_rcd_begin >= mapping.canonical_end) | 306 if (canonical_rcd_begin >= mapping.canonical_end) |
269 continue; | 307 continue; |
270 | 308 |
271 // The registry controlled domain begin was identified as being in the | 309 // The registry controlled domain begin was identified as being in the |
272 // middle of this dot-separated domain component in the non-canonical | 310 // middle of this dot-separated domain component in the non-canonical |
273 // input. This indicates some form of escaped dot, or a non-ASCII | 311 // input. This indicates some form of escaped dot, or a non-ASCII |
274 // character that was canonicalized to a dot. | 312 // character that was canonicalized to a dot. |
275 // | 313 // |
276 // Brute-force search from the end by repeatedly canonicalizing longer | 314 // Brute-force search from the end by repeatedly canonicalizing longer |
277 // substrings until we get a match for the canonicalized version. This | 315 // substrings, until finding a match for the canonicalized version. This |
278 // can't be done with binary search because canonicalization might increase | 316 // can't be done with binary search because canonicalization might increase |
279 // or decrease the length of the produced string depending on where it's | 317 // or decrease the length of the produced string depending on where it's |
280 // split. This depends on the canonicalization process not changing the | 318 // split. This depends on the canonicalization process not changing the |
281 // order of the characters. Punycode can change the order of characters, | 319 // order of the characters. Punycode can change the order of characters, but |
282 // but it doesn't work across dots so this is safe. | 320 // it doesn't work across dots so this is safe. |
283 | 321 |
284 // Expected canonical registry controlled domain. | 322 // Expected canonical registry controlled domain. |
285 base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin], | 323 base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin], |
286 canonical_rcd_len); | 324 canonical_rcd_len); |
287 | 325 |
288 for (int current_try = static_cast<int>(mapping.original_end) - 1; | 326 for (int current_try = static_cast<int>(mapping.original_end) - 1; |
289 current_try >= static_cast<int>(mapping.original_begin); | 327 current_try >= static_cast<int>(mapping.original_begin); |
290 current_try--) { | 328 current_try--) { |
291 std::string try_string; | 329 std::string try_string; |
292 url::StdStringCanonOutput try_output(&try_string); | 330 url::StdStringCanonOutput try_output(&try_string); |
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
441 | 479 |
442 void SetFindDomainGraph(const unsigned char* domains, size_t length) { | 480 void SetFindDomainGraph(const unsigned char* domains, size_t length) { |
443 CHECK(domains); | 481 CHECK(domains); |
444 CHECK_NE(length, 0u); | 482 CHECK_NE(length, 0u); |
445 g_graph = domains; | 483 g_graph = domains; |
446 g_graph_length = length; | 484 g_graph_length = length; |
447 } | 485 } |
448 | 486 |
449 } // namespace registry_controlled_domains | 487 } // namespace registry_controlled_domains |
450 } // namespace net | 488 } // namespace net |
OLD | NEW |