Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(11)

Side by Side Diff: net/base/registry_controlled_domains/registry_controlled_domain.cc

Issue 2649033004: [3 of 4] Speedup GetRegistryLengthImpl() by seeding the DAFSA in reverse
Patch Set: Language. Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene, 5 // NB: Modelled after Mozilla's code (originally written by Pamela Greene,
6 // later modified by others), but almost entirely rewritten for Chrome. 6 // later modified by others), but almost entirely rewritten for Chrome.
7 // (netwerk/dns/src/nsEffectiveTLDService.cpp) 7 // (netwerk/dns/src/nsEffectiveTLDService.cpp)
8 /* ***** BEGIN LICENSE BLOCK ***** 8 /* ***** BEGIN LICENSE BLOCK *****
9 * Version: MPL 1.1/GPL 2.0/LGPL 2.1 9 * Version: MPL 1.1/GPL 2.0/LGPL 2.1
10 * 10 *
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
68 size_t g_graph_length = sizeof(kDafsa); 68 size_t g_graph_length = sizeof(kDafsa);
69 69
70 struct MappedHostComponent { 70 struct MappedHostComponent {
71 size_t original_begin; 71 size_t original_begin;
72 size_t original_end; 72 size_t original_end;
73 73
74 size_t canonical_begin; 74 size_t canonical_begin;
75 size_t canonical_end; 75 size_t canonical_end;
76 }; 76 };
77 77
78 // This function follows the specification at https://publicsuffix.org/list/.
79 // |host| is assumed to be canonicalized.
78 size_t GetRegistryLengthImpl(base::StringPiece host, 80 size_t GetRegistryLengthImpl(base::StringPiece host,
79 UnknownRegistryFilter unknown_filter, 81 UnknownRegistryFilter unknown_filter,
80 PrivateRegistryFilter private_filter) { 82 PrivateRegistryFilter private_filter) {
81 if (host.empty()) 83 if (host.empty())
82 return std::string::npos; 84 return std::string::npos;
83 85
84 // Skip leading dots. 86 // A single trailing dot (which fully qualifies the domain name) isn't
85 const size_t host_check_begin = host.find_first_not_of('.'); 87 // relevant in this determination, but does need to be included in the final
86 if (host_check_begin == std::string::npos) 88 // returned length. Multiple trailing dots are disallowed.
89 const size_t host_check_rbegin = host.find_last_not_of('.');
90 if (host_check_rbegin == std::string::npos)
87 return 0; // Host is only dots. 91 return 0; // Host is only dots.
92 if (host_check_rbegin < host.length() - 2)
93 return 0; // Host has more than one trailing dot.
88 94
89 // A single trailing dot isn't relevant in this determination, but does need 95 // Skip any number of leading dots. |host| is known to have at least one non-
90 // to be included in the final returned length. 96 // dot character, so find_first_not_of() always succeeds here.
91 size_t host_check_len = host.length(); 97 const size_t host_check_rend = host.find_first_not_of('.') - 1;
92 if (host[host_check_len - 1] == '.') { 98
93 --host_check_len; 99 // "If no rules match, the prevailing rule is '*'"
94 DCHECK(host_check_len > 0); // If this weren't true, the host would be ".", 100 // -- publicsuffix.org, Algorithm 2
95 // and we'd have already returned above. 101 //
96 if (host[host_check_len - 1] == '.') 102 // The default-wildcard behavior works by starting in a wildcard state. Omit
97 return 0; // Multiple trailing dots. 103 // this step if the caller passes in EXCLUDE_UNKNOWN_REGISTRIES.
104 size_t in_wildcard = (unknown_filter == INCLUDE_UNKNOWN_REGISTRIES);
Ryan Sleevi 2017/03/06 17:20:04 Based on your usage (155/162/172), this should be
105
106 // "Match domain against all rules and take note of the matching ones."
107 // -- publicsuffix.org, Algorithm 1
108 //
109 // Feed |host| to the |suffix_lookup| DAFSA in reverse character order,
110 // checking for rule matches at each label boundary.
111 size_t prevailing_rule_pos = host.length();
112 FixedSetIncrementalLookup suffix_lookup(g_graph, g_graph_length);
113 for (size_t pos = host_check_rbegin; pos != host_check_rend; --pos) {
114 // Feed a character into the DAFSA.
115 if (!suffix_lookup.Advance(host[pos]) && !in_wildcard) {
116 // The DAFSA is exhausted, and there's no active wildcard rule, so it's
117 // possible to stop early.
118 break;
119 }
120
121 // At label boundaries, check the return value of the DAFSA state. This
122 // indicates whether there is a matching rule in the public suffix list.
123 bool is_last_char_in_label =
124 ((pos - 1) == host_check_rend || host[pos - 1] == '.');
125 if (is_last_char_in_label) {
Ryan Sleevi 2017/03/06 17:20:04 Does it make more sense to do if (!is_last_char_i
126 int dafsa_result = suffix_lookup.GetResultForCurrentSequence();
127 if (dafsa_result != kDafsaNotFound &&
128 ((dafsa_result & kDafsaPrivateRule) == 0 ||
129 private_filter == INCLUDE_PRIVATE_REGISTRIES)) {
130 if (dafsa_result & kDafsaExceptionRule) {
131 // "If more than one rule matches, the prevailing rule is the one
132 // which is an exception rule." -- publicsuffix.org, Algorithm 3
133 //
134 // There can only be at most one exception rule match for a given
135 // string. Thus, the first matching exception rule always wins.
136 size_t previous_dot = host.find('.', pos);
137 if (previous_dot == std::string::npos) {
138 // Getting here implies an exception rule with no dots (e.g.
139 // "!foo"). But exception rules are only allowed when there is a
140 // corresponding wildcard rule. The corresponding wildcard rule for
141 // this case would have to be '*', which is explicitly disallowed.
Ryan Sleevi 2017/03/06 17:20:04 Maybe it's because I haven't had my morning coffee
142 NOTREACHED() << "Invalid exception rule";
143 return 0;
144 }
145 DCHECK(in_wildcard);
146
147 // "If the prevailing rule is a exception rule, modify it by removing
148 // the leftmost label." -- publicsuffix.org, Algorithm 5
149 return host.length() - previous_dot - 1;
150 }
151
152 if (dafsa_result & kDafsaWildcardRule) {
153 // When a wildcard rule is encountered, any sequence of characters in
154 // the next label will be treated as a match.
155 in_wildcard = true;
156 } else {
157 // "If there is no matching exception rule, the prevailing rule is the
158 // one with the most labels." -- publicsuffix.org, Algorithm 4
159 //
160 // Because of the loop structure, the currently-matched rule must have
161 // more labels than any previous match, so the match at |pos| wins.
162 in_wildcard = false;
163 prevailing_rule_pos = pos;
164 }
165 } else if (in_wildcard) {
166 // The wildcard rule encountered at the end of the previous label
167 // matches the hostname up to |pos|. This becomes the prevailing match.
168 //
169 // TODO(nick): Currently an empty label may match a wildcard rule. This
170 // would yield a match on the rule "*.b.c" for the malformed host
171 // "a..b.c". This is unnecessarily permissive.
Ryan Sleevi 2017/03/06 17:20:04 Can you clarify the 'currently' - do you mean in t
172 in_wildcard = false;
173 prevailing_rule_pos = pos;
174 }
175 }
98 } 176 }
99 177
100 // Walk up the domain tree, most specific to least specific, 178 // "The registered or registrable domain is the public suffix plus one
101 // looking for matches at each level. 179 // additional label." -- publicsuffix.org, Algorithm 7
102 size_t prev_start = std::string::npos; 180 //
103 size_t curr_start = host_check_begin; 181 // If the hostname has no labels beyond its public suffix, fail.
104 size_t next_dot = host.find('.', curr_start); 182 if ((prevailing_rule_pos - 1) == host_check_rend)
105 if (next_dot >= host_check_len) // Catches std::string::npos as well. 183 return 0;
106 return 0; // This can't have a registry + domain.
107 while (1) {
108 const char* domain_str = host.data() + curr_start;
109 size_t domain_length = host_check_len - curr_start;
110 int type = LookupStringInFixedSet(g_graph, g_graph_length, domain_str,
111 domain_length);
112 bool do_check = type != kDafsaNotFound &&
113 (!(type & kDafsaPrivateRule) ||
114 private_filter == INCLUDE_PRIVATE_REGISTRIES);
115 184
116 // If the apparent match is a private registry and we're not including 185 // If the end of the string is reached with an active wildcard rule, fail.
117 // those, it can't be an actual match. 186 // This corresponds to https://crbug.com/459802, 'the platform.sh problem',
118 if (do_check) { 187 // asking: What is the public suffix of "y.z" when the rule set is ["*.y.z",
119 // Exception rules override wildcard rules when the domain is an exact 188 // "z"]. Removing the next line would cause the behavior to match the
120 // match, but wildcards take precedence when there's a subdomain. 189 // algorithm documented at publicsuffix.org, which treats "z" as the
121 if (type & kDafsaWildcardRule && (prev_start != std::string::npos)) { 190 // prevailing rule in this case.
122 // If prev_start == host_check_begin, then the host is the registry 191 if (in_wildcard)
123 // itself, so return 0. 192 return 0;
124 return (prev_start == host_check_begin) ? 0
125 : (host.length() - prev_start);
126 }
127 193
128 if (type & kDafsaExceptionRule) { 194 // "The public suffix is the set of labels from the domain which match the
129 if (next_dot == std::string::npos) { 195 // labels of the prevailing rule, using the matching algorithm above."
130 // If we get here, we had an exception rule with no dots (e.g. 196 // -- publicsuffix.org, Algorithm 6
131 // "!foo"). This would only be valid if we had a corresponding 197 return host.length() - prevailing_rule_pos;
132 // wildcard rule, which would have to be "*". But we explicitly
133 // disallow that case, so this kind of rule is invalid.
134 NOTREACHED() << "Invalid exception rule";
135 return 0;
136 }
137 return host.length() - next_dot - 1;
138 }
139
140 // If curr_start == host_check_begin, then the host is the registry
141 // itself, so return 0.
142 return (curr_start == host_check_begin) ? 0
143 : (host.length() - curr_start);
144 }
145
146 if (next_dot >= host_check_len) // Catches std::string::npos as well.
147 break;
148
149 prev_start = curr_start;
150 curr_start = next_dot + 1;
151 next_dot = host.find('.', curr_start);
152 }
153
154 // No rule found in the registry. curr_start now points to the first
155 // character of the last subcomponent of the host, so if we allow unknown
156 // registries, return the length of this subcomponent.
157 return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ?
158 (host.length() - curr_start) : 0;
159 } 198 }
160 199
161 base::StringPiece GetDomainAndRegistryImpl( 200 base::StringPiece GetDomainAndRegistryImpl(
162 base::StringPiece host, 201 base::StringPiece host,
163 PrivateRegistryFilter private_filter) { 202 PrivateRegistryFilter private_filter) {
164 DCHECK(!host.empty()); 203 DCHECK(!host.empty());
165 204
166 // Find the length of the registry for this host. 205 // Find the length of the registry for this host.
167 const size_t registry_length = 206 const size_t registry_length =
168 GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter); 207 GetRegistryLengthImpl(host, INCLUDE_UNKNOWN_REGISTRIES, private_filter);
169 if ((registry_length == std::string::npos) || (registry_length == 0)) 208 if ((registry_length == std::string::npos) || (registry_length == 0))
170 return base::StringPiece(); // No registry. 209 return base::StringPiece(); // No registry.
171 // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding 210 // The "2" in this next line is 1 for the dot, plus a 1-char minimum preceding
172 // subcomponent length. 211 // subcomponent length.
173 DCHECK(host.length() >= 2); 212 DCHECK(host.length() >= 2);
174 if (registry_length > (host.length() - 2)) { 213 if (registry_length > (host.length() - 2)) {
175 NOTREACHED() << 214 NOTREACHED() <<
176 "Host does not have at least one subcomponent before registry!"; 215 "Host does not have at least one subcomponent before registry!";
177 return base::StringPiece(); 216 return base::StringPiece();
178 } 217 }
179 218
180 // Move past the dot preceding the registry, and search for the next previous 219 // Move past the dot preceding the registry, and search for the dot before
181 // dot. Return the host from after that dot, or the whole host when there is 220 // that. Return the host from after that dot, or the whole host when there is
182 // no dot. 221 // no dot.
183 const size_t dot = host.rfind('.', host.length() - registry_length - 2); 222 const size_t dot = host.rfind('.', host.length() - registry_length - 2);
184 if (dot == std::string::npos) 223 if (dot == std::string::npos)
185 return host; 224 return host;
186 return host.substr(dot + 1); 225 return host.substr(dot + 1);
187 } 226 }
188 227
189 // Same as GetDomainAndRegistry, but returns the domain and registry as a 228 // Same as GetDomainAndRegistry, but returns the domain and registry as a
190 // StringPiece that references the underlying string of the passed-in |gurl|. 229 // StringPiece that references the underlying string of the passed-in |gurl|.
191 // TODO(pkalinnikov): Eliminate this helper by exposing StringPiece as the 230 // TODO(pkalinnikov): Eliminate this helper by exposing StringPiece as the
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
252 canon_output.Complete(); 291 canon_output.Complete();
253 292
254 size_t canonical_rcd_len = 293 size_t canonical_rcd_len =
255 GetRegistryLengthImpl(canonical_host, unknown_filter, private_filter); 294 GetRegistryLengthImpl(canonical_host, unknown_filter, private_filter);
256 if (canonical_rcd_len == 0 || canonical_rcd_len == std::string::npos) 295 if (canonical_rcd_len == 0 || canonical_rcd_len == std::string::npos)
257 return canonical_rcd_len; // Error or no registry controlled domain. 296 return canonical_rcd_len; // Error or no registry controlled domain.
258 297
259 // Find which host component the result started in. 298 // Find which host component the result started in.
260 size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len; 299 size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len;
261 for (const auto& mapping : components) { 300 for (const auto& mapping : components) {
262 // In the common case, GetRegistryLengthImpl will identify the beginning 301 // In the common case, GetRegistryLengthImpl will identify the beginning of
263 // of a component and we can just return where that component was in the 302 // a component. Just return where that component was in the original string.
264 // original string.
265 if (canonical_rcd_begin == mapping.canonical_begin) 303 if (canonical_rcd_begin == mapping.canonical_begin)
266 return host.length() - mapping.original_begin; 304 return host.length() - mapping.original_begin;
267 305
268 if (canonical_rcd_begin >= mapping.canonical_end) 306 if (canonical_rcd_begin >= mapping.canonical_end)
269 continue; 307 continue;
270 308
271 // The registry controlled domain begin was identified as being in the 309 // The registry controlled domain begin was identified as being in the
272 // middle of this dot-separated domain component in the non-canonical 310 // middle of this dot-separated domain component in the non-canonical
273 // input. This indicates some form of escaped dot, or a non-ASCII 311 // input. This indicates some form of escaped dot, or a non-ASCII
274 // character that was canonicalized to a dot. 312 // character that was canonicalized to a dot.
275 // 313 //
276 // Brute-force search from the end by repeatedly canonicalizing longer 314 // Brute-force search from the end by repeatedly canonicalizing longer
277 // substrings until we get a match for the canonicalized version. This 315 // substrings, until finding a match for the canonicalized version. This
278 // can't be done with binary search because canonicalization might increase 316 // can't be done with binary search because canonicalization might increase
279 // or decrease the length of the produced string depending on where it's 317 // or decrease the length of the produced string depending on where it's
280 // split. This depends on the canonicalization process not changing the 318 // split. This depends on the canonicalization process not changing the
281 // order of the characters. Punycode can change the order of characters, 319 // order of the characters. Punycode can change the order of characters, but
282 // but it doesn't work across dots so this is safe. 320 // it doesn't work across dots so this is safe.
283 321
284 // Expected canonical registry controlled domain. 322 // Expected canonical registry controlled domain.
285 base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin], 323 base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin],
286 canonical_rcd_len); 324 canonical_rcd_len);
287 325
288 for (int current_try = static_cast<int>(mapping.original_end) - 1; 326 for (int current_try = static_cast<int>(mapping.original_end) - 1;
289 current_try >= static_cast<int>(mapping.original_begin); 327 current_try >= static_cast<int>(mapping.original_begin);
290 current_try--) { 328 current_try--) {
291 std::string try_string; 329 std::string try_string;
292 url::StdStringCanonOutput try_output(&try_string); 330 url::StdStringCanonOutput try_output(&try_string);
(...skipping 148 matching lines...) Expand 10 before | Expand all | Expand 10 after
441 479
442 void SetFindDomainGraph(const unsigned char* domains, size_t length) { 480 void SetFindDomainGraph(const unsigned char* domains, size_t length) {
443 CHECK(domains); 481 CHECK(domains);
444 CHECK_NE(length, 0u); 482 CHECK_NE(length, 0u);
445 g_graph = domains; 483 g_graph = domains;
446 g_graph_length = length; 484 g_graph_length = length;
447 } 485 }
448 486
449 } // namespace registry_controlled_domains 487 } // namespace registry_controlled_domains
450 } // namespace net 488 } // namespace net
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698