Index: net/base/registry_controlled_domains/registry_controlled_domain.cc |
diff --git a/net/base/registry_controlled_domains/registry_controlled_domain.cc b/net/base/registry_controlled_domains/registry_controlled_domain.cc |
index 3777582812bb7103d4062d13a22a53ad342aac8e..5b3cf603d34ac386f7d815ee6991a03bd3267371 100644 |
--- a/net/base/registry_controlled_domains/registry_controlled_domain.cc |
+++ b/net/base/registry_controlled_domains/registry_controlled_domain.cc |
@@ -75,87 +75,126 @@ struct MappedHostComponent { |
size_t canonical_end; |
}; |
+// This function follows the specification at https://publicsuffix.org/list/. |
+// |host| is assumed to be canonicalized. |
size_t GetRegistryLengthImpl(base::StringPiece host, |
UnknownRegistryFilter unknown_filter, |
PrivateRegistryFilter private_filter) { |
if (host.empty()) |
return std::string::npos; |
- // Skip leading dots. |
- const size_t host_check_begin = host.find_first_not_of('.'); |
- if (host_check_begin == std::string::npos) |
+ // A single trailing dot (which fully qualifies the domain name) isn't |
+ // relevant in this determination, but does need to be included in the final |
+ // returned length. Multiple trailing dots are disallowed. |
+ const size_t host_check_rbegin = host.find_last_not_of('.'); |
+ if (host_check_rbegin == std::string::npos) |
return 0; // Host is only dots. |
+ if (host_check_rbegin < host.length() - 2) |
+ return 0; // Host has more than one trailing dot. |
+ |
+ // Skip leading dots (|host| has a non-dot character, so the search succeeds). |
+ // With no leading dots this wraps to npos, the reverse loop's end sentinel. |
+ const size_t host_check_rend = host.find_first_not_of('.') - 1; |
+ |
+ // "If no rules match, the prevailing rule is '*'" |
+ // -- publicsuffix.org, Algorithm 2 |
+ // |
+ // The default-wildcard behavior works by starting in a wildcard state. Omit |
+ // this step if the caller passes in EXCLUDE_UNKNOWN_REGISTRIES. |
+ size_t in_wildcard = (unknown_filter == INCLUDE_UNKNOWN_REGISTRIES); |
Ryan Sleevi
2017/03/06 17:20:04
Based on your usage (155/162/172), this should be
|
+ |
+ // "Match domain against all rules and take note of the matching ones." |
+ // -- publicsuffix.org, Algorithm 1 |
+ // |
+ // Feed |host| to the |suffix_lookup| DAFSA in reverse character order, |
+ // checking for rule matches at each label boundary. |
+ size_t prevailing_rule_pos = host.length(); |
+ FixedSetIncrementalLookup suffix_lookup(g_graph, g_graph_length); |
+ for (size_t pos = host_check_rbegin; pos != host_check_rend; --pos) { |
+ // Feed a character into the DAFSA. |
+ if (!suffix_lookup.Advance(host[pos]) && !in_wildcard) { |
+ // The DAFSA is exhausted, and there's no active wildcard rule, so it's |
+ // possible to stop early. |
+ break; |
+ } |
- // A single trailing dot isn't relevant in this determination, but does need |
- // to be included in the final returned length. |
- size_t host_check_len = host.length(); |
- if (host[host_check_len - 1] == '.') { |
- --host_check_len; |
- DCHECK(host_check_len > 0); // If this weren't true, the host would be ".", |
- // and we'd have already returned above. |
- if (host[host_check_len - 1] == '.') |
- return 0; // Multiple trailing dots. |
- } |
- |
- // Walk up the domain tree, most specific to least specific, |
- // looking for matches at each level. |
- size_t prev_start = std::string::npos; |
- size_t curr_start = host_check_begin; |
- size_t next_dot = host.find('.', curr_start); |
- if (next_dot >= host_check_len) // Catches std::string::npos as well. |
- return 0; // This can't have a registry + domain. |
- while (1) { |
- const char* domain_str = host.data() + curr_start; |
- size_t domain_length = host_check_len - curr_start; |
- int type = LookupStringInFixedSet(g_graph, g_graph_length, domain_str, |
- domain_length); |
- bool do_check = type != kDafsaNotFound && |
- (!(type & kDafsaPrivateRule) || |
- private_filter == INCLUDE_PRIVATE_REGISTRIES); |
- |
- // If the apparent match is a private registry and we're not including |
- // those, it can't be an actual match. |
- if (do_check) { |
- // Exception rules override wildcard rules when the domain is an exact |
- // match, but wildcards take precedence when there's a subdomain. |
- if (type & kDafsaWildcardRule && (prev_start != std::string::npos)) { |
- // If prev_start == host_check_begin, then the host is the registry |
- // itself, so return 0. |
- return (prev_start == host_check_begin) ? 0 |
- : (host.length() - prev_start); |
- } |
+ // At label boundaries, check the return value of the DAFSA state. This |
+ // indicates whether there is a matching rule in the public suffix list. |
+ bool is_last_char_in_label = |
+ ((pos - 1) == host_check_rend || host[pos - 1] == '.'); |
+ if (is_last_char_in_label) { |
Ryan Sleevi
2017/03/06 17:20:04
Does it make more sense to do
if (!is_last_char_i
|
+ int dafsa_result = suffix_lookup.GetResultForCurrentSequence(); |
+ if (dafsa_result != kDafsaNotFound && |
+ ((dafsa_result & kDafsaPrivateRule) == 0 || |
+ private_filter == INCLUDE_PRIVATE_REGISTRIES)) { |
+ if (dafsa_result & kDafsaExceptionRule) { |
+ // "If more than one rule matches, the prevailing rule is the one |
+ // which is an exception rule." -- publicsuffix.org, Algorithm 3 |
+ // |
+ // There can be at most one exception rule match for a given string. |
+ // Thus, the first matching exception rule always wins. |
+ size_t previous_dot = host.find('.', pos); |
+ if (previous_dot == std::string::npos) { |
+ // Getting here implies an exception rule with no dots (e.g. |
+ // "!foo"). But exception rules are only allowed when there is a |
+ // corresponding wildcard rule. The corresponding wildcard rule for |
+ // this case would have to be '*', which is explicitly disallowed. |
Ryan Sleevi
2017/03/06 17:20:04
Maybe it's because I haven't had my morning coffee
|
+ NOTREACHED() << "Invalid exception rule"; |
+ return 0; |
+ } |
+ DCHECK(in_wildcard); |
+ |
+ // "If the prevailing rule is a exception rule, modify it by removing |
+ // the leftmost label." -- publicsuffix.org, Algorithm 5 |
+ return host.length() - previous_dot - 1; |
+ } |
- if (type & kDafsaExceptionRule) { |
- if (next_dot == std::string::npos) { |
- // If we get here, we had an exception rule with no dots (e.g. |
- // "!foo"). This would only be valid if we had a corresponding |
- // wildcard rule, which would have to be "*". But we explicitly |
- // disallow that case, so this kind of rule is invalid. |
- NOTREACHED() << "Invalid exception rule"; |
- return 0; |
+ if (dafsa_result & kDafsaWildcardRule) { |
+ // When a wildcard rule is encountered, any sequence of characters in |
+ // the next label will be treated as a match. |
+ in_wildcard = true; |
+ } else { |
+ // "If there is no matching exception rule, the prevailing rule is the |
+ // one with the most labels." -- publicsuffix.org, Algorithm 4 |
+ // |
+ // Because of the loop structure, the currently-matched rule must have |
+ // more labels than any previous match, so the match at |pos| wins. |
+ in_wildcard = false; |
+ prevailing_rule_pos = pos; |
} |
- return host.length() - next_dot - 1; |
+ } else if (in_wildcard) { |
+ // The wildcard rule encountered at the end of the previous label |
+ // matches the hostname up to |pos|. This becomes the prevailing match. |
+ // |
+ // TODO(nick): Currently an empty label may match a wildcard rule. This |
+ // would yield a match on the rule "*.b.c" for the malformed host |
+ // "a..b.c". This is unnecessarily permissive. |
Ryan Sleevi
2017/03/06 17:20:04
Can you clarify the 'currently' - do you mean in t
|
+ in_wildcard = false; |
+ prevailing_rule_pos = pos; |
} |
- |
- // If curr_start == host_check_begin, then the host is the registry |
- // itself, so return 0. |
- return (curr_start == host_check_begin) ? 0 |
- : (host.length() - curr_start); |
} |
- |
- if (next_dot >= host_check_len) // Catches std::string::npos as well. |
- break; |
- |
- prev_start = curr_start; |
- curr_start = next_dot + 1; |
- next_dot = host.find('.', curr_start); |
} |
- // No rule found in the registry. curr_start now points to the first |
- // character of the last subcomponent of the host, so if we allow unknown |
- // registries, return the length of this subcomponent. |
- return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ? |
- (host.length() - curr_start) : 0; |
+ // "The registered or registrable domain is the public suffix plus one |
+ // additional label." -- publicsuffix.org, Algorithm 7 |
+ // |
+ // If the hostname has no labels beyond its public suffix, fail. |
+ if ((prevailing_rule_pos - 1) == host_check_rend) |
+ return 0; |
+ |
+ // If the end of the string is reached with an active wildcard rule, fail. |
+ // This corresponds to https://crbug.com/459802, 'the platform.sh problem', |
+ // asking: What is the public suffix of "y.z" when the rule set is ["*.y.z", |
+ // "z"]? Removing the check below would cause the behavior to match the |
+ // algorithm documented at publicsuffix.org, which treats "z" as the |
+ // prevailing rule in this case. |
+ if (in_wildcard) |
+ return 0; |
+ |
+ // "The public suffix is the set of labels from the domain which match the |
+ // labels of the prevailing rule, using the matching algorithm above." |
+ // -- publicsuffix.org, Algorithm 6 |
+ return host.length() - prevailing_rule_pos; |
} |
base::StringPiece GetDomainAndRegistryImpl( |
@@ -177,8 +216,8 @@ base::StringPiece GetDomainAndRegistryImpl( |
return base::StringPiece(); |
} |
- // Move past the dot preceding the registry, and search for the next previous |
- // dot. Return the host from after that dot, or the whole host when there is |
+ // Move past the dot preceding the registry, and search for the dot before |
+ // that. Return the host from after that dot, or the whole host when there is |
// no dot. |
const size_t dot = host.rfind('.', host.length() - registry_length - 2); |
if (dot == std::string::npos) |
@@ -259,9 +298,8 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host, |
// Find which host component the result started in. |
size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len; |
for (const auto& mapping : components) { |
- // In the common case, GetRegistryLengthImpl will identify the beginning |
- // of a component and we can just return where that component was in the |
- // original string. |
+ // In the common case, GetRegistryLengthImpl will identify the beginning of |
+ // a component. Just return where that component was in the original string. |
if (canonical_rcd_begin == mapping.canonical_begin) |
return host.length() - mapping.original_begin; |
@@ -274,12 +312,12 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host, |
// character that was canonicalized to a dot. |
// |
// Brute-force search from the end by repeatedly canonicalizing longer |
- // substrings until we get a match for the canonicalized version. This |
+ // substrings, until finding a match for the canonicalized version. This |
// can't be done with binary search because canonicalization might increase |
// or decrease the length of the produced string depending on where it's |
// split. This depends on the canonicalization process not changing the |
- // order of the characters. Punycode can change the order of characters, |
- // but it doesn't work across dots so this is safe. |
+ // order of the characters. Punycode can change the order of characters, but |
+ // it doesn't work across dots so this is safe. |
// Expected canonical registry controlled domain. |
base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin], |