Chromium Code Reviews| Index: net/base/registry_controlled_domains/registry_controlled_domain.cc |
| diff --git a/net/base/registry_controlled_domains/registry_controlled_domain.cc b/net/base/registry_controlled_domains/registry_controlled_domain.cc |
| index 3777582812bb7103d4062d13a22a53ad342aac8e..5b3cf603d34ac386f7d815ee6991a03bd3267371 100644 |
| --- a/net/base/registry_controlled_domains/registry_controlled_domain.cc |
| +++ b/net/base/registry_controlled_domains/registry_controlled_domain.cc |
| @@ -75,87 +75,126 @@ struct MappedHostComponent { |
| size_t canonical_end; |
| }; |
| +// This function follows the specification at https://publicsuffix.org/list/. |
| +// |host| is assumed to be canonicalized. |
| size_t GetRegistryLengthImpl(base::StringPiece host, |
| UnknownRegistryFilter unknown_filter, |
| PrivateRegistryFilter private_filter) { |
| if (host.empty()) |
| return std::string::npos; |
| - // Skip leading dots. |
| - const size_t host_check_begin = host.find_first_not_of('.'); |
| - if (host_check_begin == std::string::npos) |
| + // A single trailing dot (which fully qualifies the domain name) isn't |
| + // relevant in this determination, but does need to be included in the final |
| + // returned length. Multiple trailing dots are disallowed. |
| + const size_t host_check_rbegin = host.find_last_not_of('.'); |
| + if (host_check_rbegin == std::string::npos) |
| return 0; // Host is only dots. |
| + if (host_check_rbegin < host.length() - 2) |
| + return 0; // Host has more than one trailing dot. |
| + |
| + // Skip any number of leading dots. |host| is known to have at least one non- |
| + // dot character, so find_first_not_of() always succeeds here. |
| + const size_t host_check_rend = host.find_first_not_of('.') - 1; |
| + |
| + // "If no rules match, the prevailing rule is '*'" |
| + // -- publicsuffix.org, Algorithm 2 |
| + // |
| + // The default-wildcard behavior works by starting in a wildcard state. Omit |
| + // this step if the caller passes in EXCLUDE_UNKNOWN_REGISTRIES. |
| + size_t in_wildcard = (unknown_filter == INCLUDE_UNKNOWN_REGISTRIES); |
|
Ryan Sleevi
2017/03/06 17:20:04
Based on your usage (155/162/172), this should be
|
| + |
| + // "Match domain against all rules and take note of the matching ones." |
| + // -- publicsuffix.org, Algorithm 1 |
| + // |
| + // Feed |host| to the |suffix_lookup| DAFSA in reverse character order, |
| + // checking for rule matches at each label boundary. |
| + size_t prevailing_rule_pos = host.length(); |
| + FixedSetIncrementalLookup suffix_lookup(g_graph, g_graph_length); |
| + for (size_t pos = host_check_rbegin; pos != host_check_rend; --pos) { |
| + // Feed a character into the DAFSA. |
| + if (!suffix_lookup.Advance(host[pos]) && !in_wildcard) { |
| + // The DAFSA is exhausted, and there's no active wildcard rule, so it's |
| + // possible to stop early. |
| + break; |
| + } |
| - // A single trailing dot isn't relevant in this determination, but does need |
| - // to be included in the final returned length. |
| - size_t host_check_len = host.length(); |
| - if (host[host_check_len - 1] == '.') { |
| - --host_check_len; |
| - DCHECK(host_check_len > 0); // If this weren't true, the host would be ".", |
| - // and we'd have already returned above. |
| - if (host[host_check_len - 1] == '.') |
| - return 0; // Multiple trailing dots. |
| - } |
| - |
| - // Walk up the domain tree, most specific to least specific, |
| - // looking for matches at each level. |
| - size_t prev_start = std::string::npos; |
| - size_t curr_start = host_check_begin; |
| - size_t next_dot = host.find('.', curr_start); |
| - if (next_dot >= host_check_len) // Catches std::string::npos as well. |
| - return 0; // This can't have a registry + domain. |
| - while (1) { |
| - const char* domain_str = host.data() + curr_start; |
| - size_t domain_length = host_check_len - curr_start; |
| - int type = LookupStringInFixedSet(g_graph, g_graph_length, domain_str, |
| - domain_length); |
| - bool do_check = type != kDafsaNotFound && |
| - (!(type & kDafsaPrivateRule) || |
| - private_filter == INCLUDE_PRIVATE_REGISTRIES); |
| - |
| - // If the apparent match is a private registry and we're not including |
| - // those, it can't be an actual match. |
| - if (do_check) { |
| - // Exception rules override wildcard rules when the domain is an exact |
| - // match, but wildcards take precedence when there's a subdomain. |
| - if (type & kDafsaWildcardRule && (prev_start != std::string::npos)) { |
| - // If prev_start == host_check_begin, then the host is the registry |
| - // itself, so return 0. |
| - return (prev_start == host_check_begin) ? 0 |
| - : (host.length() - prev_start); |
| - } |
| + // At label boundaries, check the return value of the DAFSA state. This |
| + // indicates whether there is a matching rule in the public suffix list. |
| + bool is_last_char_in_label = |
| + ((pos - 1) == host_check_rend || host[pos - 1] == '.'); |
| + if (is_last_char_in_label) { |
|
Ryan Sleevi
2017/03/06 17:20:04
Does it make more sense to do
if (!is_last_char_i
|
| + int dafsa_result = suffix_lookup.GetResultForCurrentSequence(); |
| + if (dafsa_result != kDafsaNotFound && |
| + ((dafsa_result & kDafsaPrivateRule) == 0 || |
| + private_filter == INCLUDE_PRIVATE_REGISTRIES)) { |
| + if (dafsa_result & kDafsaExceptionRule) { |
| + // "If more than one rule matches, the prevailing rule is the one |
| + // which is an exception rule." -- publicsuffix.org, Algorithm 3 |
| + // |
| + // There can only be at most one exception rule match for a given |
| + // string. Thus, the first matching exception rule always wins. |
| + size_t previous_dot = host.find('.', pos); |
| + if (previous_dot == std::string::npos) { |
| + // Getting here implies an exception rule with no dots (e.g. |
| + // "!foo"). But exception rules are only allowed when there is a |
| + // corresponding wildcard rule. The corresponding wildcard rule for |
| + // this case would have to be '*', which is explicitly disallowed. |
|
Ryan Sleevi
2017/03/06 17:20:04
Maybe it's because I haven't had my morning coffee
|
| + NOTREACHED() << "Invalid exception rule"; |
| + return 0; |
| + } |
| + DCHECK(in_wildcard); |
| + |
| + // "If the prevailing rule is a exception rule, modify it by removing |
| + // the leftmost label." -- publicsuffix.org, Algorithm 5 |
| + return host.length() - previous_dot - 1; |
| + } |
| - if (type & kDafsaExceptionRule) { |
| - if (next_dot == std::string::npos) { |
| - // If we get here, we had an exception rule with no dots (e.g. |
| - // "!foo"). This would only be valid if we had a corresponding |
| - // wildcard rule, which would have to be "*". But we explicitly |
| - // disallow that case, so this kind of rule is invalid. |
| - NOTREACHED() << "Invalid exception rule"; |
| - return 0; |
| + if (dafsa_result & kDafsaWildcardRule) { |
| + // When a wildcard rule is encountered, any sequence of characters in |
| + // the next label will be treated as a match. |
| + in_wildcard = true; |
| + } else { |
| + // "If there is no matching exception rule, the prevailing rule is the |
| + // one with the most labels." -- publicsuffix.org, Algorithm 4 |
| + // |
| + // Because of the loop structure, the currently-matched rule must have |
| + // more labels than any previous match, so the match at |pos| wins. |
| + in_wildcard = false; |
| + prevailing_rule_pos = pos; |
| } |
| - return host.length() - next_dot - 1; |
| + } else if (in_wildcard) { |
| + // The wildcard rule encountered at the end of the previous label |
| + // matches the hostname up to |pos|. This becomes the prevailing match. |
| + // |
| + // TODO(nick): Currently an empty label may match a wildcard rule. This |
| + // would yield a match on the rule "*.b.c" for the malformed host |
| + // "a..b.c". This is unnecessarily permissive. |
|
Ryan Sleevi
2017/03/06 17:20:04
Can you clarify the 'currently' - do you mean in t
|
| + in_wildcard = false; |
| + prevailing_rule_pos = pos; |
| } |
| - |
| - // If curr_start == host_check_begin, then the host is the registry |
| - // itself, so return 0. |
| - return (curr_start == host_check_begin) ? 0 |
| - : (host.length() - curr_start); |
| } |
| - |
| - if (next_dot >= host_check_len) // Catches std::string::npos as well. |
| - break; |
| - |
| - prev_start = curr_start; |
| - curr_start = next_dot + 1; |
| - next_dot = host.find('.', curr_start); |
| } |
| - // No rule found in the registry. curr_start now points to the first |
| - // character of the last subcomponent of the host, so if we allow unknown |
| - // registries, return the length of this subcomponent. |
| - return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ? |
| - (host.length() - curr_start) : 0; |
| + // "The registered or registrable domain is the public suffix plus one |
| + // additional label." -- publicsuffix.org, Algorithm 7 |
| + // |
| + // If the hostname has no labels beyond its public suffix, fail. |
| + if ((prevailing_rule_pos - 1) == host_check_rend) |
| + return 0; |
| + |
| + // If the end of the string is reached with an active wildcard rule, fail. |
| + // This corresponds to https://crbug.com/459802, 'the platform.sh problem', |
| + // asking: What is the public suffix of "y.z" when the rule set is ["*.y.z", |
| + // "z"]. Removing the next line would cause the behavior to match the |
| + // algorithm documented at publicsuffix.org, which treats "z" as the |
| + // prevailing rule in this case. |
| + if (in_wildcard) |
| + return 0; |
| + |
| + // "The public suffix is the set of labels from the domain which match the |
| + // labels of the prevailing rule, using the matching algorithm above." |
| + // -- publicsuffix.org, Algorithm 6 |
| + return host.length() - prevailing_rule_pos; |
| } |
| base::StringPiece GetDomainAndRegistryImpl( |
| @@ -177,8 +216,8 @@ base::StringPiece GetDomainAndRegistryImpl( |
| return base::StringPiece(); |
| } |
| - // Move past the dot preceding the registry, and search for the next previous |
| - // dot. Return the host from after that dot, or the whole host when there is |
| + // Move past the dot preceding the registry, and search for the dot before |
| + // that. Return the host from after that dot, or the whole host when there is |
| // no dot. |
| const size_t dot = host.rfind('.', host.length() - registry_length - 2); |
| if (dot == std::string::npos) |
| @@ -259,9 +298,8 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host, |
| // Find which host component the result started in. |
| size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len; |
| for (const auto& mapping : components) { |
| - // In the common case, GetRegistryLengthImpl will identify the beginning |
| - // of a component and we can just return where that component was in the |
| - // original string. |
| + // In the common case, GetRegistryLengthImpl will identify the beginning of |
| + // a component. Just return where that component was in the original string. |
| if (canonical_rcd_begin == mapping.canonical_begin) |
| return host.length() - mapping.original_begin; |
| @@ -274,12 +312,12 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host, |
| // character that was canonicalized to a dot. |
| // |
| // Brute-force search from the end by repeatedly canonicalizing longer |
| - // substrings until we get a match for the canonicalized version. This |
| + // substrings, until finding a match for the canonicalized version. This |
| // can't be done with binary search because canonicalization might increase |
| // or decrease the length of the produced string depending on where it's |
| // split. This depends on the canonicalization process not changing the |
| - // order of the characters. Punycode can change the order of characters, |
| - // but it doesn't work across dots so this is safe. |
| + // order of the characters. Punycode can change the order of characters, but |
| + // it doesn't work across dots so this is safe. |
| // Expected canonical registry controlled domain. |
| base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin], |