Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(524)

Unified Diff: net/base/registry_controlled_domains/registry_controlled_domain.cc

Issue 2649033004: [3 of 4] Speedup GetRegistryLengthImpl() by seeding the DAFSA in reverse
Patch Set: Language. Created 3 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: net/base/registry_controlled_domains/registry_controlled_domain.cc
diff --git a/net/base/registry_controlled_domains/registry_controlled_domain.cc b/net/base/registry_controlled_domains/registry_controlled_domain.cc
index 3777582812bb7103d4062d13a22a53ad342aac8e..5b3cf603d34ac386f7d815ee6991a03bd3267371 100644
--- a/net/base/registry_controlled_domains/registry_controlled_domain.cc
+++ b/net/base/registry_controlled_domains/registry_controlled_domain.cc
@@ -75,87 +75,126 @@ struct MappedHostComponent {
size_t canonical_end;
};
+// This function follows the specification at https://publicsuffix.org/list/.
+// |host| is assumed to be canonicalized.
size_t GetRegistryLengthImpl(base::StringPiece host,
UnknownRegistryFilter unknown_filter,
PrivateRegistryFilter private_filter) {
if (host.empty())
return std::string::npos;
- // Skip leading dots.
- const size_t host_check_begin = host.find_first_not_of('.');
- if (host_check_begin == std::string::npos)
+ // A single trailing dot (which fully qualifies the domain name) isn't
+ // relevant in this determination, but does need to be included in the final
+ // returned length. Multiple trailing dots are disallowed.
+ const size_t host_check_rbegin = host.find_last_not_of('.');
+ if (host_check_rbegin == std::string::npos)
return 0; // Host is only dots.
+ if (host_check_rbegin < host.length() - 2)
+ return 0; // Host has more than one trailing dot.
+
+ // Skip any number of leading dots. |host| is known to have at least one non-
+ // dot character, so find_first_not_of() always succeeds here.
+ const size_t host_check_rend = host.find_first_not_of('.') - 1;
+
+ // "If no rules match, the prevailing rule is '*'"
+ // -- publicsuffix.org, Algorithm 2
+ //
+ // The default-wildcard behavior works by starting in a wildcard state. Omit
+ // this step if the caller passes in EXCLUDE_UNKNOWN_REGISTRIES.
+ size_t in_wildcard = (unknown_filter == INCLUDE_UNKNOWN_REGISTRIES);
Ryan Sleevi 2017/03/06 17:20:04 Based on your usage (155/162/172), this should be
+
+ // "Match domain against all rules and take note of the matching ones."
+ // -- publicsuffix.org, Algorithm 1
+ //
+ // Feed |host| to the |suffix_lookup| DAFSA in reverse character order,
+ // checking for rule matches at each label boundary.
+ size_t prevailing_rule_pos = host.length();
+ FixedSetIncrementalLookup suffix_lookup(g_graph, g_graph_length);
+ for (size_t pos = host_check_rbegin; pos != host_check_rend; --pos) {
+ // Feed a character into the DAFSA.
+ if (!suffix_lookup.Advance(host[pos]) && !in_wildcard) {
+ // The DAFSA is exhausted, and there's no active wildcard rule, so it's
+ // possible to stop early.
+ break;
+ }
- // A single trailing dot isn't relevant in this determination, but does need
- // to be included in the final returned length.
- size_t host_check_len = host.length();
- if (host[host_check_len - 1] == '.') {
- --host_check_len;
- DCHECK(host_check_len > 0); // If this weren't true, the host would be ".",
- // and we'd have already returned above.
- if (host[host_check_len - 1] == '.')
- return 0; // Multiple trailing dots.
- }
-
- // Walk up the domain tree, most specific to least specific,
- // looking for matches at each level.
- size_t prev_start = std::string::npos;
- size_t curr_start = host_check_begin;
- size_t next_dot = host.find('.', curr_start);
- if (next_dot >= host_check_len) // Catches std::string::npos as well.
- return 0; // This can't have a registry + domain.
- while (1) {
- const char* domain_str = host.data() + curr_start;
- size_t domain_length = host_check_len - curr_start;
- int type = LookupStringInFixedSet(g_graph, g_graph_length, domain_str,
- domain_length);
- bool do_check = type != kDafsaNotFound &&
- (!(type & kDafsaPrivateRule) ||
- private_filter == INCLUDE_PRIVATE_REGISTRIES);
-
- // If the apparent match is a private registry and we're not including
- // those, it can't be an actual match.
- if (do_check) {
- // Exception rules override wildcard rules when the domain is an exact
- // match, but wildcards take precedence when there's a subdomain.
- if (type & kDafsaWildcardRule && (prev_start != std::string::npos)) {
- // If prev_start == host_check_begin, then the host is the registry
- // itself, so return 0.
- return (prev_start == host_check_begin) ? 0
- : (host.length() - prev_start);
- }
+ // At label boundaries, check the return value of the DAFSA state. This
+ // indicates whether there is a matching rule in the public suffix list.
+ bool is_last_char_in_label =
+ ((pos - 1) == host_check_rend || host[pos - 1] == '.');
+ if (is_last_char_in_label) {
Ryan Sleevi 2017/03/06 17:20:04 Does it make more sense to do if (!is_last_char_i
+ int dafsa_result = suffix_lookup.GetResultForCurrentSequence();
+ if (dafsa_result != kDafsaNotFound &&
+ ((dafsa_result & kDafsaPrivateRule) == 0 ||
+ private_filter == INCLUDE_PRIVATE_REGISTRIES)) {
+ if (dafsa_result & kDafsaExceptionRule) {
+ // "If more than one rule matches, the prevailing rule is the one
+ // which is an exception rule." -- publicsuffix.org, Algorithm 3
+ //
+ // There can only be at most one exception rule match for a given
+ // string. Thus, the first matching exception rule always wins.
+ size_t previous_dot = host.find('.', pos);
+ if (previous_dot == std::string::npos) {
+ // Getting here implies an exception rule with no dots (e.g.
+ // "!foo"). But exception rules are only allowed when there is a
+ // corresponding wildcard rule. The corresponding wildcard rule for
+ // this case would have to be '*', which is explicitly disallowed.
Ryan Sleevi 2017/03/06 17:20:04 Maybe it's because I haven't had my morning coffee
+ NOTREACHED() << "Invalid exception rule";
+ return 0;
+ }
+ DCHECK(in_wildcard);
+
+ // "If the prevailing rule is a exception rule, modify it by removing
+ // the leftmost label." -- publicsuffix.org, Algorithm 5
+ return host.length() - previous_dot - 1;
+ }
- if (type & kDafsaExceptionRule) {
- if (next_dot == std::string::npos) {
- // If we get here, we had an exception rule with no dots (e.g.
- // "!foo"). This would only be valid if we had a corresponding
- // wildcard rule, which would have to be "*". But we explicitly
- // disallow that case, so this kind of rule is invalid.
- NOTREACHED() << "Invalid exception rule";
- return 0;
+ if (dafsa_result & kDafsaWildcardRule) {
+ // When a wildcard rule is encountered, any sequence of characters in
+ // the next label will be treated as a match.
+ in_wildcard = true;
+ } else {
+ // "If there is no matching exception rule, the prevailing rule is the
+ // one with the most labels." -- publicsuffix.org, Algorithm 4
+ //
+ // Because of the loop structure, the currently-matched rule must have
+ // more labels than any previous match, so the match at |pos| wins.
+ in_wildcard = false;
+ prevailing_rule_pos = pos;
}
- return host.length() - next_dot - 1;
+ } else if (in_wildcard) {
+ // The wildcard rule encountered at the end of the previous label
+ // matches the hostname up to |pos|. This becomes the prevailing match.
+ //
+ // TODO(nick): Currently an empty label may match a wildcard rule. This
+ // would yield a match on the rule "*.b.c" for the malformed host
+ // "a..b.c". This is unnecessarily permissive.
Ryan Sleevi 2017/03/06 17:20:04 Can you clarify the 'currently' - do you mean in t
+ in_wildcard = false;
+ prevailing_rule_pos = pos;
}
-
- // If curr_start == host_check_begin, then the host is the registry
- // itself, so return 0.
- return (curr_start == host_check_begin) ? 0
- : (host.length() - curr_start);
}
-
- if (next_dot >= host_check_len) // Catches std::string::npos as well.
- break;
-
- prev_start = curr_start;
- curr_start = next_dot + 1;
- next_dot = host.find('.', curr_start);
}
- // No rule found in the registry. curr_start now points to the first
- // character of the last subcomponent of the host, so if we allow unknown
- // registries, return the length of this subcomponent.
- return unknown_filter == INCLUDE_UNKNOWN_REGISTRIES ?
- (host.length() - curr_start) : 0;
+ // "The registered or registrable domain is the public suffix plus one
+ // additional label." -- publicsuffix.org, Algorithm 7
+ //
+ // If the hostname has no labels beyond its public suffix, fail.
+ if ((prevailing_rule_pos - 1) == host_check_rend)
+ return 0;
+
+ // If the end of the string is reached with an active wildcard rule, fail.
+ // This corresponds to https://crbug.com/459802, 'the platform.sh problem',
+ // asking: What is the public suffix of "y.z" when the rule set is ["*.y.z",
+ // "z"]. Removing the next line would cause the behavior to match the
+ // algorithm documented at publicsuffix.org, which treats "z" as the
+ // prevailing rule in this case.
+ if (in_wildcard)
+ return 0;
+
+ // "The public suffix is the set of labels from the domain which match the
+ // labels of the prevailing rule, using the matching algorithm above."
+ // -- publicsuffix.org, Algorithm 6
+ return host.length() - prevailing_rule_pos;
}
base::StringPiece GetDomainAndRegistryImpl(
@@ -177,8 +216,8 @@ base::StringPiece GetDomainAndRegistryImpl(
return base::StringPiece();
}
- // Move past the dot preceding the registry, and search for the next previous
- // dot. Return the host from after that dot, or the whole host when there is
+ // Move past the dot preceding the registry, and search for the dot before
+ // that. Return the host from after that dot, or the whole host when there is
// no dot.
const size_t dot = host.rfind('.', host.length() - registry_length - 2);
if (dot == std::string::npos)
@@ -259,9 +298,8 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host,
// Find which host component the result started in.
size_t canonical_rcd_begin = canonical_host.length() - canonical_rcd_len;
for (const auto& mapping : components) {
- // In the common case, GetRegistryLengthImpl will identify the beginning
- // of a component and we can just return where that component was in the
- // original string.
+ // In the common case, GetRegistryLengthImpl will identify the beginning of
+ // a component. Just return where that component was in the original string.
if (canonical_rcd_begin == mapping.canonical_begin)
return host.length() - mapping.original_begin;
@@ -274,12 +312,12 @@ size_t DoPermissiveGetHostRegistryLength(base::BasicStringPiece<Str> host,
// character that was canonicalized to a dot.
//
// Brute-force search from the end by repeatedly canonicalizing longer
- // substrings until we get a match for the canonicalized version. This
+ // substrings, until finding a match for the canonicalized version. This
// can't be done with binary search because canonicalization might increase
// or decrease the length of the produced string depending on where it's
// split. This depends on the canonicalization process not changing the
- // order of the characters. Punycode can change the order of characters,
- // but it doesn't work across dots so this is safe.
+ // order of the characters. Punycode can change the order of characters, but
+ // it doesn't work across dots so this is safe.
// Expected canonical registry controlled domain.
base::StringPiece canonical_rcd(&canonical_host[canonical_rcd_begin],

Powered by Google App Engine
This is Rietveld 408576698