Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(966)

Unified Diff: components/subresource_filter/core/common/indexed_ruleset.cc

Issue 2816693002: [IndexedRuleset] Improve worst-case domain list matching. (Closed)
Patch Set: Add TODO. Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/subresource_filter/core/common/indexed_ruleset.cc
diff --git a/components/subresource_filter/core/common/indexed_ruleset.cc b/components/subresource_filter/core/common/indexed_ruleset.cc
index 73cfd59f944b69d63e2b2a515b2f2ee3385dc88e..3a2e3e01626b6da4370e7a6a745b064aebaa0b58 100644
--- a/components/subresource_filter/core/common/indexed_ruleset.cc
+++ b/components/subresource_filter/core/common/indexed_ruleset.cc
@@ -10,6 +10,7 @@
#include "base/logging.h"
#include "base/numerics/safe_conversions.h"
+#include "base/strings/string_util.h"
#include "components/subresource_filter/core/common/first_party_origin.h"
#include "components/subresource_filter/core/common/ngram_extractor.h"
#include "components/subresource_filter/core/common/url_pattern.h"
@@ -20,6 +21,26 @@ namespace subresource_filter {
namespace {
using FlatStringOffset = flatbuffers::Offset<flatbuffers::String>;
+using FlatDomains = flatbuffers::Vector<FlatStringOffset>;
+using FlatDomainsOffset = flatbuffers::Offset<FlatDomains>;
+
+base::StringPiece ToStringPiece(const flatbuffers::String* string) {
+ DCHECK(string);
+ return base::StringPiece(string->c_str(), string->size());
+}
+
+// Performs three-way comparison between two domains. In the total order defined
+// by this predicate, the lengths of domains will be monotonically decreasing.
+int CompareDomains(base::StringPiece lhs_domain, base::StringPiece rhs_domain) {
+ if (lhs_domain.size() != rhs_domain.size())
+ return lhs_domain.size() > rhs_domain.size() ? -1 : 1;
+ return lhs_domain.compare(rhs_domain);
+}
+
+bool HasNoUpperAscii(base::StringPiece string) {
+ return std::none_of(string.begin(), string.end(),
+ [](char c) { return base::IsAsciiUpper(c); });
+}
// Checks whether a URL |rule| can be converted to its FlatBuffers equivalent,
// and performs the actual conversion.
@@ -48,29 +69,58 @@ class UrlRuleFlatBufferConverter {
flatbuffers::FlatBufferBuilder* builder) const {
DCHECK(is_convertible());
- flatbuffers::Offset<flatbuffers::Vector<FlatStringOffset>> domains_offset;
+ FlatDomainsOffset domains_included_offset;
+ FlatDomainsOffset domains_excluded_offset;
if (rule_.domains_size()) {
- std::vector<FlatStringOffset> domains;
- domains.reserve(rule_.domains_size());
+ // TODO(pkalinnikov): Consider sharing the vectors between rules.
+ std::vector<FlatStringOffset> domains_included;
+ std::vector<FlatStringOffset> domains_excluded;
+ // Reserve only for |domains_included| because it is expected to be the
+ // one used more frequently.
+ domains_included.reserve(rule_.domains_size());
- std::string domain;
for (const auto& domain_list_item : rule_.domains()) {
- domain.clear();
- domain.reserve(domain_list_item.domain().size() + 1);
+ // Note: The |domain| can have non-ASCII UTF-8 characters, but
+ // ToLowerASCII leaves these intact.
+ // TODO(pkalinnikov): Convert non-ASCII characters to lower case too.
+ // TODO(pkalinnikov): Possibly convert Punycode to IDN here or directly
+ // assume this is done in the proto::UrlRule.
+ const std::string& domain = domain_list_item.domain();
+ auto offset = builder->CreateSharedString(
+ HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));
+
if (domain_list_item.exclude())
- domain += '~';
- domain += domain_list_item.domain();
- domains.push_back(builder->CreateSharedString(domain));
+ domains_excluded.push_back(offset);
+ else
+ domains_included.push_back(offset);
+ }
+
+ // The comparator ensuring the domains order necessary for fast matching.
+ auto precedes = [&builder](FlatStringOffset lhs, FlatStringOffset rhs) {
+ return CompareDomains(ToStringPiece(flatbuffers::GetTemporaryPointer(
+ *builder, lhs)),
+ ToStringPiece(flatbuffers::GetTemporaryPointer(
+ *builder, rhs))) < 0;
+ };
+
+ // The domains are stored in sorted order to support fast matching.
+ if (!domains_included.empty()) {
+ // TODO(pkalinnikov): Don't sort if it is already sorted offline.
+ std::sort(domains_included.begin(), domains_included.end(), precedes);
+ domains_included_offset = builder->CreateVector(domains_included);
+ }
+ if (!domains_excluded.empty()) {
+ std::sort(domains_excluded.begin(), domains_excluded.end(), precedes);
+ domains_excluded_offset = builder->CreateVector(domains_excluded);
}
- domains_offset = builder->CreateVector(domains);
}
auto url_pattern_offset = builder->CreateString(rule_.url_pattern());
- return flat::CreateUrlRule(*builder, options_, element_types_,
- activation_types_, url_pattern_type_,
- anchor_left_, anchor_right_, domains_offset,
- url_pattern_offset);
+ return flat::CreateUrlRule(
+ *builder, options_, element_types_, activation_types_,
+ url_pattern_type_, anchor_left_, anchor_right_, domains_included_offset,
+ domains_excluded_offset, url_pattern_offset);
}
private:
@@ -203,7 +253,7 @@ class UrlRuleFlatBufferConverter {
// RulesetIndexer --------------------------------------------------------------
// static
-const int RulesetIndexer::kIndexedFormatVersion = 16;
+const int RulesetIndexer::kIndexedFormatVersion = 17;
RulesetIndexer::MutableUrlPatternIndex::MutableUrlPatternIndex() = default;
RulesetIndexer::MutableUrlPatternIndex::~MutableUrlPatternIndex() = default;
@@ -313,62 +363,94 @@ using FlatUrlRuleList = flatbuffers::Vector<flatbuffers::Offset<flat::UrlRule>>;
using FlatNGramIndex =
flatbuffers::Vector<flatbuffers::Offset<flat::NGramToRules>>;
-// Returns whether the |origin| matches the list of |domains|. A match means
-// that the longest domain in |domains| that |origin| is a sub-domain of is not
-// an exception OR all the |domains| are exceptions and neither matches the
-// |origin|. Thus, domain filters with more domain components trump filters with
-// fewer domain components, i.e. the more specific a filter is, the higher the
-// priority.
+// Returns the size of the longest (sub-)domain of |origin| matching one of the
+// |domains| in the list.
//
-// A rule whose domain list is empty or contains only negative domains is still
-// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,
-// this function will always return false for such |domains| lists.
-//
-// TODO(pkalinnikov): Make it fast.
-bool DoesOriginMatchDomainList(
- const url::Origin& origin,
- const flatbuffers::Vector<FlatStringOffset>* domains,
- bool disable_generic_rules) {
- if (!domains || !domains->size())
- return !disable_generic_rules;
- // Unique |origin| matches lists of exception domains only.
- if (origin.unique()) {
- for (const flatbuffers::String* domain_filter : *domains) {
- DCHECK_GT(domain_filter->size(), 0u);
- if (domain_filter->Get(0) != '~')
- return false;
+// The |domains| should be sorted in descending order of their length, and
+// ascending alphabetical order within the groups of same-length domains.
+size_t GetLongestMatchingSubdomain(const url::Origin& origin,
+ const FlatDomains& domains) {
+ // If the |domains| list is short, then the simple strategy is usually faster.
+ if (domains.size() <= 5) {
+ for (auto* domain : domains) {
+ const base::StringPiece domain_piece = ToStringPiece(domain);
+ if (origin.DomainIs(domain_piece))
+ return domain_piece.size();
+ }
+ return 0;
+ }
+ // Otherwise look for each subdomain of the |origin| using binary search.
+
+ DCHECK(!origin.unique());
+ base::StringPiece canonicalized_host(origin.host());
+ if (canonicalized_host.empty())
+ return 0;
+
+ // If the host name ends with a dot, then ignore it.
+ if (canonicalized_host.back() == '.')
+ canonicalized_host.remove_suffix(1);
+
+ // The |left| bound of the search is shared between iterations, because
+ // subdomains are considered in decreasing order of their lengths, therefore
+ // each consecutive lower_bound will be at least as far as the previous.
+ flatbuffers::uoffset_t left = 0;
+ for (size_t position = 0;; ++position) {
+ const base::StringPiece subdomain = canonicalized_host.substr(position);
+
+ flatbuffers::uoffset_t right = domains.size();
+ while (left + 1 < right) {
+ auto middle = left + (right - left) / 2;
+ DCHECK_LT(middle, domains.size());
+ if (CompareDomains(ToStringPiece(domains[middle]), subdomain) <= 0)
+ left = middle;
+ else
+ right = middle;
}
- return !disable_generic_rules;
+
+ DCHECK_LT(left, domains.size());
+ if (ToStringPiece(domains[left]) == subdomain)
+ return subdomain.size();
+
+ position = canonicalized_host.find('.', position);
+ if (position == base::StringPiece::npos)
+ break;
}
- size_t max_domain_length = 0;
- bool is_positive = true;
- bool negatives_only = true;
+ return 0;
+}
- for (flatbuffers::uoffset_t i = 0, size = domains->size(); i != size; ++i) {
- const flatbuffers::String* domain_filter = domains->Get(i);
- if (domain_filter->Length() <= max_domain_length)
- continue;
+// Returns whether the |origin| matches the domain list of the |rule|. A match
+// means that the longest domain in |domains| that |origin| is a sub-domain of
+// is not an exception OR all the |domains| are exceptions and neither matches
+// the |origin|. Thus, domain filters with more domain components trump filters
+// with fewer domain components, i.e. the more specific a filter is, the higher
+// the priority.
+//
+// A rule whose domain list is empty or contains only negative domains is still
+// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,
+// this function will always return false for such rules.
+bool DoesOriginMatchDomainList(const url::Origin& origin,
+ const flat::UrlRule& rule,
+ bool disable_generic_rules) {
+ const bool is_generic = !rule.domains_included();
+ DCHECK(is_generic || rule.domains_included()->size());
+ if (disable_generic_rules && is_generic)
+ return false;
- base::StringPiece filter_piece(domain_filter->c_str(),
- domain_filter->Length());
- const bool is_negative = (domain_filter->Get(0) == '~');
- if (is_negative) {
- filter_piece.remove_prefix(1);
- if (filter_piece.length() == max_domain_length)
- continue;
- } else {
- negatives_only = false;
- }
+ // Unique |origin| matches lists of exception domains only.
+ if (origin.unique())
+ return is_generic;
- if (!origin.DomainIs(filter_piece))
- continue;
- max_domain_length = filter_piece.length();
- is_positive = !is_negative;
+ size_t longest_matching_included_domain_length = 1;
+ if (!is_generic) {
+ longest_matching_included_domain_length =
+ GetLongestMatchingSubdomain(origin, *rule.domains_included());
}
-
- return max_domain_length ? is_positive
- : negatives_only && !disable_generic_rules;
+ if (longest_matching_included_domain_length && rule.domains_excluded()) {
+ return GetLongestMatchingSubdomain(origin, *rule.domains_excluded()) <
+ longest_matching_included_domain_length;
+ }
+ return !!longest_matching_included_domain_length;
}
// Returns whether the request matches flags of the specified URL |rule|. Takes
@@ -423,7 +505,7 @@ bool MatchesAny(const FlatUrlRuleList* rules,
if (!UrlPattern(*rule).MatchesUrl(url))
continue;
- if (DoesOriginMatchDomainList(document_origin, rule->domains(),
+ if (DoesOriginMatchDomainList(document_origin, *rule,
disable_generic_rules)) {
return true;
}

Powered by Google App Engine
This is Rietveld 408576698