components/subresource_filter/core/common/indexed_ruleset.cc - Issue 2816693002: [IndexedRuleset] Improve worst-case domain list matching.

Unified Diff: components/subresource_filter/core/common/indexed_ruleset.cc

Issue 2816693002: [IndexedRuleset] Improve worst-case domain list matching. (Closed)

Patch Set: Add TODO. Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

« no previous file with comments | « components/subresource_filter/core/common/flat/rules.fbs ('k') | components/subresource_filter/core/common/indexed_ruleset_unittest.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Hide Comments ('s')

Index: components/subresource_filter/core/common/indexed_ruleset.cc

diff --git a/components/subresource_filter/core/common/indexed_ruleset.cc b/components/subresource_filter/core/common/indexed_ruleset.cc

index 73cfd59f944b69d63e2b2a515b2f2ee3385dc88e..3a2e3e01626b6da4370e7a6a745b064aebaa0b58 100644

--- a/components/subresource_filter/core/common/indexed_ruleset.cc

+++ b/components/subresource_filter/core/common/indexed_ruleset.cc

@@ -10,6 +10,7 @@

#include "base/logging.h"

#include "base/numerics/safe_conversions.h"

+#include "base/strings/string_util.h"

#include "components/subresource_filter/core/common/first_party_origin.h"

#include "components/subresource_filter/core/common/ngram_extractor.h"

#include "components/subresource_filter/core/common/url_pattern.h"

@@ -20,6 +21,26 @@ namespace subresource_filter {

namespace {

using FlatStringOffset = flatbuffers::Offset<flatbuffers::String>;

+using FlatDomains = flatbuffers::Vector<FlatStringOffset>;

+using FlatDomainsOffset = flatbuffers::Offset<FlatDomains>;

+base::StringPiece ToStringPiece(const flatbuffers::String* string) {

+ DCHECK(string);

+ return base::StringPiece(string->c_str(), string->size());

+// Performs three-way comparison between two domains. In the total order defined

+// by this predicate, the lengths of domains will be monotonically decreasing.

+int CompareDomains(base::StringPiece lhs_domain, base::StringPiece rhs_domain) {

+ if (lhs_domain.size() != rhs_domain.size())

+ return lhs_domain.size() > rhs_domain.size() ? -1 : 1;

+ return lhs_domain.compare(rhs_domain);

+bool HasNoUpperAscii(base::StringPiece string) {

+ return std::none_of(string.begin(), string.end(),

+ [](char c) { return base::IsAsciiUpper(c); });

// Checks whether a URL |rule| can be converted to its FlatBuffers equivalent,

// and performs the actual conversion.

@@ -48,29 +69,58 @@ class UrlRuleFlatBufferConverter {

flatbuffers::FlatBufferBuilder* builder) const {

DCHECK(is_convertible());

- flatbuffers::Offset<flatbuffers::Vector<FlatStringOffset>> domains_offset;

+ FlatDomainsOffset domains_included_offset;

+ FlatDomainsOffset domains_excluded_offset;

if (rule_.domains_size()) {

- std::vector<FlatStringOffset> domains;

- domains.reserve(rule_.domains_size());

+ // TODO(pkalinnikov): Consider sharing the vectors between rules.

+ std::vector<FlatStringOffset> domains_included;

+ std::vector<FlatStringOffset> domains_excluded;

+ // Reserve only for |domains_included| because it is expected to be the

+ // one used more frequently.

+ domains_included.reserve(rule_.domains_size());

- std::string domain;

for (const auto& domain_list_item : rule_.domains()) {

- domain.clear();

- domain.reserve(domain_list_item.domain().size() + 1);

+ // Note: The |domain| can have non-ASCII UTF-8 characters, but

+ // ToLowerASCII leaves these intact.

+ // TODO(pkalinnikov): Convert non-ASCII characters to lower case too.

+ // TODO(pkalinnikov): Possibly convert Punycode to IDN here or directly

+ // assume this is done in the proto::UrlRule.

+ const std::string& domain = domain_list_item.domain();

+ auto offset = builder->CreateSharedString(

+ HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));

if (domain_list_item.exclude())

- domain += '~';

- domain += domain_list_item.domain();

- domains.push_back(builder->CreateSharedString(domain));

+ domains_excluded.push_back(offset);

+ else

+ domains_included.push_back(offset);

+ }

+ // The comparator ensuring the domains order necessary for fast matching.

+ auto precedes = [&builder](FlatStringOffset lhs, FlatStringOffset rhs) {

+ return CompareDomains(ToStringPiece(flatbuffers::GetTemporaryPointer(

+ *builder, lhs)),

+ ToStringPiece(flatbuffers::GetTemporaryPointer(

+ *builder, rhs))) < 0;

+ };

+ // The domains are stored in sorted order to support fast matching.

+ if (!domains_included.empty()) {

+ // TODO(pkalinnikov): Don't sort if it is already sorted offline.

+ std::sort(domains_included.begin(), domains_included.end(), precedes);

+ domains_included_offset = builder->CreateVector(domains_included);

+ }

+ if (!domains_excluded.empty()) {

+ std::sort(domains_excluded.begin(), domains_excluded.end(), precedes);

+ domains_excluded_offset = builder->CreateVector(domains_excluded);

}

- domains_offset = builder->CreateVector(domains);

}

auto url_pattern_offset = builder->CreateString(rule_.url_pattern());

- return flat::CreateUrlRule(*builder, options_, element_types_,

- activation_types_, url_pattern_type_,

- anchor_left_, anchor_right_, domains_offset,

- url_pattern_offset);

+ return flat::CreateUrlRule(

+ *builder, options_, element_types_, activation_types_,

+ url_pattern_type_, anchor_left_, anchor_right_, domains_included_offset,

+ domains_excluded_offset, url_pattern_offset);

}

private:

@@ -203,7 +253,7 @@ class UrlRuleFlatBufferConverter {

// RulesetIndexer --------------------------------------------------------------

// static

-const int RulesetIndexer::kIndexedFormatVersion = 16;

+const int RulesetIndexer::kIndexedFormatVersion = 17;

RulesetIndexer::MutableUrlPatternIndex::MutableUrlPatternIndex() = default;

RulesetIndexer::MutableUrlPatternIndex::~MutableUrlPatternIndex() = default;

@@ -313,62 +363,94 @@ using FlatUrlRuleList = flatbuffers::Vector<flatbuffers::Offset<flat::UrlRule>>;

using FlatNGramIndex =

flatbuffers::Vector<flatbuffers::Offset<flat::NGramToRules>>;

-// Returns whether the |origin| matches the list of |domains|. A match means

-// that the longest domain in |domains| that |origin| is a sub-domain of is not

-// an exception OR all the |domains| are exceptions and neither matches the

-// |origin|. Thus, domain filters with more domain components trump filters with

-// fewer domain components, i.e. the more specific a filter is, the higher the

-// priority.

+// Returns the size of the longest (sub-)domain of |origin| matching one of the

+// |domains| in the list.

-// A rule whose domain list is empty or contains only negative domains is still

-// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,

-// this function will always return false for such |domains| lists.

-//

-// TODO(pkalinnikov): Make it fast.

-bool DoesOriginMatchDomainList(

- const url::Origin& origin,

- const flatbuffers::Vector<FlatStringOffset>* domains,

- bool disable_generic_rules) {

- if (!domains || !domains->size())

- return !disable_generic_rules;

- // Unique |origin| matches lists of exception domains only.

- if (origin.unique()) {

- for (const flatbuffers::String* domain_filter : *domains) {

- DCHECK_GT(domain_filter->size(), 0u);

- if (domain_filter->Get(0) != '~')

- return false;

+// The |domains| should be sorted in descending order of their length, and

+// ascending alphabetical order within the groups of same-length domains.

+size_t GetLongestMatchingSubdomain(const url::Origin& origin,

+ const FlatDomains& domains) {

+ // If the |domains| list is short, then the simple strategy is usually faster.

+ if (domains.size() <= 5) {

+ for (auto* domain : domains) {

+ const base::StringPiece domain_piece = ToStringPiece(domain);

+ if (origin.DomainIs(domain_piece))

+ return domain_piece.size();

+ }

+ return 0;

+ }

+ // Otherwise look for each subdomain of the |origin| using binary search.

+ DCHECK(!origin.unique());

+ base::StringPiece canonicalized_host(origin.host());

+ if (canonicalized_host.empty())

+ return 0;

+ // If the host name ends with a dot, then ignore it.

+ if (canonicalized_host.back() == '.')

+ canonicalized_host.remove_suffix(1);

+ // The |left| bound of the search is shared between iterations, because

+ // subdomains are considered in decreasing order of their lengths, therefore

+ // each consecutive lower_bound will be at least as far as the previous.

+ flatbuffers::uoffset_t left = 0;

+ for (size_t position = 0;; ++position) {

+ const base::StringPiece subdomain = canonicalized_host.substr(position);

+ flatbuffers::uoffset_t right = domains.size();

+ while (left + 1 < right) {

+ auto middle = left + (right - left) / 2;

+ DCHECK_LT(middle, domains.size());

+ if (CompareDomains(ToStringPiece(domains[middle]), subdomain) <= 0)

+ left = middle;

+ else

+ right = middle;

}

- return !disable_generic_rules;

+ DCHECK_LT(left, domains.size());

+ if (ToStringPiece(domains[left]) == subdomain)

+ return subdomain.size();

+ position = canonicalized_host.find('.', position);

+ if (position == base::StringPiece::npos)

+ break;

}

- size_t max_domain_length = 0;

- bool is_positive = true;

- bool negatives_only = true;

+ return 0;

- for (flatbuffers::uoffset_t i = 0, size = domains->size(); i != size; ++i) {

- const flatbuffers::String* domain_filter = domains->Get(i);

- if (domain_filter->Length() <= max_domain_length)

- continue;

+// Returns whether the |origin| matches the domain list of the |rule|. A match

+// means that the longest domain in |domains| that |origin| is a sub-domain of

+// is not an exception OR all the |domains| are exceptions and neither matches

+// the |origin|. Thus, domain filters with more domain components trump filters

+// with fewer domain components, i.e. the more specific a filter is, the higher

+// the priority.

+//

+// A rule whose domain list is empty or contains only negative domains is still

+// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,

+// this function will always return false for such rules.

+bool DoesOriginMatchDomainList(const url::Origin& origin,

+ const flat::UrlRule& rule,

+ bool disable_generic_rules) {

+ const bool is_generic = !rule.domains_included();

+ DCHECK(is_generic || rule.domains_included()->size());

+ if (disable_generic_rules && is_generic)

+ return false;

- base::StringPiece filter_piece(domain_filter->c_str(),

- domain_filter->Length());

- const bool is_negative = (domain_filter->Get(0) == '~');

- if (is_negative) {

- filter_piece.remove_prefix(1);

- if (filter_piece.length() == max_domain_length)

- continue;

- } else {

- negatives_only = false;

- }

+ // Unique |origin| matches lists of exception domains only.

+ if (origin.unique())

+ return is_generic;

- if (!origin.DomainIs(filter_piece))

- continue;

- max_domain_length = filter_piece.length();

- is_positive = !is_negative;

+ size_t longest_matching_included_domain_length = 1;

+ if (!is_generic) {

+ longest_matching_included_domain_length =

+ GetLongestMatchingSubdomain(origin, *rule.domains_included());

}

- return max_domain_length ? is_positive

- : negatives_only && !disable_generic_rules;

+ if (longest_matching_included_domain_length && rule.domains_excluded()) {

+ return GetLongestMatchingSubdomain(origin, *rule.domains_excluded()) <

+ longest_matching_included_domain_length;

+ }

+ return !!longest_matching_included_domain_length;

}

// Returns whether the request matches flags of the specified URL |rule|. Takes

@@ -423,7 +505,7 @@ bool MatchesAny(const FlatUrlRuleList* rules,

if (!UrlPattern(*rule).MatchesUrl(url))

continue;

- if (DoesOriginMatchDomainList(document_origin, rule->domains(),

+ if (DoesOriginMatchDomainList(document_origin, *rule,

disable_generic_rules)) {

return true;

}