| Index: components/subresource_filter/core/common/indexed_ruleset.cc
|
| diff --git a/components/subresource_filter/core/common/indexed_ruleset.cc b/components/subresource_filter/core/common/indexed_ruleset.cc
|
| index 73cfd59f944b69d63e2b2a515b2f2ee3385dc88e..3a2e3e01626b6da4370e7a6a745b064aebaa0b58 100644
|
| --- a/components/subresource_filter/core/common/indexed_ruleset.cc
|
| +++ b/components/subresource_filter/core/common/indexed_ruleset.cc
|
| @@ -10,6 +10,7 @@
|
|
|
| #include "base/logging.h"
|
| #include "base/numerics/safe_conversions.h"
|
| +#include "base/strings/string_util.h"
|
| #include "components/subresource_filter/core/common/first_party_origin.h"
|
| #include "components/subresource_filter/core/common/ngram_extractor.h"
|
| #include "components/subresource_filter/core/common/url_pattern.h"
|
| @@ -20,6 +21,26 @@ namespace subresource_filter {
|
| namespace {
|
|
|
| using FlatStringOffset = flatbuffers::Offset<flatbuffers::String>;
|
| +using FlatDomains = flatbuffers::Vector<FlatStringOffset>;
|
| +using FlatDomainsOffset = flatbuffers::Offset<FlatDomains>;
|
| +
|
| +base::StringPiece ToStringPiece(const flatbuffers::String* string) {
|
| + DCHECK(string);
|
| + return base::StringPiece(string->c_str(), string->size());
|
| +}
|
| +
|
| +// Performs three-way comparison between two domains. In the total order defined
|
| +// by this predicate, the lengths of domains will be monotonically decreasing.
|
| +int CompareDomains(base::StringPiece lhs_domain, base::StringPiece rhs_domain) {
|
| + if (lhs_domain.size() != rhs_domain.size())
|
| + return lhs_domain.size() > rhs_domain.size() ? -1 : 1;
|
| + return lhs_domain.compare(rhs_domain);
|
| +}
|
| +
|
| +bool HasNoUpperAscii(base::StringPiece string) {
|
| + return std::none_of(string.begin(), string.end(),
|
| + [](char c) { return base::IsAsciiUpper(c); });
|
| +}
|
|
|
| // Checks whether a URL |rule| can be converted to its FlatBuffers equivalent,
|
| // and performs the actual conversion.
|
| @@ -48,29 +69,58 @@ class UrlRuleFlatBufferConverter {
|
| flatbuffers::FlatBufferBuilder* builder) const {
|
| DCHECK(is_convertible());
|
|
|
| - flatbuffers::Offset<flatbuffers::Vector<FlatStringOffset>> domains_offset;
|
| + FlatDomainsOffset domains_included_offset;
|
| + FlatDomainsOffset domains_excluded_offset;
|
| if (rule_.domains_size()) {
|
| - std::vector<FlatStringOffset> domains;
|
| - domains.reserve(rule_.domains_size());
|
| + // TODO(pkalinnikov): Consider sharing the vectors between rules.
|
| + std::vector<FlatStringOffset> domains_included;
|
| + std::vector<FlatStringOffset> domains_excluded;
|
| + // Reserve only for |domains_included| because it is expected to be the
|
| + // one used more frequently.
|
| + domains_included.reserve(rule_.domains_size());
|
|
|
| - std::string domain;
|
| for (const auto& domain_list_item : rule_.domains()) {
|
| - domain.clear();
|
| - domain.reserve(domain_list_item.domain().size() + 1);
|
| + // Note: The |domain| can have non-ASCII UTF-8 characters, but
|
| + // ToLowerASCII leaves these intact.
|
| + // TODO(pkalinnikov): Convert non-ASCII characters to lower case too.
|
| + // TODO(pkalinnikov): Possibly convert Punycode to IDN here or directly
|
| + // assume this is done in the proto::UrlRule.
|
| + const std::string& domain = domain_list_item.domain();
|
| + auto offset = builder->CreateSharedString(
|
| + HasNoUpperAscii(domain) ? domain : base::ToLowerASCII(domain));
|
| +
|
| if (domain_list_item.exclude())
|
| - domain += '~';
|
| - domain += domain_list_item.domain();
|
| - domains.push_back(builder->CreateSharedString(domain));
|
| + domains_excluded.push_back(offset);
|
| + else
|
| + domains_included.push_back(offset);
|
| + }
|
| +
|
| + // The comparator ensuring the domains order necessary for fast matching.
|
| + auto precedes = [&builder](FlatStringOffset lhs, FlatStringOffset rhs) {
|
| + return CompareDomains(ToStringPiece(flatbuffers::GetTemporaryPointer(
|
| + *builder, lhs)),
|
| + ToStringPiece(flatbuffers::GetTemporaryPointer(
|
| + *builder, rhs))) < 0;
|
| + };
|
| +
|
| + // The domains are stored in sorted order to support fast matching.
|
| + if (!domains_included.empty()) {
|
| + // TODO(pkalinnikov): Don't sort if it is already sorted offline.
|
| + std::sort(domains_included.begin(), domains_included.end(), precedes);
|
| + domains_included_offset = builder->CreateVector(domains_included);
|
| + }
|
| + if (!domains_excluded.empty()) {
|
| + std::sort(domains_excluded.begin(), domains_excluded.end(), precedes);
|
| + domains_excluded_offset = builder->CreateVector(domains_excluded);
|
| }
|
| - domains_offset = builder->CreateVector(domains);
|
| }
|
|
|
| auto url_pattern_offset = builder->CreateString(rule_.url_pattern());
|
|
|
| - return flat::CreateUrlRule(*builder, options_, element_types_,
|
| - activation_types_, url_pattern_type_,
|
| - anchor_left_, anchor_right_, domains_offset,
|
| - url_pattern_offset);
|
| + return flat::CreateUrlRule(
|
| + *builder, options_, element_types_, activation_types_,
|
| + url_pattern_type_, anchor_left_, anchor_right_, domains_included_offset,
|
| + domains_excluded_offset, url_pattern_offset);
|
| }
|
|
|
| private:
|
| @@ -203,7 +253,7 @@ class UrlRuleFlatBufferConverter {
|
| // RulesetIndexer --------------------------------------------------------------
|
|
|
| // static
|
| -const int RulesetIndexer::kIndexedFormatVersion = 16;
|
| +const int RulesetIndexer::kIndexedFormatVersion = 17;
|
|
|
| RulesetIndexer::MutableUrlPatternIndex::MutableUrlPatternIndex() = default;
|
| RulesetIndexer::MutableUrlPatternIndex::~MutableUrlPatternIndex() = default;
|
| @@ -313,62 +363,94 @@ using FlatUrlRuleList = flatbuffers::Vector<flatbuffers::Offset<flat::UrlRule>>;
|
| using FlatNGramIndex =
|
| flatbuffers::Vector<flatbuffers::Offset<flat::NGramToRules>>;
|
|
|
| -// Returns whether the |origin| matches the list of |domains|. A match means
|
| -// that the longest domain in |domains| that |origin| is a sub-domain of is not
|
| -// an exception OR all the |domains| are exceptions and neither matches the
|
| -// |origin|. Thus, domain filters with more domain components trump filters with
|
| -// fewer domain components, i.e. the more specific a filter is, the higher the
|
| -// priority.
|
| +// Returns the size of the longest (sub-)domain of |origin| matching one of the
|
| +// |domains| in the list.
|
| //
|
| -// A rule whose domain list is empty or contains only negative domains is still
|
| -// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,
|
| -// this function will always return false for such |domains| lists.
|
| -//
|
| -// TODO(pkalinnikov): Make it fast.
|
| -bool DoesOriginMatchDomainList(
|
| - const url::Origin& origin,
|
| - const flatbuffers::Vector<FlatStringOffset>* domains,
|
| - bool disable_generic_rules) {
|
| - if (!domains || !domains->size())
|
| - return !disable_generic_rules;
|
| - // Unique |origin| matches lists of exception domains only.
|
| - if (origin.unique()) {
|
| - for (const flatbuffers::String* domain_filter : *domains) {
|
| - DCHECK_GT(domain_filter->size(), 0u);
|
| - if (domain_filter->Get(0) != '~')
|
| - return false;
|
| +// The |domains| should be sorted in descending order of their length, and
|
| +// ascending alphabetical order within the groups of same-length domains.
|
| +size_t GetLongestMatchingSubdomain(const url::Origin& origin,
|
| + const FlatDomains& domains) {
|
| + // If the |domains| list is short, then the simple strategy is usually faster.
|
| + if (domains.size() <= 5) {
|
| + for (auto* domain : domains) {
|
| + const base::StringPiece domain_piece = ToStringPiece(domain);
|
| + if (origin.DomainIs(domain_piece))
|
| + return domain_piece.size();
|
| + }
|
| + return 0;
|
| + }
|
| + // Otherwise look for each subdomain of the |origin| using binary search.
|
| +
|
| + DCHECK(!origin.unique());
|
| + base::StringPiece canonicalized_host(origin.host());
|
| + if (canonicalized_host.empty())
|
| + return 0;
|
| +
|
| + // If the host name ends with a dot, then ignore it.
|
| + if (canonicalized_host.back() == '.')
|
| + canonicalized_host.remove_suffix(1);
|
| +
|
| + // The |left| bound of the search is shared between iterations, because
|
| + // subdomains are considered in decreasing order of their lengths, therefore
|
| + // each consecutive lower_bound will be at least as far as the previous.
|
| + flatbuffers::uoffset_t left = 0;
|
| + for (size_t position = 0;; ++position) {
|
| + const base::StringPiece subdomain = canonicalized_host.substr(position);
|
| +
|
| + flatbuffers::uoffset_t right = domains.size();
|
| + while (left + 1 < right) {
|
| + auto middle = left + (right - left) / 2;
|
| + DCHECK_LT(middle, domains.size());
|
| + if (CompareDomains(ToStringPiece(domains[middle]), subdomain) <= 0)
|
| + left = middle;
|
| + else
|
| + right = middle;
|
| }
|
| - return !disable_generic_rules;
|
| +
|
| + DCHECK_LT(left, domains.size());
|
| + if (ToStringPiece(domains[left]) == subdomain)
|
| + return subdomain.size();
|
| +
|
| + position = canonicalized_host.find('.', position);
|
| + if (position == base::StringPiece::npos)
|
| + break;
|
| }
|
|
|
| - size_t max_domain_length = 0;
|
| - bool is_positive = true;
|
| - bool negatives_only = true;
|
| + return 0;
|
| +}
|
|
|
| - for (flatbuffers::uoffset_t i = 0, size = domains->size(); i != size; ++i) {
|
| - const flatbuffers::String* domain_filter = domains->Get(i);
|
| - if (domain_filter->Length() <= max_domain_length)
|
| - continue;
|
| +// Returns whether the |origin| matches the domain list of the |rule|. A match
|
| +// means that the longest domain in |domains| that |origin| is a sub-domain of
|
| +// is not an exception OR all the |domains| are exceptions and neither matches
|
| +// the |origin|. Thus, domain filters with more domain components trump filters
|
| +// with fewer domain components, i.e. the more specific a filter is, the higher
|
| +// the priority.
|
| +//
|
| +// A rule whose domain list is empty or contains only negative domains is still
|
| +// considered a "generic" rule. Therefore, if |disable_generic_rules| is set,
|
| +// this function will always return false for such rules.
|
| +bool DoesOriginMatchDomainList(const url::Origin& origin,
|
| + const flat::UrlRule& rule,
|
| + bool disable_generic_rules) {
|
| + const bool is_generic = !rule.domains_included();
|
| + DCHECK(is_generic || rule.domains_included()->size());
|
| + if (disable_generic_rules && is_generic)
|
| + return false;
|
|
|
| - base::StringPiece filter_piece(domain_filter->c_str(),
|
| - domain_filter->Length());
|
| - const bool is_negative = (domain_filter->Get(0) == '~');
|
| - if (is_negative) {
|
| - filter_piece.remove_prefix(1);
|
| - if (filter_piece.length() == max_domain_length)
|
| - continue;
|
| - } else {
|
| - negatives_only = false;
|
| - }
|
| + // Unique |origin| matches lists of exception domains only.
|
| + if (origin.unique())
|
| + return is_generic;
|
|
|
| - if (!origin.DomainIs(filter_piece))
|
| - continue;
|
| - max_domain_length = filter_piece.length();
|
| - is_positive = !is_negative;
|
| + size_t longest_matching_included_domain_length = 1;
|
| + if (!is_generic) {
|
| + longest_matching_included_domain_length =
|
| + GetLongestMatchingSubdomain(origin, *rule.domains_included());
|
| }
|
| -
|
| - return max_domain_length ? is_positive
|
| - : negatives_only && !disable_generic_rules;
|
| + if (longest_matching_included_domain_length && rule.domains_excluded()) {
|
| + return GetLongestMatchingSubdomain(origin, *rule.domains_excluded()) <
|
| + longest_matching_included_domain_length;
|
| + }
|
| + return !!longest_matching_included_domain_length;
|
| }
|
|
|
| // Returns whether the request matches flags of the specified URL |rule|. Takes
|
| @@ -423,7 +505,7 @@ bool MatchesAny(const FlatUrlRuleList* rules,
|
| if (!UrlPattern(*rule).MatchesUrl(url))
|
| continue;
|
|
|
| - if (DoesOriginMatchDomainList(document_origin, rule->domains(),
|
| + if (DoesOriginMatchDomainList(document_origin, *rule,
|
| disable_generic_rules)) {
|
| return true;
|
| }
|
|
|