Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(671)

Unified Diff: components/subresource_filter/core/common/url_pattern_index.h

Issue 2844293003: Factor out UrlPatternIndex from IndexedRuleset. (Closed)
Patch Set: Address final nits. Created 3 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: components/subresource_filter/core/common/url_pattern_index.h
diff --git a/components/subresource_filter/core/common/url_pattern_index.h b/components/subresource_filter/core/common/url_pattern_index.h
new file mode 100644
index 0000000000000000000000000000000000000000..adeade650483f96224942368a577f567ea4f307e
--- /dev/null
+++ b/components/subresource_filter/core/common/url_pattern_index.h
@@ -0,0 +1,135 @@
+// Copyright 2017 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#ifndef COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_INDEX_H_
+#define COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_INDEX_H_
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include <vector>
+
+#include "base/macros.h"
+#include "base/strings/string_piece.h"
+#include "components/subresource_filter/core/common/closed_hash_map.h"
+#include "components/subresource_filter/core/common/flat/url_pattern_index_generated.h"
+#include "components/subresource_filter/core/common/proto/rules.pb.h"
+#include "components/subresource_filter/core/common/uint64_hasher.h"
+#include "third_party/flatbuffers/src/include/flatbuffers/flatbuffers.h"
+
+class GURL;
+
+namespace url {
+class Origin;
+}
+
+namespace subresource_filter {
+
+// The integer type used to represent N-grams.
+using NGram = uint64_t;
+// The hasher used for hashing N-grams.
+using NGramHasher = Uint64Hasher;
+// The hash table probe sequence used both by UrlPatternIndex and its builder.
+using NGramHashTableProber = DefaultProber<NGram, NGramHasher>;
+
+// FlatBuffer offset aliases.
+using UrlRuleOffset = flatbuffers::Offset<flat::UrlRule>;
+using UrlPatternIndexOffset = flatbuffers::Offset<flat::UrlPatternIndex>;
+
+constexpr size_t kNGramSize = 5;
+static_assert(kNGramSize <= sizeof(NGram), "NGram type is too narrow.");
+
+// Serializes the |rule| to the FlatBuffer |builder|, and returns an offset to
+// it in the resulting buffer. Returns null offset iff the |rule| could not be
+// serialized because of unsupported options or it is otherwise invalid.
+UrlRuleOffset SerializeUrlRule(const proto::UrlRule& rule,
+ flatbuffers::FlatBufferBuilder* builder);
+
+// The class used to construct an index over the URL patterns of a set of URL
+// rules. The rules themselves need to be converted to FlatBuffers format by the
+// client of this class, as well as persisted into the |flat_builder| that is
+// supplied in the constructor.
+class UrlPatternIndexBuilder {
+ public:
+ explicit UrlPatternIndexBuilder(flatbuffers::FlatBufferBuilder* flat_builder);
+ ~UrlPatternIndexBuilder();
+
+ // Adds a UrlRule to the index. The caller should have already persisted the
+ // rule into the same |flat_builder| by a call to SerializeUrlRule returning a
+ // non-null |offset|, and should pass in the resulting |offset| here.
+ void IndexUrlRule(UrlRuleOffset offset);
+
+ // Finalizes construction of the index, serializes it using |flat_builder|,
+ // and returns an offset to it in the resulting FlatBuffer.
+ UrlPatternIndexOffset Finish();
+
+ private:
+ using MutableUrlRuleList = std::vector<UrlRuleOffset>;
+ using MutableNGramIndex =
+ ClosedHashMap<NGram, MutableUrlRuleList, NGramHashTableProber>;
+
+ // Returns an N-gram of the |pattern| encoded into the NGram integer type. The
+ // N-gram is picked using a greedy heuristic, i.e. the one is chosen which
+ // corresponds to the shortest list of rules within the index. If there are no
+ // valid N-grams in the |pattern|, the return value is 0.
+ NGram GetMostDistinctiveNGram(base::StringPiece pattern);
+
+ // This index contains all non-REGEXP rules that have at least one acceptable
+ // N-gram. For each given rule, the N-gram used as an index key is picked
+ // greedily (see GetMostDistinctiveNGram).
+ MutableNGramIndex ngram_index_;
+
+ // A fallback list that contains all the rules with no acceptable N-gram.
+ MutableUrlRuleList fallback_rules_;
+
+ // Must outlive this instance.
+ flatbuffers::FlatBufferBuilder* flat_builder_;
+
+ DISALLOW_COPY_AND_ASSIGN(UrlPatternIndexBuilder);
+};
+
+// Encapsulates a read-only index built over the URL patterns of a set of URL
+// rules, and provides fast matching of network requests against these rules.
+class UrlPatternIndexMatcher {
+ public:
+ // Creates an instance to access the given |flat_index|. If |flat_index| is
+ // nullptr, then all requests return no match.
+ explicit UrlPatternIndexMatcher(const flat::UrlPatternIndex* flat_index);
+ ~UrlPatternIndexMatcher();
+
+ // If the index contains one or more UrlRules that match the request, returns
+ // one of them (it is undefined which one). Otherwise, returns nullptr.
+ //
+ // Notes on parameters:
+ // - |url| should be valid, otherwise the return value is nullptr.
+ // - Exactly one of |element_type| and |activation_type| should be specified,
+ // i.e., not equal to *_UNSPECIFIED, otherwise the return value is nullptr.
+ // - |is_third_party| should be pre-computed by the caller, e.g. using the
+ // registry_controlled_domains library, to reflect the relation between
+ // |url| and |first_party_origin|.
+ //
+ // A rule is deemed to match the request iff all of the following applies:
+ // - The |url| matches the rule's UrlPattern (see url_pattern.h).
+ // - The |first_party_origin| matches the rule's targeted domains list.
+ // - |element_type| or |activation_type| is among the rule's targeted types.
+ // - The |is_third_party| bit matches the rule's requirement on the requested
+ // |url| being first-/third-party w.r.t. its |first_party_origin|.
+ // - The rule is not generic if |disable_generic_rules| is true.
+ const flat::UrlRule* FindMatch(const GURL& url,
+ const url::Origin& first_party_origin,
+ proto::ElementType element_type,
+ proto::ActivationType activation_type,
+ bool is_third_party,
+ bool disable_generic_rules) const;
+
+ private:
+ // Must outlive this instance.
+ const flat::UrlPatternIndex* flat_index_;
+
+ DISALLOW_COPY_AND_ASSIGN(UrlPatternIndexMatcher);
+};
+
+} // namespace subresource_filter
+
+#endif // COMPONENTS_SUBRESOURCE_FILTER_CORE_COMMON_URL_PATTERN_INDEX_H_

Powered by Google App Engine
This is Rietveld 408576698