Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3468)

Unified Diff: chrome/browser/extensions/api/declarative/url_matcher.cc

Issue 9390018: Implementation of a Matching strategy for URLs in the Declarative WebRequest API. (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Refactored for memory improvements Created 8 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: chrome/browser/extensions/api/declarative/url_matcher.cc
diff --git a/chrome/browser/extensions/api/declarative/url_matcher.cc b/chrome/browser/extensions/api/declarative/url_matcher.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33643bf43f8bd7ffddd9d0116ae4bbc2ff1784ea
--- /dev/null
+++ b/chrome/browser/extensions/api/declarative/url_matcher.cc
@@ -0,0 +1,570 @@
+// Copyright (c) 2012 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "chrome/browser/extensions/api/declarative/url_matcher.h"
+
+#include <algorithm>
+
+namespace extensions {
+
+// This set of classes implement a mapping of URL Component Patterns, such as
+// host_prefix, host_suffix, host_equals, ..., etc., to SubstringPatterns.
+//
+// The idea of this mapping is to reduce the problem of comparing many
+// URL Component Patterns against one URL to the problem of searching many
+// substrings in one string:
+//
+// ---------------------- --------------------
+// | URL Query operator | ----translate----> | SubstringPattern |
+// ---------------------- --------------------
+// ^
+// |
+// compare
+// |
+// v
+// ---------------------- --------------------
+// | URL to compare | | |
+// | to all URL Query | ----translate----> | String |
+// | operators | | |
+// ---------------------- --------------------
+//
+// The reason for this problem reduction is that there are efficient algorithms
+// for searching many substrings in one string (see Aho-Corasick algorithm).
+//
+// Case 1: {host,path,query}_{prefix,suffix,equals} searches.
+// ==========================================================
+//
+// For searches in this class, we normalize URLs as follows:
+//
+// Step 1:
+// Remove scheme, port and segment from URL:
+// -> http://www.example.com:8080/index.html?search=foo#first_match becomes
+// www.example.com/index.html?search=foo
+//
+// We remove the scheme and port number because they can be checked later
+// in a secondary filter step. We remove the segment (the #... part) because
+// this is not guaranteed to be ASCII-7 encoded.
+//
+// Step 2:
+// Translate URL to String and add the following position markers:
+// - BU = Beginning of URL
+// - ED = End of Domain
+// - EP = End of Path
+// - EU = End of URL
+// Furthermore, the hostname is canonicalized to start with a ".".
+//
+// Position markers are represented as characters >127, which are therefore
+// guaranteed not to be part of the ASCII-7 encoded URL character set.
+//
+// -> www.example.com/index.html?search=foo becomes
+// BU .www.example.com ED /index.html EP ?search=foo EU
+//
+// -> www.example.com/index.html becomes
+// BU .www.example.com ED /index.html EP EU
+//
+// Step 3:
+// Translate URL Component Patterns as follows:
+//
+// host_prefix(prefix) = BU add_missing_dot_prefix(prefix)
+// -> host_prefix("www.example") = BU .www.example
+//
+// host_suffix(suffix) = suffix ED
+// -> host_suffix("example.com") = example.com ED
+// -> host_suffix(".example.com") = .example.com ED
+//
+// host_equals(domain) = BU add_missing_dot_prefix(domain) ED
+// -> host_equals("www.example.com") = BU .www.example.com ED
+//
+// Similarly for path query parameters ({path, query}_{prefix, suffix, equals}).
+//
+// With this, we can search the SubstringPatterns in the normalized URL.
+//
+//
+// Case 2: url_{prefix,suffix,equals,contains} searches.
+// =====================================================
+//
+// Step 1: as above
+//
+// Step 2:
+// Translate URL to String and add the following position markers:
+// - BU = Beginning of URL
+// - EU = End of URL
+// Furthermore, the hostname is canonicalized to start with a ".".
+//
+// -> www.example.com/index.html?search=foo becomes
+// BU .www.example.com/index.html?search=foo EU
+//
+// url_prefix(prefix) = BU add_missing_dot_prefix(prefix)
+// -> url_prefix("www.example") = BU .www.example
+//
+// url_contains(substring) = substring
+// -> url_contains("index") = index
+//
+//
+// Case 3: {host,path,query}_contains searches.
+// ============================================
+//
+// These kinds of searches are not supported directly but can be derived
+// by a combination of a url_contains() query followed by an explicit test:
+//
+// host_contains(str) = url_contains(str) followed by test whether str occurs
+// in host comonent of original URL.
+// -> host_contains("example.co") = example.co
+// followed by gurl.host().find("example.co");
+//
+// [similarly for path_contains and query_contains].
+
+
+//
+// URLMatcherCondition
+//
+
+URLMatcherCondition::URLMatcherCondition(
+ Criterion criterion,
+ const SubstringPattern* substring_pattern)
+ : criterion_(criterion),
+ substring_pattern_(substring_pattern) {
+ DCHECK(substring_pattern_);
+}
+
+bool URLMatcherCondition::IsFullUrlCondition() const {
+ switch (criterion_) {
+ case extensions::URLMatcherCondition::HOST_CONTAINS:
+ case extensions::URLMatcherCondition::PATH_CONTAINS:
+ case extensions::URLMatcherCondition::QUERY_CONTAINS:
+ case extensions::URLMatcherCondition::URL_PREFIX:
+ case extensions::URLMatcherCondition::URL_SUFFIX:
+ case extensions::URLMatcherCondition::URL_CONTAINS:
+ case extensions::URLMatcherCondition::URL_EQUALS:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+bool URLMatcherCondition::IsMatch(
+ const std::set<SubstringPattern::ID>& matching_substring_patterns,
+ const GURL& url) const {
+ if (matching_substring_patterns.find(substring_pattern_->id()) ==
+ matching_substring_patterns.end())
+ return false;
+ // The criteria HOST_CONTAINS, PATH_CONTAINS, QUERY_CONTAINS are based on
+ // a substring match on the raw URL. In case of a match, we need to verify
+ // that the match was found in the correct component of the URL.
+ switch (criterion_) {
+ case HOST_CONTAINS:
+ return url.host().find(substring_pattern_->pattern()) !=
+ std::string::npos;
+ case PATH_CONTAINS:
+ return url.path().find(substring_pattern_->pattern()) !=
+ std::string::npos;
+ case QUERY_CONTAINS:
+ return url.query().find(substring_pattern_->pattern()) !=
+ std::string::npos;
+ default:
+ break;
+ }
+ return true;
+}
+
+//
+// URLMatcherConditionFactory
+//
+
+namespace {
+// These are symbols that are not contained in 7-bit ASCII used in GURLs.
+char kBeginningOfURL[] = {128, 0};
+char kEndOfDomain[] = {129, 0};
+char kEndOfPath[] = {130, 0};
+char kEndOfURL[] = {131, 0};
+} // namespace
+
+URLMatcherConditionFactory::URLMatcherConditionFactory() : id_counter_(0) {}
+
+std::string URLMatcherConditionFactory::CanonlicalizeURLForComponentSearches(
+ const GURL& url) {
+ return kBeginningOfURL + CanonicalizeHostname(url.host()) + kEndOfDomain +
+ url.path() + kEndOfPath + (url.has_query() ? "?" + url.query() : "") +
+ kEndOfURL;
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateHostPrefixCondition(
+ const std::string& prefix) {
+ return CreateCondition(URLMatcherCondition::HOST_PREFIX,
+ kBeginningOfURL + CanonicalizeHostname(prefix));
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateHostSuffixCondition(
+ const std::string& suffix) {
+ return CreateCondition(URLMatcherCondition::HOST_SUFFIX,
+ suffix + kEndOfDomain);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateHostContainsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::HOST_CONTAINS, str);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateHostEqualsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::HOST_EQUALS,
+ kBeginningOfURL + CanonicalizeHostname(str) + kEndOfDomain);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreatePathPrefixCondition(
+ const std::string& prefix) {
+ return CreateCondition(URLMatcherCondition::PATH_PREFIX,
+ kEndOfDomain + prefix);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreatePathSuffixCondition(
+ const std::string& suffix) {
+ return CreateCondition(URLMatcherCondition::HOST_SUFFIX, suffix + kEndOfPath);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreatePathContainsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::PATH_CONTAINS, str);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreatePathEqualsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::PATH_EQUALS,
+ kEndOfDomain + str + kEndOfPath);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateQueryPrefixCondition(
+ const std::string& prefix) {
+ return CreateCondition(URLMatcherCondition::QUERY_PREFIX,
+ kEndOfPath + prefix);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateQuerySuffixCondition(
+ const std::string& suffix) {
+ return CreateCondition(URLMatcherCondition::QUERY_SUFFIX, suffix + kEndOfURL);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateQueryContainsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::QUERY_CONTAINS, str);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateQueryEqualsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
+ kEndOfPath + str + kEndOfURL);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateHostSuffixPathPrefixCondition(
+ const std::string& host_suffix,
+ const std::string& path_prefix) {
+ return CreateCondition(URLMatcherCondition::HOST_SUFFIX_PATH_PREFIX,
+ host_suffix + kEndOfDomain + path_prefix);
+}
+
+std::string URLMatcherConditionFactory::CanonlicalizeURLForFullSearches(
+ const GURL& url) {
+ return kBeginningOfURL + CanonicalizeHostname(url.host()) + url.path() +
+ (url.has_query() ? "?" + url.query() : "") + kEndOfURL;
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateURLPrefixCondition(
+ const std::string& prefix) {
+ return CreateCondition(URLMatcherCondition::URL_PREFIX,
+ kBeginningOfURL + CanonicalizeHostname(prefix));
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateURLSuffixCondition(
+ const std::string& suffix) {
+ return CreateCondition(URLMatcherCondition::URL_SUFFIX, suffix + kEndOfURL);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateURLContainsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::URL_CONTAINS, str);
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateURLEqualsCondition(
+ const std::string& str) {
+ return CreateCondition(URLMatcherCondition::QUERY_EQUALS,
+ kBeginningOfURL + CanonicalizeHostname(str) + kEndOfURL);
+}
+
+void URLMatcherConditionFactory::ForgetUnusedPatterns(
+ const std::set<SubstringPattern::ID>& used_patterns) {
+ PatternSingletons::iterator i = pattern_singletons_.begin();
+ while (i != pattern_singletons_.end()) {
+ if (used_patterns.find(i->second->id()) != used_patterns.end()) {
+ ++i;
+ } else {
+ pattern_singletons_.erase(i++);
+ }
+ }
+}
+
+scoped_ptr<URLMatcherCondition>
+ URLMatcherConditionFactory::CreateCondition(
+ URLMatcherCondition::Criterion criterion,
+ const std::string& pattern) {
+ PatternSingletons::const_iterator iter = pattern_singletons_.find(pattern);
+ if (iter == pattern_singletons_.end()) {
+ pattern_singletons_[pattern] =
+ make_linked_ptr(new SubstringPattern(pattern, id_counter_++));
+ }
+
+ return scoped_ptr<URLMatcherCondition>(
+ new URLMatcherCondition(criterion, pattern_singletons_[pattern].get()));
+}
+
+std::string URLMatcherConditionFactory::CanonicalizeHostname(
+ const std::string& hostname) const {
+ if (!hostname.empty() && hostname[0] == '.')
+ return hostname;
+ else
+ return "." + hostname;
+}
+
+
+//
+// URLMatcherConditionSet
+//
+
+URLMatcherConditionSet::URLMatcherConditionSet(
+ ID id,
+ scoped_ptr<Conditions> conditions)
+ : id_(id) {
+ conditions_.swap(*conditions);
+}
+
+bool URLMatcherConditionSet::IsMatch(
+ const std::set<SubstringPattern::ID>& matching_substring_patterns,
+ const GURL& url) const {
+ for (Conditions::const_iterator i = conditions_.begin();
+ i != conditions_.end(); ++i) {
+ if (!(*i)->IsMatch(matching_substring_patterns, url))
+ return false;
+ }
+ return true;
+}
+
+//
+// URLMatcher
+//
+
+URLMatcher::URLMatcher() {}
+
+void URLMatcher::AddConditionSets(
+ scoped_ptr<ScopedVector<const URLMatcherConditionSet> > condition_sets) {
+ std::vector<const URLMatcherConditionSet*> released_condition_sets;
+ condition_sets->release(&released_condition_sets);
+ for (std::vector<const URLMatcherConditionSet*>::const_iterator i =
+ released_condition_sets.begin(); i != released_condition_sets.end();
+ ++i) {
+ DCHECK(url_matcher_condition_sets_.find((*i)->id()) ==
+ url_matcher_condition_sets_.end());
+ url_matcher_condition_sets_[(*i)->id()] = make_linked_ptr(*i);
+ }
+ UpdateInternalDatastructures();
+}
+
+void URLMatcher::RemoveConditionSets(
+ const std::vector<URLMatcherConditionSet::ID>& condition_ids) {
+ for (std::vector<URLMatcherConditionSet::ID>::const_iterator i =
+ condition_ids.begin(); i != condition_ids.end(); ++i) {
+ DCHECK(url_matcher_condition_sets_.find(*i) !=
+ url_matcher_condition_sets_.end());
+ url_matcher_condition_sets_.erase(*i);
+ }
+ UpdateInternalDatastructures();
+}
+
+
+std::set<URLMatcherConditionSet::ID> URLMatcher::MatchUrl(const GURL& url) {
+ // See URLMatcherConditionFactory for the canonicalization of URLs and the
+ // distinction between full url searches and url component searches.
+
+ // Perform all full url searches.
+ std::string full_url =
+ condition_factory_.CanonlicalizeURLForFullSearches(url);
+ std::set<SubstringPattern::ID> full_url_matches;
+ full_url_matcher_.Match(full_url, &full_url_matches);
+
+ // Perform all url component searches.
+ std::string component_url =
+ condition_factory_.CanonlicalizeURLForComponentSearches(url);
+ std::set<SubstringPattern::ID> component_url_matches;
+ url_component_matcher_.Match(component_url, &component_url_matches);
+
+ // Build the union of both result sets.
+ std::set<SubstringPattern::ID> matches;
+ matches.swap(full_url_matches);
+ matches.insert(component_url_matches.begin(), component_url_matches.end());
+
+ // Calculate all URLMatcherConditionSet for which all URLMatcherConditions
+ // were fulfilled.
+ std::set<URLMatcherConditionSet::ID> result;
+ for (std::set<SubstringPattern::ID>::const_iterator i = matches.begin();
+ i != matches.end(); ++i) {
+ // For each URLMatcherConditionSet there is exactly one condition
+ // registered in substring_match_triggers_. This means that the following
+ // logic tests each URLMatcherConditionSet exactly once if it can be
+ // completely fulfilled.
+ std::set<URLMatcherConditionSet::ID>& condition_sets =
+ substring_match_triggers_[*i];
+ for (std::set<URLMatcherConditionSet::ID>::const_iterator j =
+ condition_sets.begin(); j != condition_sets.end(); ++j) {
+ if (url_matcher_condition_sets_[*j]->IsMatch(matches, url))
+ result.insert(url_matcher_condition_sets_[*j]->id());
+ }
+ }
+
+ return result;
+}
+
+void URLMatcher::UpdateSubstringSetMatcher(bool full_url_conditions) {
+ // The purpose of |full_url_conditions| is just that we need to execute
+ // the same logic once for Full URL searches and once for URL Component
+ // searches (see URLMatcherConditionFactory).
+
+ // Determine which patterns need to be registered when this function
+ // terminates.
+ std::set<const SubstringPattern*> new_patterns;
+ for (URLMatcherConditionSets::const_iterator condition_set_iter =
+ url_matcher_condition_sets_.begin();
+ condition_set_iter != url_matcher_condition_sets_.end();
+ ++condition_set_iter) {
+ const URLMatcherConditionSet::Conditions& conditions =
+ condition_set_iter->second->conditions();
+ for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+ conditions.begin(); condition_iter != conditions.end();
+ ++condition_iter) {
+ // If we are called to process Full URL searches, ignore all others,
+ // and vice versa.
+ if (full_url_conditions == (*condition_iter)->IsFullUrlCondition())
+ new_patterns.insert((*condition_iter)->substring_pattern());
+ }
+ }
+
+ // This is the set of patterns that were registered before this function
+ // is called.
+ std::set<const SubstringPattern*>& registered_patterns =
+ full_url_conditions ? registered_full_url_patterns_
+ : registered_url_component_patterns_;
+
+ // Add all patterns that are in new_patterns but not in registered_patterns.
+ std::vector<const SubstringPattern*> patterns_to_register;
+ std::set_difference(
+ new_patterns.begin(), new_patterns.end(),
+ registered_patterns.begin(), registered_patterns.end(),
+ std::back_inserter(patterns_to_register));
+
+ // Remove all patterns that are in registered_patterns but not in
+ // new_patterns.
+ std::vector<const SubstringPattern*> patterns_to_unregister;
+ std::set_difference(
+ registered_patterns.begin(), registered_patterns.end(),
+ new_patterns.begin(), new_patterns.end(),
+ std::back_inserter(patterns_to_unregister));
+
+ // Update the SubstringSetMatcher.
+ SubstringSetMatcher& url_matcher =
+ full_url_conditions ? full_url_matcher_ : url_component_matcher_;
+ url_matcher.RegisterAndUnregisterPatterns(patterns_to_register,
+ patterns_to_unregister);
+
+ // Update the set of registered_patterns for the next time this function
+ // is being called.
+ registered_patterns.swap(new_patterns);
+}
+
+void URLMatcher::UpdateTriggers() {
+ // Count substring pattern frequencies.
+ std::map<SubstringPattern::ID, size_t> substring_pattern_frequencies;
+ for (URLMatcherConditionSets::const_iterator condition_set_iter =
+ url_matcher_condition_sets_.begin();
+ condition_set_iter != url_matcher_condition_sets_.end();
+ ++condition_set_iter) {
+ const URLMatcherConditionSet::Conditions& conditions =
+ condition_set_iter->second->conditions();
+ for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+ conditions.begin(); condition_iter != conditions.end();
+ ++condition_iter) {
+ const SubstringPattern* pattern = (*condition_iter)->substring_pattern();
+ substring_pattern_frequencies[pattern->id()]++;
+ }
+ }
+
+ // Update trigger conditions: Determine for each URLMatcherConditionSet which
+ // URLMatcherCondition occurs least frequently in this URLMatcher. We assume
+ // that this condition is very specific and occurs rarely in URLs. If a match
+ // occurs for this URLMatcherCondition, we want to test all other
+ // URLMatcherCondition in the respective URLMatcherConditionSet as well to
+ // see whether the entire URLMatcherConditionSet is considered matching.
+ substring_match_triggers_.clear();
+ for (URLMatcherConditionSets::const_iterator condition_set_iter =
+ url_matcher_condition_sets_.begin();
+ condition_set_iter != url_matcher_condition_sets_.end();
+ ++condition_set_iter) {
+ const URLMatcherConditionSet::Conditions& conditions =
+ condition_set_iter->second->conditions();
+ if (conditions.empty())
+ continue;
+ SubstringPattern::ID trigger = conditions[0]->substring_pattern()->id();
+ for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+ conditions.begin() + 1; condition_iter != conditions.end();
+ ++condition_iter) {
+ SubstringPattern::ID current_id =
+ (*condition_iter)->substring_pattern()->id();
+ if (substring_pattern_frequencies[trigger] >
+ substring_pattern_frequencies[current_id]) {
+ trigger = current_id;
+ }
+ }
+ substring_match_triggers_[trigger].insert(condition_set_iter->second->id());
+ }
+}
+
+void URLMatcher::UpdateConditionFactory() {
+ std::set<SubstringPattern::ID> used_patterns;
+ for (URLMatcherConditionSets::const_iterator condition_set_iter =
+ url_matcher_condition_sets_.begin();
+ condition_set_iter != url_matcher_condition_sets_.end();
+ ++condition_set_iter) {
+ const URLMatcherConditionSet::Conditions& conditions =
+ condition_set_iter->second->conditions();
+ for (URLMatcherConditionSet::Conditions::const_iterator condition_iter =
+ conditions.begin(); condition_iter != conditions.end();
+ ++condition_iter) {
+ used_patterns.insert((*condition_iter)->substring_pattern()->id());
+ }
+ }
+ condition_factory_.ForgetUnusedPatterns(used_patterns);
+}
+
+void URLMatcher::UpdateInternalDatastructures() {
+ UpdateSubstringSetMatcher(false);
+ UpdateSubstringSetMatcher(true);
+ UpdateTriggers();
+ UpdateConditionFactory();
+}
+
+} // namespace extensions

Powered by Google App Engine
This is Rietveld 408576698