Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(257)

Unified Diff: extensions/common/matcher/substring_set_matcher.cc

Issue 14780003: Speed improvements to SubstringSetMatcher (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 7 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: extensions/common/matcher/substring_set_matcher.cc
diff --git a/extensions/common/matcher/substring_set_matcher.cc b/extensions/common/matcher/substring_set_matcher.cc
index b5fdae6da37f80c3b9a1b963e1cbb052e54af8b5..a71bce19f872382f53e7ff6fa34df7ef1d573197 100644
--- a/extensions/common/matcher/substring_set_matcher.cc
+++ b/extensions/common/matcher/substring_set_matcher.cc
@@ -4,6 +4,7 @@
#include "extensions/common/matcher/substring_set_matcher.h"
+#include <algorithm>
#include <queue>
#include "base/logging.h"
@@ -11,6 +12,44 @@
namespace extensions {
+namespace {
+
+// Compare StringPattern instances based on their string patterns.
+bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
+ return a->pattern() <= b->pattern();
+}
+
+// Given the set of patterns, compute how many nodes will the corresponding
+// Aho-Corasick tree have. Note that |patterns| need to be sorted.
+unsigned TreeSize(const std::vector<const StringPattern*>& patterns) {
battre 2013/05/03 21:59:12 This should be size_t; also in all other places.
vabr (Chromium) 2013/05/05 23:36:49 On my machine, size_t is 8 bytes, unsigned is only
+ unsigned result = 1u; // 1 for the root node.
+ if (patterns.empty())
+ return result;
+
+ std::vector<const StringPattern*>::const_iterator last = patterns.begin();
+ std::vector<const StringPattern*>::const_iterator current = last + 1;
+ // For the first pattern, each letter is a label of an edge to a new node.
+ result += (*last)->pattern().size();
+
+ // For the subsequent patterns, only count the edges which were not counted
+ // yet. For this it suffices to test against the previous pattern, because the
+ // patterns are sorted.
+ for (; current != patterns.end(); ++last, ++current) {
+ const std::string& last_s = (*last)->pattern();
battre 2013/05/03 21:59:12 what does "_s" stand for? Can you pick a name with
vabr (Chromium) 2013/05/05 23:36:49 Done.
+ const std::string& current_s = (*current)->pattern();
+ const unsigned prefix_bound = std::min(last_s.size(), current_s.size());
+
+ unsigned common_prefix = 0;
+ while (common_prefix < prefix_bound &&
+ last_s[common_prefix] == current_s[common_prefix])
+ ++common_prefix;
+ result += current_s.size() - common_prefix;
+ }
+ return result;
+}
+
+} // namespace
+
//
// SubstringSetMatcher
//
@@ -49,27 +88,43 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns(
patterns_.erase((*i)->id());
}
+ // We compute the total number of tree nodes needed by using this vector.
+ std::vector<const StringPattern*> sorted_patterns(patterns_.size());
+ // First, fill it from |patterns_|.
+ size_t next = 0;
+ for (SubstringPatternSet::const_iterator i = patterns_.begin();
+ i != patterns_.end();
+ ++i, ++next) {
+ sorted_patterns[next] = i->second;
+ }
+
+ std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
+ tree_.reserve(TreeSize(sorted_patterns));
+
RebuildAhoCorasickTree();
}
bool SubstringSetMatcher::Match(const std::string& text,
std::set<StringPattern::ID>* matches) const {
- size_t old_number_of_matches = matches->size();
+ const size_t old_number_of_matches = matches->size();
// Handle patterns matching the empty string.
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
- int current_node = 0;
- size_t text_length = text.length();
- for (size_t i = 0; i < text_length; ++i) {
- while (!tree_[current_node].HasEdge(text[i]) && current_node != 0)
+ unsigned current_node = 0;
+ for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
+ unsigned edge_from_current = tree_[current_node].GetEdge(*i);
+ while (edge_from_current == AhoCorasickNode::kInvalidIndex &&
+ current_node != 0) {
current_node = tree_[current_node].failure();
- if (tree_[current_node].HasEdge(text[i])) {
- current_node = tree_[current_node].GetEdge(text[i]);
+ edge_from_current = tree_[current_node].GetEdge(*i);
+ }
+ if (edge_from_current != AhoCorasickNode::kInvalidIndex) {
+ current_node = edge_from_current;
matches->insert(tree_[current_node].matches().begin(),
tree_[current_node].matches().end());
} else {
- DCHECK_EQ(0, current_node);
+ DCHECK_EQ(0u, current_node);
}
}
@@ -101,26 +156,26 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() {
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
const StringPattern* pattern) {
const std::string& text = pattern->pattern();
- size_t text_length = text.length();
// Iterators on the tree and the text.
- int current_node = 0;
- size_t text_pos = 0;
+ unsigned current_node = 0;
+ std::string::const_iterator i = text.begin();
// Follow existing paths for as long as possible.
- while (text_pos < text_length &&
- tree_[current_node].HasEdge(text[text_pos])) {
- current_node = tree_[current_node].GetEdge(text[text_pos]);
- ++text_pos;
+ unsigned edge_from_current = tree_[current_node].GetEdge(*i);
+ while (i != text.end() &&
battre 2013/05/03 21:59:12 text is immutable. Should we have a const std::str
vabr (Chromium) 2013/05/05 23:36:49 It does not seem to make a difference performance-
+ edge_from_current != AhoCorasickNode::kInvalidIndex) {
+ current_node = edge_from_current;
+ ++i;
+ edge_from_current = tree_[current_node].GetEdge(*i);
}
// Create new nodes if necessary.
- while (text_pos < text_length) {
- AhoCorasickNode new_node;
- tree_.push_back(new_node);
- tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1);
+ while (i != text.end()) {
+ tree_.push_back(AhoCorasickNode());
+ tree_[current_node].SetEdge(*i, tree_.size() - 1);
current_node = tree_.size() - 1;
- ++text_pos;
+ ++i;
}
// Register match.
@@ -130,14 +185,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
void SubstringSetMatcher::CreateFailureEdges() {
typedef AhoCorasickNode::Edges Edges;
- std::queue<int> queue;
+ std::queue<unsigned> queue;
AhoCorasickNode& root = tree_[0];
root.set_failure(0);
- const AhoCorasickNode::Edges& root_edges = root.edges();
+ const Edges& root_edges = root.edges();
for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
++e) {
- int leads_to = e->second;
+ const unsigned& leads_to = e->second;
vabr (Chromium) 2013/05/03 20:37:27 I know this is uncommon. However, this method is a
battre 2013/05/03 21:59:12 Have you checked what happens if you used "const i
vabr (Chromium) 2013/05/05 23:36:49 Are you asking about removing the reference, or mo
tree_[leads_to].set_failure(0);
queue.push(leads_to);
}
@@ -147,16 +202,22 @@ void SubstringSetMatcher::CreateFailureEdges() {
queue.pop();
for (Edges::const_iterator e = current_node.edges().begin();
e != current_node.edges().end(); ++e) {
- char edge_label = e->first;
- int leads_to = e->second;
+ const char& edge_label = e->first;
+ const unsigned& leads_to = e->second;
queue.push(leads_to);
- int failure = current_node.failure();
- while (!tree_[failure].HasEdge(edge_label) && failure != 0)
+ unsigned failure = current_node.failure();
+ unsigned edge_from_failure = tree_[failure].GetEdge(edge_label);
+ while (edge_from_failure == AhoCorasickNode::kInvalidIndex &&
+ failure != 0) {
failure = tree_[failure].failure();
+ edge_from_failure = tree_[failure].GetEdge(edge_label);
+ }
- int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ?
- tree_[failure].GetEdge(edge_label) : 0;
+ const unsigned follow_in_case_of_failure =
+ edge_from_failure != AhoCorasickNode::kInvalidIndex
+ ? edge_from_failure
+ : 0;
tree_[leads_to].set_failure(follow_in_case_of_failure);
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
}
@@ -164,7 +225,7 @@ void SubstringSetMatcher::CreateFailureEdges() {
}
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
- : failure_(-1) {}
+ : failure_(kInvalidIndex) {}
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
@@ -183,16 +244,13 @@ SubstringSetMatcher::AhoCorasickNode::operator=(
return *this;
}
-bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const {
- return edges_.find(c) != edges_.end();
-}
-
-int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
- std::map<char, int>::const_iterator i = edges_.find(c);
- return i == edges_.end() ? -1 : i->second;
+unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge(
+ char c) const {
+ Edges::const_iterator i = edges_.find(c);
+ return i == edges_.end() ? kInvalidIndex : i->second;
}
-void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) {
+void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) {
edges_[c] = node;
}

Powered by Google App Engine
This is Rietveld 408576698