Index: extensions/common/matcher/substring_set_matcher.cc |
diff --git a/extensions/common/matcher/substring_set_matcher.cc b/extensions/common/matcher/substring_set_matcher.cc |
index b5fdae6da37f80c3b9a1b963e1cbb052e54af8b5..4c5108a086cb787184d7bf4ffc08a5e9cee7a3bb 100644 |
--- a/extensions/common/matcher/substring_set_matcher.cc |
+++ b/extensions/common/matcher/substring_set_matcher.cc |
@@ -4,6 +4,7 @@ |
#include "extensions/common/matcher/substring_set_matcher.h" |
+#include <algorithm> |
#include <queue> |
#include "base/logging.h" |
@@ -11,12 +12,51 @@ |
namespace extensions { |
+namespace { |
+ |
+// Compare StringPattern instances based on their string patterns. |
+bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
+ return a->pattern() <= b->pattern(); |
+} |
+ |
+// Given the set of patterns, compute how many nodes will the corresponding |
+// Aho-Corasick tree have. Note that |patterns| need to be sorted. |
+uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
+ uint32 result = 1u; // 1 for the root node. |
+ if (patterns.empty()) |
+ return result; |
+ |
+ std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
+ std::vector<const StringPattern*>::const_iterator current = last + 1; |
+ // For the first pattern, each letter is a label of an edge to a new node. |
+ result += (*last)->pattern().size(); |
+ |
+ // For the subsequent patterns, only count the edges which were not counted |
+ // yet. For this it suffices to test against the previous pattern, because the |
+ // patterns are sorted. |
+ for (; current != patterns.end(); ++last, ++current) { |
+ const std::string& last_pattern = (*last)->pattern(); |
+ const std::string& current_pattern = (*current)->pattern(); |
+ const uint32 prefix_bound = |
+ std::min(last_pattern.size(), current_pattern.size()); |
+ |
+ uint32 common_prefix = 0; |
+ while (common_prefix < prefix_bound && |
+ last_pattern[common_prefix] == current_pattern[common_prefix]) |
+ ++common_prefix; |
+ result += current_pattern.size() - common_prefix; |
+ } |
+ return result; |
+} |
+ |
+} // namespace |
+ |
// |
// SubstringSetMatcher |
// |
SubstringSetMatcher::SubstringSetMatcher() { |
- RebuildAhoCorasickTree(); |
+ RebuildAhoCorasickTree(SubstringPatternVector()); |
} |
SubstringSetMatcher::~SubstringSetMatcher() {} |
@@ -49,27 +89,44 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns( |
patterns_.erase((*i)->id()); |
} |
- RebuildAhoCorasickTree(); |
+ // Now we compute the total number of tree nodes needed. |
+ SubstringPatternVector sorted_patterns; |
+ sorted_patterns.resize(patterns_.size()); |
+ |
+ size_t next = 0; |
+ for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
+ i != patterns_.end(); |
+ ++i, ++next) { |
+ sorted_patterns[next] = i->second; |
+ } |
+ |
+ std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
+ tree_.reserve(TreeSize(sorted_patterns)); |
+ |
+ RebuildAhoCorasickTree(sorted_patterns); |
} |
bool SubstringSetMatcher::Match(const std::string& text, |
std::set<StringPattern::ID>* matches) const { |
- size_t old_number_of_matches = matches->size(); |
+ const size_t old_number_of_matches = matches->size(); |
// Handle patterns matching the empty string. |
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
- int current_node = 0; |
- size_t text_length = text.length(); |
- for (size_t i = 0; i < text_length; ++i) { |
- while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
+ uint32 current_node = 0; |
+ for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
+ uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
+ while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
+ current_node != 0) { |
current_node = tree_[current_node].failure(); |
- if (tree_[current_node].HasEdge(text[i])) { |
- current_node = tree_[current_node].GetEdge(text[i]); |
+ edge_from_current = tree_[current_node].GetEdge(*i); |
+ } |
+ if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
+ current_node = edge_from_current; |
matches->insert(tree_[current_node].matches().begin(), |
tree_[current_node].matches().end()); |
} else { |
- DCHECK_EQ(0, current_node); |
+ DCHECK_EQ(0u, current_node); |
} |
} |
@@ -81,7 +138,8 @@ bool SubstringSetMatcher::IsEmpty() const { |
return patterns_.empty() && tree_.size() == 1u; |
} |
-void SubstringSetMatcher::RebuildAhoCorasickTree() { |
+void SubstringSetMatcher::RebuildAhoCorasickTree( |
+ const SubstringPatternVector& sorted_patterns) { |
tree_.clear(); |
// Initialize root note of tree. |
@@ -90,9 +148,10 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
tree_.push_back(root); |
// Insert all patterns. |
- for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
- i != patterns_.end(); ++i) { |
- InsertPatternIntoAhoCorasickTree(i->second); |
+ for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
+ i != sorted_patterns.end(); |
+ ++i) { |
+ InsertPatternIntoAhoCorasickTree(*i); |
} |
CreateFailureEdges(); |
@@ -101,26 +160,27 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
const StringPattern* pattern) { |
const std::string& text = pattern->pattern(); |
- size_t text_length = text.length(); |
+ const std::string::const_iterator text_end = text.end(); |
// Iterators on the tree and the text. |
- int current_node = 0; |
- size_t text_pos = 0; |
+ uint32 current_node = 0; |
+ std::string::const_iterator i = text.begin(); |
// Follow existing paths for as long as possible. |
- while (text_pos < text_length && |
- tree_[current_node].HasEdge(text[text_pos])) { |
- current_node = tree_[current_node].GetEdge(text[text_pos]); |
- ++text_pos; |
+ uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
+ while (i != text_end && |
+ edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
+ current_node = edge_from_current; |
+ ++i; |
+ edge_from_current = tree_[current_node].GetEdge(*i); |
} |
// Create new nodes if necessary. |
- while (text_pos < text_length) { |
- AhoCorasickNode new_node; |
- tree_.push_back(new_node); |
- tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
+ while (i != text_end) { |
+ tree_.push_back(AhoCorasickNode()); |
+ tree_[current_node].SetEdge(*i, tree_.size() - 1); |
current_node = tree_.size() - 1; |
- ++text_pos; |
+ ++i; |
} |
// Register match. |
@@ -130,14 +190,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
void SubstringSetMatcher::CreateFailureEdges() { |
typedef AhoCorasickNode::Edges Edges; |
- std::queue<int> queue; |
+ std::queue<uint32> queue; |
AhoCorasickNode& root = tree_[0]; |
root.set_failure(0); |
- const AhoCorasickNode::Edges& root_edges = root.edges(); |
+ const Edges& root_edges = root.edges(); |
for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
++e) { |
- int leads_to = e->second; |
+ const uint32& leads_to = e->second; |
tree_[leads_to].set_failure(0); |
queue.push(leads_to); |
} |
@@ -147,24 +207,32 @@ void SubstringSetMatcher::CreateFailureEdges() { |
queue.pop(); |
for (Edges::const_iterator e = current_node.edges().begin(); |
e != current_node.edges().end(); ++e) { |
- char edge_label = e->first; |
- int leads_to = e->second; |
+ const char& edge_label = e->first; |
+ const uint32& leads_to = e->second; |
queue.push(leads_to); |
- int failure = current_node.failure(); |
- while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
+ uint32 failure = current_node.failure(); |
+ uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
+ while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
+ failure != 0) { |
failure = tree_[failure].failure(); |
+ edge_from_failure = tree_[failure].GetEdge(edge_label); |
+ } |
- int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
- tree_[failure].GetEdge(edge_label) : 0; |
+ const uint32 follow_in_case_of_failure = |
+ edge_from_failure != AhoCorasickNode::kNoSuchEdge |
+ ? edge_from_failure |
+ : 0; |
tree_[leads_to].set_failure(follow_in_case_of_failure); |
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
} |
} |
} |
+const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
+ |
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
- : failure_(-1) {} |
+ : failure_(kNoSuchEdge) {} |
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
@@ -183,16 +251,12 @@ SubstringSetMatcher::AhoCorasickNode::operator=( |
return *this; |
} |
-bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
- return edges_.find(c) != edges_.end(); |
-} |
- |
-int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
- std::map<char, int>::const_iterator i = edges_.find(c); |
- return i == edges_.end() ? -1 : i->second; |
+uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
+ Edges::const_iterator i = edges_.find(c); |
+ return i == edges_.end() ? kNoSuchEdge : i->second; |
} |
-void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
+void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
edges_[c] = node; |
} |