Index: trunk/src/extensions/common/matcher/substring_set_matcher.cc |
=================================================================== |
--- trunk/src/extensions/common/matcher/substring_set_matcher.cc (revision 198873) |
+++ trunk/src/extensions/common/matcher/substring_set_matcher.cc (working copy) |
@@ -4,7 +4,6 @@ |
#include "extensions/common/matcher/substring_set_matcher.h" |
-#include <algorithm> |
#include <queue> |
#include "base/logging.h" |
@@ -12,51 +11,12 @@ |
namespace extensions { |
-namespace { |
- |
-// Compare StringPattern instances based on their string patterns. |
-bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
- return a->pattern() <= b->pattern(); |
-} |
- |
-// Given the set of patterns, compute how many nodes will the corresponding |
-// Aho-Corasick tree have. Note that |patterns| need to be sorted. |
-uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
- uint32 result = 1u; // 1 for the root node. |
- if (patterns.empty()) |
- return result; |
- |
- std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
- std::vector<const StringPattern*>::const_iterator current = last + 1; |
- // For the first pattern, each letter is a label of an edge to a new node. |
- result += (*last)->pattern().size(); |
- |
- // For the subsequent patterns, only count the edges which were not counted |
- // yet. For this it suffices to test against the previous pattern, because the |
- // patterns are sorted. |
- for (; current != patterns.end(); ++last, ++current) { |
- const std::string& last_pattern = (*last)->pattern(); |
- const std::string& current_pattern = (*current)->pattern(); |
- const uint32 prefix_bound = |
- std::min(last_pattern.size(), current_pattern.size()); |
- |
- uint32 common_prefix = 0; |
- while (common_prefix < prefix_bound && |
- last_pattern[common_prefix] == current_pattern[common_prefix]) |
- ++common_prefix; |
- result += current_pattern.size() - common_prefix; |
- } |
- return result; |
-} |
- |
-} // namespace |
- |
// |
// SubstringSetMatcher |
// |
SubstringSetMatcher::SubstringSetMatcher() { |
- RebuildAhoCorasickTree(SubstringPatternVector()); |
+ RebuildAhoCorasickTree(); |
} |
SubstringSetMatcher::~SubstringSetMatcher() {} |
@@ -89,44 +49,27 @@ |
patterns_.erase((*i)->id()); |
} |
- // Now we compute the total number of tree nodes needed. |
- SubstringPatternVector sorted_patterns; |
- sorted_patterns.resize(patterns_.size()); |
- |
- size_t next = 0; |
- for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
- i != patterns_.end(); |
- ++i, ++next) { |
- sorted_patterns[next] = i->second; |
- } |
- |
- std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
- tree_.reserve(TreeSize(sorted_patterns)); |
- |
- RebuildAhoCorasickTree(sorted_patterns); |
+ RebuildAhoCorasickTree(); |
} |
bool SubstringSetMatcher::Match(const std::string& text, |
std::set<StringPattern::ID>* matches) const { |
- const size_t old_number_of_matches = matches->size(); |
+ size_t old_number_of_matches = matches->size(); |
// Handle patterns matching the empty string. |
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
- uint32 current_node = 0; |
- for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
- uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
- while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
- current_node != 0) { |
+ int current_node = 0; |
+ size_t text_length = text.length(); |
+ for (size_t i = 0; i < text_length; ++i) { |
+ while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
current_node = tree_[current_node].failure(); |
- edge_from_current = tree_[current_node].GetEdge(*i); |
- } |
- if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
- current_node = edge_from_current; |
+ if (tree_[current_node].HasEdge(text[i])) { |
+ current_node = tree_[current_node].GetEdge(text[i]); |
matches->insert(tree_[current_node].matches().begin(), |
tree_[current_node].matches().end()); |
} else { |
- DCHECK_EQ(0u, current_node); |
+ DCHECK_EQ(0, current_node); |
} |
} |
@@ -138,8 +81,7 @@ |
return patterns_.empty() && tree_.size() == 1u; |
} |
-void SubstringSetMatcher::RebuildAhoCorasickTree( |
- const SubstringPatternVector& sorted_patterns) { |
+void SubstringSetMatcher::RebuildAhoCorasickTree() { |
tree_.clear(); |
// Initialize root note of tree. |
@@ -148,10 +90,9 @@ |
tree_.push_back(root); |
// Insert all patterns. |
- for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
- i != sorted_patterns.end(); |
- ++i) { |
- InsertPatternIntoAhoCorasickTree(*i); |
+ for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
+ i != patterns_.end(); ++i) { |
+ InsertPatternIntoAhoCorasickTree(i->second); |
} |
CreateFailureEdges(); |
@@ -160,27 +101,26 @@ |
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
const StringPattern* pattern) { |
const std::string& text = pattern->pattern(); |
- const std::string::const_iterator text_end = text.end(); |
+ size_t text_length = text.length(); |
// Iterators on the tree and the text. |
- uint32 current_node = 0; |
- std::string::const_iterator i = text.begin(); |
+ int current_node = 0; |
+ size_t text_pos = 0; |
// Follow existing paths for as long as possible. |
- uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
- while (i != text_end && |
- edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
- current_node = edge_from_current; |
- ++i; |
- edge_from_current = tree_[current_node].GetEdge(*i); |
+ while (text_pos < text_length && |
+ tree_[current_node].HasEdge(text[text_pos])) { |
+ current_node = tree_[current_node].GetEdge(text[text_pos]); |
+ ++text_pos; |
} |
// Create new nodes if necessary. |
- while (i != text_end) { |
- tree_.push_back(AhoCorasickNode()); |
- tree_[current_node].SetEdge(*i, tree_.size() - 1); |
+ while (text_pos < text_length) { |
+ AhoCorasickNode new_node; |
+ tree_.push_back(new_node); |
+ tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
current_node = tree_.size() - 1; |
- ++i; |
+ ++text_pos; |
} |
// Register match. |
@@ -190,14 +130,14 @@ |
void SubstringSetMatcher::CreateFailureEdges() { |
typedef AhoCorasickNode::Edges Edges; |
- std::queue<uint32> queue; |
+ std::queue<int> queue; |
AhoCorasickNode& root = tree_[0]; |
root.set_failure(0); |
- const Edges& root_edges = root.edges(); |
+ const AhoCorasickNode::Edges& root_edges = root.edges(); |
for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
++e) { |
- const uint32& leads_to = e->second; |
+ int leads_to = e->second; |
tree_[leads_to].set_failure(0); |
queue.push(leads_to); |
} |
@@ -207,32 +147,24 @@ |
queue.pop(); |
for (Edges::const_iterator e = current_node.edges().begin(); |
e != current_node.edges().end(); ++e) { |
- const char& edge_label = e->first; |
- const uint32& leads_to = e->second; |
+ char edge_label = e->first; |
+ int leads_to = e->second; |
queue.push(leads_to); |
- uint32 failure = current_node.failure(); |
- uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
- while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
- failure != 0) { |
+ int failure = current_node.failure(); |
+ while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
failure = tree_[failure].failure(); |
- edge_from_failure = tree_[failure].GetEdge(edge_label); |
- } |
- const uint32 follow_in_case_of_failure = |
- edge_from_failure != AhoCorasickNode::kNoSuchEdge |
- ? edge_from_failure |
- : 0; |
+ int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
+ tree_[failure].GetEdge(edge_label) : 0; |
tree_[leads_to].set_failure(follow_in_case_of_failure); |
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
} |
} |
} |
-const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
- |
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
- : failure_(kNoSuchEdge) {} |
+ : failure_(-1) {} |
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
@@ -251,12 +183,16 @@ |
return *this; |
} |
-uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
- Edges::const_iterator i = edges_.find(c); |
- return i == edges_.end() ? kNoSuchEdge : i->second; |
+bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
+ return edges_.find(c) != edges_.end(); |
} |
-void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
+int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
+ std::map<char, int>::const_iterator i = edges_.find(c); |
+ return i == edges_.end() ? -1 : i->second; |
+} |
+ |
+void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
edges_[c] = node; |
} |