Index: extensions/common/matcher/substring_set_matcher.cc |
diff --git a/extensions/common/matcher/substring_set_matcher.cc b/extensions/common/matcher/substring_set_matcher.cc |
index b5fdae6da37f80c3b9a1b963e1cbb052e54af8b5..a71bce19f872382f53e7ff6fa34df7ef1d573197 100644 |
--- a/extensions/common/matcher/substring_set_matcher.cc |
+++ b/extensions/common/matcher/substring_set_matcher.cc |
@@ -4,6 +4,7 @@ |
#include "extensions/common/matcher/substring_set_matcher.h" |
+#include <algorithm> |
#include <queue> |
#include "base/logging.h" |
@@ -11,6 +12,44 @@ |
namespace extensions { |
+namespace { |
+ |
+// Compare StringPattern instances based on their string patterns. |
+bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
+ return a->pattern() <= b->pattern(); |
+} |
+ |
+// Given the set of patterns, compute how many nodes will the corresponding |
+// Aho-Corasick tree have. Note that |patterns| need to be sorted. |
+unsigned TreeSize(const std::vector<const StringPattern*>& patterns) { |
battre
2013/05/03 21:59:12
This should be size_t; also in all other places.
vabr (Chromium)
2013/05/05 23:36:49
On my machine, size_t is 8 bytes, unsigned is only
|
+ unsigned result = 1u; // 1 for the root node. |
+ if (patterns.empty()) |
+ return result; |
+ |
+ std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
+ std::vector<const StringPattern*>::const_iterator current = last + 1; |
+ // For the first pattern, each letter is a label of an edge to a new node. |
+ result += (*last)->pattern().size(); |
+ |
+ // For the subsequent patterns, only count the edges which were not counted |
+ // yet. For this it suffices to test against the previous pattern, because the |
+ // patterns are sorted. |
+ for (; current != patterns.end(); ++last, ++current) { |
+ const std::string& last_s = (*last)->pattern(); |
battre
2013/05/03 21:59:12
what does "_s" stand for? Can you pick a name with
vabr (Chromium)
2013/05/05 23:36:49
Done.
|
+ const std::string& current_s = (*current)->pattern(); |
+ const unsigned prefix_bound = std::min(last_s.size(), current_s.size()); |
+ |
+ unsigned common_prefix = 0; |
+ while (common_prefix < prefix_bound && |
+ last_s[common_prefix] == current_s[common_prefix]) |
+ ++common_prefix; |
+ result += current_s.size() - common_prefix; |
+ } |
+ return result; |
+} |
+ |
+} // namespace |
+ |
// |
// SubstringSetMatcher |
// |
@@ -49,27 +88,43 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns( |
patterns_.erase((*i)->id()); |
} |
+ // We compute the total number of tree nodes needed by using this vector. |
+ std::vector<const StringPattern*> sorted_patterns(patterns_.size()); |
+ // First, fill it from |patterns_|. |
+ size_t next = 0; |
+ for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
+ i != patterns_.end(); |
+ ++i, ++next) { |
+ sorted_patterns[next] = i->second; |
+ } |
+ |
+ std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
+ tree_.reserve(TreeSize(sorted_patterns)); |
+ |
RebuildAhoCorasickTree(); |
} |
bool SubstringSetMatcher::Match(const std::string& text, |
std::set<StringPattern::ID>* matches) const { |
- size_t old_number_of_matches = matches->size(); |
+ const size_t old_number_of_matches = matches->size(); |
// Handle patterns matching the empty string. |
matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
- int current_node = 0; |
- size_t text_length = text.length(); |
- for (size_t i = 0; i < text_length; ++i) { |
- while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
+ unsigned current_node = 0; |
+ for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
+ unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
+ while (edge_from_current == AhoCorasickNode::kInvalidIndex && |
+ current_node != 0) { |
current_node = tree_[current_node].failure(); |
- if (tree_[current_node].HasEdge(text[i])) { |
- current_node = tree_[current_node].GetEdge(text[i]); |
+ edge_from_current = tree_[current_node].GetEdge(*i); |
+ } |
+ if (edge_from_current != AhoCorasickNode::kInvalidIndex) { |
+ current_node = edge_from_current; |
matches->insert(tree_[current_node].matches().begin(), |
tree_[current_node].matches().end()); |
} else { |
- DCHECK_EQ(0, current_node); |
+ DCHECK_EQ(0u, current_node); |
} |
} |
@@ -101,26 +156,26 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
const StringPattern* pattern) { |
const std::string& text = pattern->pattern(); |
- size_t text_length = text.length(); |
// Iterators on the tree and the text. |
- int current_node = 0; |
- size_t text_pos = 0; |
+ unsigned current_node = 0; |
+ std::string::const_iterator i = text.begin(); |
// Follow existing paths for as long as possible. |
- while (text_pos < text_length && |
- tree_[current_node].HasEdge(text[text_pos])) { |
- current_node = tree_[current_node].GetEdge(text[text_pos]); |
- ++text_pos; |
+ unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
+ while (i != text.end() && |
battre
2013/05/03 21:59:12
text is immutable. Should we have a const std::str
vabr (Chromium)
2013/05/05 23:36:49
It does not seem to make a difference performance-
|
+ edge_from_current != AhoCorasickNode::kInvalidIndex) { |
+ current_node = edge_from_current; |
+ ++i; |
+ edge_from_current = tree_[current_node].GetEdge(*i); |
} |
// Create new nodes if necessary. |
- while (text_pos < text_length) { |
- AhoCorasickNode new_node; |
- tree_.push_back(new_node); |
- tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
+ while (i != text.end()) { |
+ tree_.push_back(AhoCorasickNode()); |
+ tree_[current_node].SetEdge(*i, tree_.size() - 1); |
current_node = tree_.size() - 1; |
- ++text_pos; |
+ ++i; |
} |
// Register match. |
@@ -130,14 +185,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
void SubstringSetMatcher::CreateFailureEdges() { |
typedef AhoCorasickNode::Edges Edges; |
- std::queue<int> queue; |
+ std::queue<unsigned> queue; |
AhoCorasickNode& root = tree_[0]; |
root.set_failure(0); |
- const AhoCorasickNode::Edges& root_edges = root.edges(); |
+ const Edges& root_edges = root.edges(); |
for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
++e) { |
- int leads_to = e->second; |
+ const unsigned& leads_to = e->second; |
vabr (Chromium)
2013/05/03 20:37:27
I know this is uncommon. However, this method is a
battre
2013/05/03 21:59:12
Have you checked what happens if you used "const i
vabr (Chromium)
2013/05/05 23:36:49
Are you asking about removing the reference, or mo
|
tree_[leads_to].set_failure(0); |
queue.push(leads_to); |
} |
@@ -147,16 +202,22 @@ void SubstringSetMatcher::CreateFailureEdges() { |
queue.pop(); |
for (Edges::const_iterator e = current_node.edges().begin(); |
e != current_node.edges().end(); ++e) { |
- char edge_label = e->first; |
- int leads_to = e->second; |
+ const char& edge_label = e->first; |
+ const unsigned& leads_to = e->second; |
queue.push(leads_to); |
- int failure = current_node.failure(); |
- while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
+ unsigned failure = current_node.failure(); |
+ unsigned edge_from_failure = tree_[failure].GetEdge(edge_label); |
+ while (edge_from_failure == AhoCorasickNode::kInvalidIndex && |
+ failure != 0) { |
failure = tree_[failure].failure(); |
+ edge_from_failure = tree_[failure].GetEdge(edge_label); |
+ } |
- int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
- tree_[failure].GetEdge(edge_label) : 0; |
+ const unsigned follow_in_case_of_failure = |
+ edge_from_failure != AhoCorasickNode::kInvalidIndex |
+ ? edge_from_failure |
+ : 0; |
tree_[leads_to].set_failure(follow_in_case_of_failure); |
tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
} |
@@ -164,7 +225,7 @@ void SubstringSetMatcher::CreateFailureEdges() { |
} |
SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
- : failure_(-1) {} |
+ : failure_(kInvalidIndex) {} |
SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
@@ -183,16 +244,13 @@ SubstringSetMatcher::AhoCorasickNode::operator=( |
return *this; |
} |
-bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
- return edges_.find(c) != edges_.end(); |
-} |
- |
-int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
- std::map<char, int>::const_iterator i = edges_.find(c); |
- return i == edges_.end() ? -1 : i->second; |
+unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge( |
+ char c) const { |
+ Edges::const_iterator i = edges_.find(c); |
+ return i == edges_.end() ? kInvalidIndex : i->second; |
} |
-void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
+void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) { |
edges_[c] = node; |
} |