Chromium Code Reviews| Index: extensions/common/matcher/substring_set_matcher.cc |
| diff --git a/extensions/common/matcher/substring_set_matcher.cc b/extensions/common/matcher/substring_set_matcher.cc |
| index b5fdae6da37f80c3b9a1b963e1cbb052e54af8b5..a71bce19f872382f53e7ff6fa34df7ef1d573197 100644 |
| --- a/extensions/common/matcher/substring_set_matcher.cc |
| +++ b/extensions/common/matcher/substring_set_matcher.cc |
| @@ -4,6 +4,7 @@ |
| #include "extensions/common/matcher/substring_set_matcher.h" |
| +#include <algorithm> |
| #include <queue> |
| #include "base/logging.h" |
| @@ -11,6 +12,44 @@ |
| namespace extensions { |
| +namespace { |
| + |
| +// Compare StringPattern instances based on their string patterns. |
| +bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| + return a->pattern() <= b->pattern(); |
| +} |
| + |
| +// Given the set of patterns, compute how many nodes will the corresponding |
| +// Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| +unsigned TreeSize(const std::vector<const StringPattern*>& patterns) { |
|
battre
2013/05/03 21:59:12
This should be size_t; also in all other places.
vabr (Chromium)
2013/05/05 23:36:49
On my machine, size_t is 8 bytes, unsigned is only
|
| + unsigned result = 1u; // 1 for the root node. |
| + if (patterns.empty()) |
| + return result; |
| + |
| + std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| + std::vector<const StringPattern*>::const_iterator current = last + 1; |
| + // For the first pattern, each letter is a label of an edge to a new node. |
| + result += (*last)->pattern().size(); |
| + |
| + // For the subsequent patterns, only count the edges which were not counted |
| + // yet. For this it suffices to test against the previous pattern, because the |
| + // patterns are sorted. |
| + for (; current != patterns.end(); ++last, ++current) { |
| + const std::string& last_s = (*last)->pattern(); |
|
battre
2013/05/03 21:59:12
what does "_s" stand for? Can you pick a name with
vabr (Chromium)
2013/05/05 23:36:49
Done.
|
| + const std::string& current_s = (*current)->pattern(); |
| + const unsigned prefix_bound = std::min(last_s.size(), current_s.size()); |
| + |
| + unsigned common_prefix = 0; |
| + while (common_prefix < prefix_bound && |
| + last_s[common_prefix] == current_s[common_prefix]) |
| + ++common_prefix; |
| + result += current_s.size() - common_prefix; |
| + } |
| + return result; |
| +} |
| + |
| +} // namespace |
| + |
| // |
| // SubstringSetMatcher |
| // |
| @@ -49,27 +88,43 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns( |
| patterns_.erase((*i)->id()); |
| } |
| + // We compute the total number of tree nodes needed by using this vector. |
| + std::vector<const StringPattern*> sorted_patterns(patterns_.size()); |
| + // First, fill it from |patterns_|. |
| + size_t next = 0; |
| + for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
| + i != patterns_.end(); |
| + ++i, ++next) { |
| + sorted_patterns[next] = i->second; |
| + } |
| + |
| + std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
| + tree_.reserve(TreeSize(sorted_patterns)); |
| + |
| RebuildAhoCorasickTree(); |
| } |
| bool SubstringSetMatcher::Match(const std::string& text, |
| std::set<StringPattern::ID>* matches) const { |
| - size_t old_number_of_matches = matches->size(); |
| + const size_t old_number_of_matches = matches->size(); |
| // Handle patterns matching the empty string. |
| matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| - int current_node = 0; |
| - size_t text_length = text.length(); |
| - for (size_t i = 0; i < text_length; ++i) { |
| - while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
| + unsigned current_node = 0; |
| + for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| + unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
| + while (edge_from_current == AhoCorasickNode::kInvalidIndex && |
| + current_node != 0) { |
| current_node = tree_[current_node].failure(); |
| - if (tree_[current_node].HasEdge(text[i])) { |
| - current_node = tree_[current_node].GetEdge(text[i]); |
| + edge_from_current = tree_[current_node].GetEdge(*i); |
| + } |
| + if (edge_from_current != AhoCorasickNode::kInvalidIndex) { |
| + current_node = edge_from_current; |
| matches->insert(tree_[current_node].matches().begin(), |
| tree_[current_node].matches().end()); |
| } else { |
| - DCHECK_EQ(0, current_node); |
| + DCHECK_EQ(0u, current_node); |
| } |
| } |
| @@ -101,26 +156,26 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
| void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| const StringPattern* pattern) { |
| const std::string& text = pattern->pattern(); |
| - size_t text_length = text.length(); |
| // Iterators on the tree and the text. |
| - int current_node = 0; |
| - size_t text_pos = 0; |
| + unsigned current_node = 0; |
| + std::string::const_iterator i = text.begin(); |
| // Follow existing paths for as long as possible. |
| - while (text_pos < text_length && |
| - tree_[current_node].HasEdge(text[text_pos])) { |
| - current_node = tree_[current_node].GetEdge(text[text_pos]); |
| - ++text_pos; |
| + unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
| + while (i != text.end() && |
|
battre
2013/05/03 21:59:12
text is immutable. Should we have a const std::str
vabr (Chromium)
2013/05/05 23:36:49
It does not seem to make a difference performance-
|
| + edge_from_current != AhoCorasickNode::kInvalidIndex) { |
| + current_node = edge_from_current; |
| + ++i; |
| + edge_from_current = tree_[current_node].GetEdge(*i); |
| } |
| // Create new nodes if necessary. |
| - while (text_pos < text_length) { |
| - AhoCorasickNode new_node; |
| - tree_.push_back(new_node); |
| - tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
| + while (i != text.end()) { |
| + tree_.push_back(AhoCorasickNode()); |
| + tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| current_node = tree_.size() - 1; |
| - ++text_pos; |
| + ++i; |
| } |
| // Register match. |
| @@ -130,14 +185,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| void SubstringSetMatcher::CreateFailureEdges() { |
| typedef AhoCorasickNode::Edges Edges; |
| - std::queue<int> queue; |
| + std::queue<unsigned> queue; |
| AhoCorasickNode& root = tree_[0]; |
| root.set_failure(0); |
| - const AhoCorasickNode::Edges& root_edges = root.edges(); |
| + const Edges& root_edges = root.edges(); |
| for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| ++e) { |
| - int leads_to = e->second; |
| + const unsigned& leads_to = e->second; |
|
vabr (Chromium)
2013/05/03 20:37:27
I know this is uncommon. However, this method is a
battre
2013/05/03 21:59:12
Have you checked what happens if you used "const i
vabr (Chromium)
2013/05/05 23:36:49
Are you asking about removing the reference, or mo
|
| tree_[leads_to].set_failure(0); |
| queue.push(leads_to); |
| } |
| @@ -147,16 +202,22 @@ void SubstringSetMatcher::CreateFailureEdges() { |
| queue.pop(); |
| for (Edges::const_iterator e = current_node.edges().begin(); |
| e != current_node.edges().end(); ++e) { |
| - char edge_label = e->first; |
| - int leads_to = e->second; |
| + const char& edge_label = e->first; |
| + const unsigned& leads_to = e->second; |
| queue.push(leads_to); |
| - int failure = current_node.failure(); |
| - while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
| + unsigned failure = current_node.failure(); |
| + unsigned edge_from_failure = tree_[failure].GetEdge(edge_label); |
| + while (edge_from_failure == AhoCorasickNode::kInvalidIndex && |
| + failure != 0) { |
| failure = tree_[failure].failure(); |
| + edge_from_failure = tree_[failure].GetEdge(edge_label); |
| + } |
| - int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
| - tree_[failure].GetEdge(edge_label) : 0; |
| + const unsigned follow_in_case_of_failure = |
| + edge_from_failure != AhoCorasickNode::kInvalidIndex |
| + ? edge_from_failure |
| + : 0; |
| tree_[leads_to].set_failure(follow_in_case_of_failure); |
| tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| } |
| @@ -164,7 +225,7 @@ void SubstringSetMatcher::CreateFailureEdges() { |
| } |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| - : failure_(-1) {} |
| + : failure_(kInvalidIndex) {} |
| SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| @@ -183,16 +244,13 @@ SubstringSetMatcher::AhoCorasickNode::operator=( |
| return *this; |
| } |
| -bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
| - return edges_.find(c) != edges_.end(); |
| -} |
| - |
| -int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| - std::map<char, int>::const_iterator i = edges_.find(c); |
| - return i == edges_.end() ? -1 : i->second; |
| +unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge( |
| + char c) const { |
| + Edges::const_iterator i = edges_.find(c); |
| + return i == edges_.end() ? kInvalidIndex : i->second; |
| } |
| -void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
| +void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) { |
| edges_[c] = node; |
| } |