Chromium Code Reviews| Index: extensions/common/matcher/substring_set_matcher.cc |
| diff --git a/extensions/common/matcher/substring_set_matcher.cc b/extensions/common/matcher/substring_set_matcher.cc |
| index b5fdae6da37f80c3b9a1b963e1cbb052e54af8b5..4c5108a086cb787184d7bf4ffc08a5e9cee7a3bb 100644 |
| --- a/extensions/common/matcher/substring_set_matcher.cc |
| +++ b/extensions/common/matcher/substring_set_matcher.cc |
| @@ -4,6 +4,7 @@ |
| #include "extensions/common/matcher/substring_set_matcher.h" |
| +#include <algorithm> |
| #include <queue> |
| #include "base/logging.h" |
| @@ -11,12 +12,51 @@ |
| namespace extensions { |
| +namespace { |
| + |
| +// Compare StringPattern instances based on their string patterns. |
| +bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| + return a->pattern() <= b->pattern(); |
|
vabr (Chromium)
2013/05/15 07:04:17
This needs to be strict, otherwise the sort is not
|
| +} |
| + |
| +// Given the set of patterns, compute how many nodes will the corresponding |
| +// Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| +uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
| + uint32 result = 1u; // 1 for the root node. |
| + if (patterns.empty()) |
| + return result; |
| + |
| + std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| + std::vector<const StringPattern*>::const_iterator current = last + 1; |
| + // For the first pattern, each letter is a label of an edge to a new node. |
| + result += (*last)->pattern().size(); |
| + |
| + // For the subsequent patterns, only count the edges which were not counted |
| + // yet. For this it suffices to test against the previous pattern, because the |
| + // patterns are sorted. |
| + for (; current != patterns.end(); ++last, ++current) { |
| + const std::string& last_pattern = (*last)->pattern(); |
| + const std::string& current_pattern = (*current)->pattern(); |
| + const uint32 prefix_bound = |
| + std::min(last_pattern.size(), current_pattern.size()); |
| + |
| + uint32 common_prefix = 0; |
| + while (common_prefix < prefix_bound && |
| + last_pattern[common_prefix] == current_pattern[common_prefix]) |
| + ++common_prefix; |
| + result += current_pattern.size() - common_prefix; |
| + } |
| + return result; |
| +} |
| + |
| +} // namespace |
| + |
| // |
| // SubstringSetMatcher |
| // |
| SubstringSetMatcher::SubstringSetMatcher() { |
| - RebuildAhoCorasickTree(); |
| + RebuildAhoCorasickTree(SubstringPatternVector()); |
| } |
| SubstringSetMatcher::~SubstringSetMatcher() {} |
| @@ -49,27 +89,44 @@ void SubstringSetMatcher::RegisterAndUnregisterPatterns( |
| patterns_.erase((*i)->id()); |
| } |
| - RebuildAhoCorasickTree(); |
| + // Now we compute the total number of tree nodes needed. |
| + SubstringPatternVector sorted_patterns; |
| + sorted_patterns.resize(patterns_.size()); |
| + |
| + size_t next = 0; |
| + for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
| + i != patterns_.end(); |
| + ++i, ++next) { |
| + sorted_patterns[next] = i->second; |
| + } |
| + |
| + std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
| + tree_.reserve(TreeSize(sorted_patterns)); |
| + |
| + RebuildAhoCorasickTree(sorted_patterns); |
| } |
| bool SubstringSetMatcher::Match(const std::string& text, |
| std::set<StringPattern::ID>* matches) const { |
| - size_t old_number_of_matches = matches->size(); |
| + const size_t old_number_of_matches = matches->size(); |
| // Handle patterns matching the empty string. |
| matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| - int current_node = 0; |
| - size_t text_length = text.length(); |
| - for (size_t i = 0; i < text_length; ++i) { |
| - while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
| + uint32 current_node = 0; |
| + for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| + uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
| + while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
| + current_node != 0) { |
| current_node = tree_[current_node].failure(); |
| - if (tree_[current_node].HasEdge(text[i])) { |
| - current_node = tree_[current_node].GetEdge(text[i]); |
| + edge_from_current = tree_[current_node].GetEdge(*i); |
| + } |
| + if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| + current_node = edge_from_current; |
| matches->insert(tree_[current_node].matches().begin(), |
| tree_[current_node].matches().end()); |
| } else { |
| - DCHECK_EQ(0, current_node); |
| + DCHECK_EQ(0u, current_node); |
| } |
| } |
| @@ -81,7 +138,8 @@ bool SubstringSetMatcher::IsEmpty() const { |
| return patterns_.empty() && tree_.size() == 1u; |
| } |
| -void SubstringSetMatcher::RebuildAhoCorasickTree() { |
| +void SubstringSetMatcher::RebuildAhoCorasickTree( |
| + const SubstringPatternVector& sorted_patterns) { |
| tree_.clear(); |
| // Initialize root note of tree. |
| @@ -90,9 +148,10 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
| tree_.push_back(root); |
| // Insert all patterns. |
| - for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
| - i != patterns_.end(); ++i) { |
| - InsertPatternIntoAhoCorasickTree(i->second); |
| + for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
| + i != sorted_patterns.end(); |
| + ++i) { |
| + InsertPatternIntoAhoCorasickTree(*i); |
| } |
| CreateFailureEdges(); |
| @@ -101,26 +160,27 @@ void SubstringSetMatcher::RebuildAhoCorasickTree() { |
| void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| const StringPattern* pattern) { |
| const std::string& text = pattern->pattern(); |
| - size_t text_length = text.length(); |
| + const std::string::const_iterator text_end = text.end(); |
| // Iterators on the tree and the text. |
| - int current_node = 0; |
| - size_t text_pos = 0; |
| + uint32 current_node = 0; |
| + std::string::const_iterator i = text.begin(); |
| // Follow existing paths for as long as possible. |
| - while (text_pos < text_length && |
| - tree_[current_node].HasEdge(text[text_pos])) { |
| - current_node = tree_[current_node].GetEdge(text[text_pos]); |
| - ++text_pos; |
| + uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
|
vabr (Chromium)
2013/05/15 07:04:17
Here I indeed dereferenced an invalid iterator if
|
| + while (i != text_end && |
| + edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| + current_node = edge_from_current; |
| + ++i; |
| + edge_from_current = tree_[current_node].GetEdge(*i); |
| } |
| // Create new nodes if necessary. |
| - while (text_pos < text_length) { |
| - AhoCorasickNode new_node; |
| - tree_.push_back(new_node); |
| - tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
| + while (i != text_end) { |
| + tree_.push_back(AhoCorasickNode()); |
| + tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| current_node = tree_.size() - 1; |
| - ++text_pos; |
| + ++i; |
| } |
| // Register match. |
| @@ -130,14 +190,14 @@ void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| void SubstringSetMatcher::CreateFailureEdges() { |
| typedef AhoCorasickNode::Edges Edges; |
| - std::queue<int> queue; |
| + std::queue<uint32> queue; |
| AhoCorasickNode& root = tree_[0]; |
| root.set_failure(0); |
| - const AhoCorasickNode::Edges& root_edges = root.edges(); |
| + const Edges& root_edges = root.edges(); |
| for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| ++e) { |
| - int leads_to = e->second; |
| + const uint32& leads_to = e->second; |
| tree_[leads_to].set_failure(0); |
| queue.push(leads_to); |
| } |
| @@ -147,24 +207,32 @@ void SubstringSetMatcher::CreateFailureEdges() { |
| queue.pop(); |
| for (Edges::const_iterator e = current_node.edges().begin(); |
| e != current_node.edges().end(); ++e) { |
| - char edge_label = e->first; |
| - int leads_to = e->second; |
| + const char& edge_label = e->first; |
| + const uint32& leads_to = e->second; |
| queue.push(leads_to); |
| - int failure = current_node.failure(); |
| - while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
| + uint32 failure = current_node.failure(); |
| + uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| + while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
| + failure != 0) { |
| failure = tree_[failure].failure(); |
| + edge_from_failure = tree_[failure].GetEdge(edge_label); |
| + } |
| - int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
| - tree_[failure].GetEdge(edge_label) : 0; |
| + const uint32 follow_in_case_of_failure = |
| + edge_from_failure != AhoCorasickNode::kNoSuchEdge |
| + ? edge_from_failure |
| + : 0; |
| tree_[leads_to].set_failure(follow_in_case_of_failure); |
| tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| } |
| } |
| } |
| +const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
| + |
| SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| - : failure_(-1) {} |
| + : failure_(kNoSuchEdge) {} |
| SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| @@ -183,16 +251,12 @@ SubstringSetMatcher::AhoCorasickNode::operator=( |
| return *this; |
| } |
| -bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
| - return edges_.find(c) != edges_.end(); |
| -} |
| - |
| -int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| - std::map<char, int>::const_iterator i = edges_.find(c); |
| - return i == edges_.end() ? -1 : i->second; |
| +uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| + Edges::const_iterator i = edges_.find(c); |
| + return i == edges_.end() ? kNoSuchEdge : i->second; |
| } |
| -void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
| +void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
| edges_[c] = node; |
| } |