Chromium Code Reviews| OLD | NEW |
|---|---|
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
| 6 | 6 |
| 7 #include <algorithm> | |
| 7 #include <queue> | 8 #include <queue> |
| 8 | 9 |
| 9 #include "base/logging.h" | 10 #include "base/logging.h" |
| 10 #include "base/stl_util.h" | 11 #include "base/stl_util.h" |
| 11 | 12 |
| 12 namespace extensions { | 13 namespace extensions { |
| 13 | 14 |
| 15 namespace { | |
| 16 | |
| 17 // Compare StringPattern instances based on their string patterns. | |
| 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | |
| 19 return a->pattern() <= b->pattern(); | |
| 20 } | |
| 21 | |
| 22 // Given the set of patterns, compute how many nodes will the corresponding | |
| 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | |
| 24 unsigned TreeSize(const std::vector<const StringPattern*>& patterns) { | |
|
battre
2013/05/03 21:59:12
This should be size_t; also in all other places.
vabr (Chromium)
2013/05/05 23:36:49
On my machine, size_t is 8 bytes, unsigned is only
| |
| 25 unsigned result = 1u; // 1 for the root node. | |
| 26 if (patterns.empty()) | |
| 27 return result; | |
| 28 | |
| 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | |
| 30 std::vector<const StringPattern*>::const_iterator current = last + 1; | |
| 31 // For the first pattern, each letter is a label of an edge to a new node. | |
| 32 result += (*last)->pattern().size(); | |
| 33 | |
| 34 // For the subsequent patterns, only count the edges which were not counted | |
| 35 // yet. For this it suffices to test against the previous pattern, because the | |
| 36 // patterns are sorted. | |
| 37 for (; current != patterns.end(); ++last, ++current) { | |
| 38 const std::string& last_s = (*last)->pattern(); | |
|
battre
2013/05/03 21:59:12
what does "_s" stand for? Can you pick a name with
vabr (Chromium)
2013/05/05 23:36:49
Done.
| |
| 39 const std::string& current_s = (*current)->pattern(); | |
| 40 const unsigned prefix_bound = std::min(last_s.size(), current_s.size()); | |
| 41 | |
| 42 unsigned common_prefix = 0; | |
| 43 while (common_prefix < prefix_bound && | |
| 44 last_s[common_prefix] == current_s[common_prefix]) | |
| 45 ++common_prefix; | |
| 46 result += current_s.size() - common_prefix; | |
| 47 } | |
| 48 return result; | |
| 49 } | |
| 50 | |
| 51 } // namespace | |
| 52 | |
| 14 // | 53 // |
| 15 // SubstringSetMatcher | 54 // SubstringSetMatcher |
| 16 // | 55 // |
| 17 | 56 |
| 18 SubstringSetMatcher::SubstringSetMatcher() { | 57 SubstringSetMatcher::SubstringSetMatcher() { |
| 19 RebuildAhoCorasickTree(); | 58 RebuildAhoCorasickTree(); |
| 20 } | 59 } |
| 21 | 60 |
| 22 SubstringSetMatcher::~SubstringSetMatcher() {} | 61 SubstringSetMatcher::~SubstringSetMatcher() {} |
| 23 | 62 |
| (...skipping 18 matching lines...) Expand all Loading... | |
| 42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 81 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
| 43 patterns_[(*i)->id()] = *i; | 82 patterns_[(*i)->id()] = *i; |
| 44 } | 83 } |
| 45 | 84 |
| 46 // Unregister patterns | 85 // Unregister patterns |
| 47 for (std::vector<const StringPattern*>::const_iterator i = | 86 for (std::vector<const StringPattern*>::const_iterator i = |
| 48 to_unregister.begin(); i != to_unregister.end(); ++i) { | 87 to_unregister.begin(); i != to_unregister.end(); ++i) { |
| 49 patterns_.erase((*i)->id()); | 88 patterns_.erase((*i)->id()); |
| 50 } | 89 } |
| 51 | 90 |
| 91 // We compute the total number of tree nodes needed by using this vector. | |
| 92 std::vector<const StringPattern*> sorted_patterns(patterns_.size()); | |
| 93 // First, fill it from |patterns_|. | |
| 94 size_t next = 0; | |
| 95 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | |
| 96 i != patterns_.end(); | |
| 97 ++i, ++next) { | |
| 98 sorted_patterns[next] = i->second; | |
| 99 } | |
| 100 | |
| 101 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); | |
| 102 tree_.reserve(TreeSize(sorted_patterns)); | |
| 103 | |
| 52 RebuildAhoCorasickTree(); | 104 RebuildAhoCorasickTree(); |
| 53 } | 105 } |
| 54 | 106 |
| 55 bool SubstringSetMatcher::Match(const std::string& text, | 107 bool SubstringSetMatcher::Match(const std::string& text, |
| 56 std::set<StringPattern::ID>* matches) const { | 108 std::set<StringPattern::ID>* matches) const { |
| 57 size_t old_number_of_matches = matches->size(); | 109 const size_t old_number_of_matches = matches->size(); |
| 58 | 110 |
| 59 // Handle patterns matching the empty string. | 111 // Handle patterns matching the empty string. |
| 60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 112 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| 61 | 113 |
| 62 int current_node = 0; | 114 unsigned current_node = 0; |
| 63 size_t text_length = text.length(); | 115 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| 64 for (size_t i = 0; i < text_length; ++i) { | 116 unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
| 65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) | 117 while (edge_from_current == AhoCorasickNode::kInvalidIndex && |
| 118 current_node != 0) { | |
| 66 current_node = tree_[current_node].failure(); | 119 current_node = tree_[current_node].failure(); |
| 67 if (tree_[current_node].HasEdge(text[i])) { | 120 edge_from_current = tree_[current_node].GetEdge(*i); |
| 68 current_node = tree_[current_node].GetEdge(text[i]); | 121 } |
| 122 if (edge_from_current != AhoCorasickNode::kInvalidIndex) { | |
| 123 current_node = edge_from_current; | |
| 69 matches->insert(tree_[current_node].matches().begin(), | 124 matches->insert(tree_[current_node].matches().begin(), |
| 70 tree_[current_node].matches().end()); | 125 tree_[current_node].matches().end()); |
| 71 } else { | 126 } else { |
| 72 DCHECK_EQ(0, current_node); | 127 DCHECK_EQ(0u, current_node); |
| 73 } | 128 } |
| 74 } | 129 } |
| 75 | 130 |
| 76 return old_number_of_matches != matches->size(); | 131 return old_number_of_matches != matches->size(); |
| 77 } | 132 } |
| 78 | 133 |
| 79 bool SubstringSetMatcher::IsEmpty() const { | 134 bool SubstringSetMatcher::IsEmpty() const { |
| 80 // An empty tree consists of only the root node. | 135 // An empty tree consists of only the root node. |
| 81 return patterns_.empty() && tree_.size() == 1u; | 136 return patterns_.empty() && tree_.size() == 1u; |
| 82 } | 137 } |
| (...skipping 11 matching lines...) Expand all Loading... | |
| 94 i != patterns_.end(); ++i) { | 149 i != patterns_.end(); ++i) { |
| 95 InsertPatternIntoAhoCorasickTree(i->second); | 150 InsertPatternIntoAhoCorasickTree(i->second); |
| 96 } | 151 } |
| 97 | 152 |
| 98 CreateFailureEdges(); | 153 CreateFailureEdges(); |
| 99 } | 154 } |
| 100 | 155 |
| 101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 156 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| 102 const StringPattern* pattern) { | 157 const StringPattern* pattern) { |
| 103 const std::string& text = pattern->pattern(); | 158 const std::string& text = pattern->pattern(); |
| 104 size_t text_length = text.length(); | |
| 105 | 159 |
| 106 // Iterators on the tree and the text. | 160 // Iterators on the tree and the text. |
| 107 int current_node = 0; | 161 unsigned current_node = 0; |
| 108 size_t text_pos = 0; | 162 std::string::const_iterator i = text.begin(); |
| 109 | 163 |
| 110 // Follow existing paths for as long as possible. | 164 // Follow existing paths for as long as possible. |
| 111 while (text_pos < text_length && | 165 unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
| 112 tree_[current_node].HasEdge(text[text_pos])) { | 166 while (i != text.end() && |
|
battre
2013/05/03 21:59:12
text is immutable. Should we have a const std::str
vabr (Chromium)
2013/05/05 23:36:49
It does not seem to make a difference performance-
| |
| 113 current_node = tree_[current_node].GetEdge(text[text_pos]); | 167 edge_from_current != AhoCorasickNode::kInvalidIndex) { |
| 114 ++text_pos; | 168 current_node = edge_from_current; |
| 169 ++i; | |
| 170 edge_from_current = tree_[current_node].GetEdge(*i); | |
| 115 } | 171 } |
| 116 | 172 |
| 117 // Create new nodes if necessary. | 173 // Create new nodes if necessary. |
| 118 while (text_pos < text_length) { | 174 while (i != text.end()) { |
| 119 AhoCorasickNode new_node; | 175 tree_.push_back(AhoCorasickNode()); |
| 120 tree_.push_back(new_node); | 176 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| 121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); | |
| 122 current_node = tree_.size() - 1; | 177 current_node = tree_.size() - 1; |
| 123 ++text_pos; | 178 ++i; |
| 124 } | 179 } |
| 125 | 180 |
| 126 // Register match. | 181 // Register match. |
| 127 tree_[current_node].AddMatch(pattern->id()); | 182 tree_[current_node].AddMatch(pattern->id()); |
| 128 } | 183 } |
| 129 | 184 |
| 130 void SubstringSetMatcher::CreateFailureEdges() { | 185 void SubstringSetMatcher::CreateFailureEdges() { |
| 131 typedef AhoCorasickNode::Edges Edges; | 186 typedef AhoCorasickNode::Edges Edges; |
| 132 | 187 |
| 133 std::queue<int> queue; | 188 std::queue<unsigned> queue; |
| 134 | 189 |
| 135 AhoCorasickNode& root = tree_[0]; | 190 AhoCorasickNode& root = tree_[0]; |
| 136 root.set_failure(0); | 191 root.set_failure(0); |
| 137 const AhoCorasickNode::Edges& root_edges = root.edges(); | 192 const Edges& root_edges = root.edges(); |
| 138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 193 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| 139 ++e) { | 194 ++e) { |
| 140 int leads_to = e->second; | 195 const unsigned& leads_to = e->second; |
|
vabr (Chromium)
2013/05/03 20:37:27
I know this is uncommon. However, this method is a
battre
2013/05/03 21:59:12
Have you checked what happens if you used "const i
vabr (Chromium)
2013/05/05 23:36:49
Are you asking about removing the reference, or mo
| |
| 141 tree_[leads_to].set_failure(0); | 196 tree_[leads_to].set_failure(0); |
| 142 queue.push(leads_to); | 197 queue.push(leads_to); |
| 143 } | 198 } |
| 144 | 199 |
| 145 while (!queue.empty()) { | 200 while (!queue.empty()) { |
| 146 AhoCorasickNode& current_node = tree_[queue.front()]; | 201 AhoCorasickNode& current_node = tree_[queue.front()]; |
| 147 queue.pop(); | 202 queue.pop(); |
| 148 for (Edges::const_iterator e = current_node.edges().begin(); | 203 for (Edges::const_iterator e = current_node.edges().begin(); |
| 149 e != current_node.edges().end(); ++e) { | 204 e != current_node.edges().end(); ++e) { |
| 150 char edge_label = e->first; | 205 const char& edge_label = e->first; |
| 151 int leads_to = e->second; | 206 const unsigned& leads_to = e->second; |
| 152 queue.push(leads_to); | 207 queue.push(leads_to); |
| 153 | 208 |
| 154 int failure = current_node.failure(); | 209 unsigned failure = current_node.failure(); |
| 155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) | 210 unsigned edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 211 while (edge_from_failure == AhoCorasickNode::kInvalidIndex && | |
| 212 failure != 0) { | |
| 156 failure = tree_[failure].failure(); | 213 failure = tree_[failure].failure(); |
| 214 edge_from_failure = tree_[failure].GetEdge(edge_label); | |
| 215 } | |
| 157 | 216 |
| 158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? | 217 const unsigned follow_in_case_of_failure = |
| 159 tree_[failure].GetEdge(edge_label) : 0; | 218 edge_from_failure != AhoCorasickNode::kInvalidIndex |
| 219 ? edge_from_failure | |
| 220 : 0; | |
| 160 tree_[leads_to].set_failure(follow_in_case_of_failure); | 221 tree_[leads_to].set_failure(follow_in_case_of_failure); |
| 161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 222 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| 162 } | 223 } |
| 163 } | 224 } |
| 164 } | 225 } |
| 165 | 226 |
| 166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 227 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| 167 : failure_(-1) {} | 228 : failure_(kInvalidIndex) {} |
| 168 | 229 |
| 169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 230 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| 170 | 231 |
| 171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 232 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
| 172 const SubstringSetMatcher::AhoCorasickNode& other) | 233 const SubstringSetMatcher::AhoCorasickNode& other) |
| 173 : edges_(other.edges_), | 234 : edges_(other.edges_), |
| 174 failure_(other.failure_), | 235 failure_(other.failure_), |
| 175 matches_(other.matches_) {} | 236 matches_(other.matches_) {} |
| 176 | 237 |
| 177 SubstringSetMatcher::AhoCorasickNode& | 238 SubstringSetMatcher::AhoCorasickNode& |
| 178 SubstringSetMatcher::AhoCorasickNode::operator=( | 239 SubstringSetMatcher::AhoCorasickNode::operator=( |
| 179 const SubstringSetMatcher::AhoCorasickNode& other) { | 240 const SubstringSetMatcher::AhoCorasickNode& other) { |
| 180 edges_ = other.edges_; | 241 edges_ = other.edges_; |
| 181 failure_ = other.failure_; | 242 failure_ = other.failure_; |
| 182 matches_ = other.matches_; | 243 matches_ = other.matches_; |
| 183 return *this; | 244 return *this; |
| 184 } | 245 } |
| 185 | 246 |
| 186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { | 247 unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge( |
| 187 return edges_.find(c) != edges_.end(); | 248 char c) const { |
| 249 Edges::const_iterator i = edges_.find(c); | |
| 250 return i == edges_.end() ? kInvalidIndex : i->second; | |
| 188 } | 251 } |
| 189 | 252 |
| 190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 253 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) { |
| 191 std::map<char, int>::const_iterator i = edges_.find(c); | |
| 192 return i == edges_.end() ? -1 : i->second; | |
| 193 } | |
| 194 | |
| 195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { | |
| 196 edges_[c] = node; | 254 edges_[c] = node; |
| 197 } | 255 } |
| 198 | 256 |
| 199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 257 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
| 200 matches_.insert(id); | 258 matches_.insert(id); |
| 201 } | 259 } |
| 202 | 260 |
| 203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 261 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
| 204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 262 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
| 205 matches_.insert(matches.begin(), matches.end()); | 263 matches_.insert(matches.begin(), matches.end()); |
| 206 } | 264 } |
| 207 | 265 |
| 208 } // namespace extensions | 266 } // namespace extensions |
| OLD | NEW |