| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
| 6 | 6 |
| 7 #include <algorithm> |
| 7 #include <queue> | 8 #include <queue> |
| 8 | 9 |
| 9 #include "base/logging.h" | 10 #include "base/logging.h" |
| 10 #include "base/stl_util.h" | 11 #include "base/stl_util.h" |
| 11 | 12 |
| 12 namespace extensions { | 13 namespace extensions { |
| 13 | 14 |
| 15 namespace { |
| 16 |
| 17 // Compare StringPattern instances based on their string patterns. |
| 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| 19 return a->pattern() <= b->pattern(); |
| 20 } |
| 21 |
| 22 // Given the set of patterns, compute how many nodes will the corresponding |
| 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| 24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { |
| 25 uint32 result = 1u; // 1 for the root node. |
| 26 if (patterns.empty()) |
| 27 return result; |
| 28 |
| 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| 30 std::vector<const StringPattern*>::const_iterator current = last + 1; |
| 31 // For the first pattern, each letter is a label of an edge to a new node. |
| 32 result += (*last)->pattern().size(); |
| 33 |
| 34 // For the subsequent patterns, only count the edges which were not counted |
| 35 // yet. For this it suffices to test against the previous pattern, because the |
| 36 // patterns are sorted. |
| 37 for (; current != patterns.end(); ++last, ++current) { |
| 38 const std::string& last_pattern = (*last)->pattern(); |
| 39 const std::string& current_pattern = (*current)->pattern(); |
| 40 const uint32 prefix_bound = |
| 41 std::min(last_pattern.size(), current_pattern.size()); |
| 42 |
| 43 uint32 common_prefix = 0; |
| 44 while (common_prefix < prefix_bound && |
| 45 last_pattern[common_prefix] == current_pattern[common_prefix]) |
| 46 ++common_prefix; |
| 47 result += current_pattern.size() - common_prefix; |
| 48 } |
| 49 return result; |
| 50 } |
| 51 |
| 52 } // namespace |
| 53 |
| 14 // | 54 // |
| 15 // SubstringSetMatcher | 55 // SubstringSetMatcher |
| 16 // | 56 // |
| 17 | 57 |
| 18 SubstringSetMatcher::SubstringSetMatcher() { | 58 SubstringSetMatcher::SubstringSetMatcher() { |
| 19 RebuildAhoCorasickTree(); | 59 RebuildAhoCorasickTree(SubstringPatternVector()); |
| 20 } | 60 } |
| 21 | 61 |
| 22 SubstringSetMatcher::~SubstringSetMatcher() {} | 62 SubstringSetMatcher::~SubstringSetMatcher() {} |
| 23 | 63 |
| 24 void SubstringSetMatcher::RegisterPatterns( | 64 void SubstringSetMatcher::RegisterPatterns( |
| 25 const std::vector<const StringPattern*>& patterns) { | 65 const std::vector<const StringPattern*>& patterns) { |
| 26 RegisterAndUnregisterPatterns(patterns, | 66 RegisterAndUnregisterPatterns(patterns, |
| 27 std::vector<const StringPattern*>()); | 67 std::vector<const StringPattern*>()); |
| 28 } | 68 } |
| 29 | 69 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
| 43 patterns_[(*i)->id()] = *i; | 83 patterns_[(*i)->id()] = *i; |
| 44 } | 84 } |
| 45 | 85 |
| 46 // Unregister patterns | 86 // Unregister patterns |
| 47 for (std::vector<const StringPattern*>::const_iterator i = | 87 for (std::vector<const StringPattern*>::const_iterator i = |
| 48 to_unregister.begin(); i != to_unregister.end(); ++i) { | 88 to_unregister.begin(); i != to_unregister.end(); ++i) { |
| 49 patterns_.erase((*i)->id()); | 89 patterns_.erase((*i)->id()); |
| 50 } | 90 } |
| 51 | 91 |
| 52 RebuildAhoCorasickTree(); | 92 // Now we compute the total number of tree nodes needed. |
| 93 SubstringPatternVector sorted_patterns; |
| 94 sorted_patterns.resize(patterns_.size()); |
| 95 |
| 96 size_t next = 0; |
| 97 for (SubstringPatternMap::const_iterator i = patterns_.begin(); |
| 98 i != patterns_.end(); |
| 99 ++i, ++next) { |
| 100 sorted_patterns[next] = i->second; |
| 101 } |
| 102 |
| 103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); |
| 104 tree_.reserve(TreeSize(sorted_patterns)); |
| 105 |
| 106 RebuildAhoCorasickTree(sorted_patterns); |
| 53 } | 107 } |
| 54 | 108 |
| 55 bool SubstringSetMatcher::Match(const std::string& text, | 109 bool SubstringSetMatcher::Match(const std::string& text, |
| 56 std::set<StringPattern::ID>* matches) const { | 110 std::set<StringPattern::ID>* matches) const { |
| 57 size_t old_number_of_matches = matches->size(); | 111 const size_t old_number_of_matches = matches->size(); |
| 58 | 112 |
| 59 // Handle patterns matching the empty string. | 113 // Handle patterns matching the empty string. |
| 60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| 61 | 115 |
| 62 int current_node = 0; | 116 uint32 current_node = 0; |
| 63 size_t text_length = text.length(); | 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| 64 for (size_t i = 0; i < text_length; ++i) { | 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
| 65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) | 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
| 120 current_node != 0) { |
| 66 current_node = tree_[current_node].failure(); | 121 current_node = tree_[current_node].failure(); |
| 67 if (tree_[current_node].HasEdge(text[i])) { | 122 edge_from_current = tree_[current_node].GetEdge(*i); |
| 68 current_node = tree_[current_node].GetEdge(text[i]); | 123 } |
| 124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| 125 current_node = edge_from_current; |
| 69 matches->insert(tree_[current_node].matches().begin(), | 126 matches->insert(tree_[current_node].matches().begin(), |
| 70 tree_[current_node].matches().end()); | 127 tree_[current_node].matches().end()); |
| 71 } else { | 128 } else { |
| 72 DCHECK_EQ(0, current_node); | 129 DCHECK_EQ(0u, current_node); |
| 73 } | 130 } |
| 74 } | 131 } |
| 75 | 132 |
| 76 return old_number_of_matches != matches->size(); | 133 return old_number_of_matches != matches->size(); |
| 77 } | 134 } |
| 78 | 135 |
| 79 bool SubstringSetMatcher::IsEmpty() const { | 136 bool SubstringSetMatcher::IsEmpty() const { |
| 80 // An empty tree consists of only the root node. | 137 // An empty tree consists of only the root node. |
| 81 return patterns_.empty() && tree_.size() == 1u; | 138 return patterns_.empty() && tree_.size() == 1u; |
| 82 } | 139 } |
| 83 | 140 |
| 84 void SubstringSetMatcher::RebuildAhoCorasickTree() { | 141 void SubstringSetMatcher::RebuildAhoCorasickTree( |
| 142 const SubstringPatternVector& sorted_patterns) { |
| 85 tree_.clear(); | 143 tree_.clear(); |
| 86 | 144 |
| 87 // Initialize root note of tree. | 145 // Initialize root note of tree. |
| 88 AhoCorasickNode root; | 146 AhoCorasickNode root; |
| 89 root.set_failure(0); | 147 root.set_failure(0); |
| 90 tree_.push_back(root); | 148 tree_.push_back(root); |
| 91 | 149 |
| 92 // Insert all patterns. | 150 // Insert all patterns. |
| 93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | 151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); |
| 94 i != patterns_.end(); ++i) { | 152 i != sorted_patterns.end(); |
| 95 InsertPatternIntoAhoCorasickTree(i->second); | 153 ++i) { |
| 154 InsertPatternIntoAhoCorasickTree(*i); |
| 96 } | 155 } |
| 97 | 156 |
| 98 CreateFailureEdges(); | 157 CreateFailureEdges(); |
| 99 } | 158 } |
| 100 | 159 |
| 101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| 102 const StringPattern* pattern) { | 161 const StringPattern* pattern) { |
| 103 const std::string& text = pattern->pattern(); | 162 const std::string& text = pattern->pattern(); |
| 104 size_t text_length = text.length(); | 163 const std::string::const_iterator text_end = text.end(); |
| 105 | 164 |
| 106 // Iterators on the tree and the text. | 165 // Iterators on the tree and the text. |
| 107 int current_node = 0; | 166 uint32 current_node = 0; |
| 108 size_t text_pos = 0; | 167 std::string::const_iterator i = text.begin(); |
| 109 | 168 |
| 110 // Follow existing paths for as long as possible. | 169 // Follow existing paths for as long as possible. |
| 111 while (text_pos < text_length && | 170 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
| 112 tree_[current_node].HasEdge(text[text_pos])) { | 171 while (i != text_end && |
| 113 current_node = tree_[current_node].GetEdge(text[text_pos]); | 172 edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| 114 ++text_pos; | 173 current_node = edge_from_current; |
| 174 ++i; |
| 175 edge_from_current = tree_[current_node].GetEdge(*i); |
| 115 } | 176 } |
| 116 | 177 |
| 117 // Create new nodes if necessary. | 178 // Create new nodes if necessary. |
| 118 while (text_pos < text_length) { | 179 while (i != text_end) { |
| 119 AhoCorasickNode new_node; | 180 tree_.push_back(AhoCorasickNode()); |
| 120 tree_.push_back(new_node); | 181 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| 121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); | |
| 122 current_node = tree_.size() - 1; | 182 current_node = tree_.size() - 1; |
| 123 ++text_pos; | 183 ++i; |
| 124 } | 184 } |
| 125 | 185 |
| 126 // Register match. | 186 // Register match. |
| 127 tree_[current_node].AddMatch(pattern->id()); | 187 tree_[current_node].AddMatch(pattern->id()); |
| 128 } | 188 } |
| 129 | 189 |
| 130 void SubstringSetMatcher::CreateFailureEdges() { | 190 void SubstringSetMatcher::CreateFailureEdges() { |
| 131 typedef AhoCorasickNode::Edges Edges; | 191 typedef AhoCorasickNode::Edges Edges; |
| 132 | 192 |
| 133 std::queue<int> queue; | 193 std::queue<uint32> queue; |
| 134 | 194 |
| 135 AhoCorasickNode& root = tree_[0]; | 195 AhoCorasickNode& root = tree_[0]; |
| 136 root.set_failure(0); | 196 root.set_failure(0); |
| 137 const AhoCorasickNode::Edges& root_edges = root.edges(); | 197 const Edges& root_edges = root.edges(); |
| 138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| 139 ++e) { | 199 ++e) { |
| 140 int leads_to = e->second; | 200 const uint32& leads_to = e->second; |
| 141 tree_[leads_to].set_failure(0); | 201 tree_[leads_to].set_failure(0); |
| 142 queue.push(leads_to); | 202 queue.push(leads_to); |
| 143 } | 203 } |
| 144 | 204 |
| 145 while (!queue.empty()) { | 205 while (!queue.empty()) { |
| 146 AhoCorasickNode& current_node = tree_[queue.front()]; | 206 AhoCorasickNode& current_node = tree_[queue.front()]; |
| 147 queue.pop(); | 207 queue.pop(); |
| 148 for (Edges::const_iterator e = current_node.edges().begin(); | 208 for (Edges::const_iterator e = current_node.edges().begin(); |
| 149 e != current_node.edges().end(); ++e) { | 209 e != current_node.edges().end(); ++e) { |
| 150 char edge_label = e->first; | 210 const char& edge_label = e->first; |
| 151 int leads_to = e->second; | 211 const uint32& leads_to = e->second; |
| 152 queue.push(leads_to); | 212 queue.push(leads_to); |
| 153 | 213 |
| 154 int failure = current_node.failure(); | 214 uint32 failure = current_node.failure(); |
| 155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) | 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
| 217 failure != 0) { |
| 156 failure = tree_[failure].failure(); | 218 failure = tree_[failure].failure(); |
| 219 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 220 } |
| 157 | 221 |
| 158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? | 222 const uint32 follow_in_case_of_failure = |
| 159 tree_[failure].GetEdge(edge_label) : 0; | 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge |
| 224 ? edge_from_failure |
| 225 : 0; |
| 160 tree_[leads_to].set_failure(follow_in_case_of_failure); | 226 tree_[leads_to].set_failure(follow_in_case_of_failure); |
| 161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| 162 } | 228 } |
| 163 } | 229 } |
| 164 } | 230 } |
| 165 | 231 |
| 232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; |
| 233 |
| 166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| 167 : failure_(-1) {} | 235 : failure_(kNoSuchEdge) {} |
| 168 | 236 |
| 169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| 170 | 238 |
| 171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
| 172 const SubstringSetMatcher::AhoCorasickNode& other) | 240 const SubstringSetMatcher::AhoCorasickNode& other) |
| 173 : edges_(other.edges_), | 241 : edges_(other.edges_), |
| 174 failure_(other.failure_), | 242 failure_(other.failure_), |
| 175 matches_(other.matches_) {} | 243 matches_(other.matches_) {} |
| 176 | 244 |
| 177 SubstringSetMatcher::AhoCorasickNode& | 245 SubstringSetMatcher::AhoCorasickNode& |
| 178 SubstringSetMatcher::AhoCorasickNode::operator=( | 246 SubstringSetMatcher::AhoCorasickNode::operator=( |
| 179 const SubstringSetMatcher::AhoCorasickNode& other) { | 247 const SubstringSetMatcher::AhoCorasickNode& other) { |
| 180 edges_ = other.edges_; | 248 edges_ = other.edges_; |
| 181 failure_ = other.failure_; | 249 failure_ = other.failure_; |
| 182 matches_ = other.matches_; | 250 matches_ = other.matches_; |
| 183 return *this; | 251 return *this; |
| 184 } | 252 } |
| 185 | 253 |
| 186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { | 254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| 187 return edges_.find(c) != edges_.end(); | 255 Edges::const_iterator i = edges_.find(c); |
| 256 return i == edges_.end() ? kNoSuchEdge : i->second; |
| 188 } | 257 } |
| 189 | 258 |
| 190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
| 191 std::map<char, int>::const_iterator i = edges_.find(c); | |
| 192 return i == edges_.end() ? -1 : i->second; | |
| 193 } | |
| 194 | |
| 195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { | |
| 196 edges_[c] = node; | 260 edges_[c] = node; |
| 197 } | 261 } |
| 198 | 262 |
| 199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
| 200 matches_.insert(id); | 264 matches_.insert(id); |
| 201 } | 265 } |
| 202 | 266 |
| 203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
| 204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
| 205 matches_.insert(matches.begin(), matches.end()); | 269 matches_.insert(matches.begin(), matches.end()); |
| 206 } | 270 } |
| 207 | 271 |
| 208 } // namespace extensions | 272 } // namespace extensions |
| OLD | NEW |