| OLD | NEW |
| 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
| 6 | 6 |
| 7 #include <algorithm> | |
| 8 #include <queue> | 7 #include <queue> |
| 9 | 8 |
| 10 #include "base/logging.h" | 9 #include "base/logging.h" |
| 11 #include "base/stl_util.h" | 10 #include "base/stl_util.h" |
| 12 | 11 |
| 13 namespace extensions { | 12 namespace extensions { |
| 14 | 13 |
| 15 namespace { | |
| 16 | |
| 17 // Compare StringPattern instances based on their string patterns. | |
| 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | |
| 19 return a->pattern() <= b->pattern(); | |
| 20 } | |
| 21 | |
| 22 // Given the set of patterns, compute how many nodes will the corresponding | |
| 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | |
| 24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { | |
| 25 uint32 result = 1u; // 1 for the root node. | |
| 26 if (patterns.empty()) | |
| 27 return result; | |
| 28 | |
| 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | |
| 30 std::vector<const StringPattern*>::const_iterator current = last + 1; | |
| 31 // For the first pattern, each letter is a label of an edge to a new node. | |
| 32 result += (*last)->pattern().size(); | |
| 33 | |
| 34 // For the subsequent patterns, only count the edges which were not counted | |
| 35 // yet. For this it suffices to test against the previous pattern, because the | |
| 36 // patterns are sorted. | |
| 37 for (; current != patterns.end(); ++last, ++current) { | |
| 38 const std::string& last_pattern = (*last)->pattern(); | |
| 39 const std::string& current_pattern = (*current)->pattern(); | |
| 40 const uint32 prefix_bound = | |
| 41 std::min(last_pattern.size(), current_pattern.size()); | |
| 42 | |
| 43 uint32 common_prefix = 0; | |
| 44 while (common_prefix < prefix_bound && | |
| 45 last_pattern[common_prefix] == current_pattern[common_prefix]) | |
| 46 ++common_prefix; | |
| 47 result += current_pattern.size() - common_prefix; | |
| 48 } | |
| 49 return result; | |
| 50 } | |
| 51 | |
| 52 } // namespace | |
| 53 | |
| 54 // | 14 // |
| 55 // SubstringSetMatcher | 15 // SubstringSetMatcher |
| 56 // | 16 // |
| 57 | 17 |
| 58 SubstringSetMatcher::SubstringSetMatcher() { | 18 SubstringSetMatcher::SubstringSetMatcher() { |
| 59 RebuildAhoCorasickTree(SubstringPatternVector()); | 19 RebuildAhoCorasickTree(); |
| 60 } | 20 } |
| 61 | 21 |
| 62 SubstringSetMatcher::~SubstringSetMatcher() {} | 22 SubstringSetMatcher::~SubstringSetMatcher() {} |
| 63 | 23 |
| 64 void SubstringSetMatcher::RegisterPatterns( | 24 void SubstringSetMatcher::RegisterPatterns( |
| 65 const std::vector<const StringPattern*>& patterns) { | 25 const std::vector<const StringPattern*>& patterns) { |
| 66 RegisterAndUnregisterPatterns(patterns, | 26 RegisterAndUnregisterPatterns(patterns, |
| 67 std::vector<const StringPattern*>()); | 27 std::vector<const StringPattern*>()); |
| 68 } | 28 } |
| 69 | 29 |
| (...skipping 12 matching lines...) Expand all Loading... |
| 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
| 83 patterns_[(*i)->id()] = *i; | 43 patterns_[(*i)->id()] = *i; |
| 84 } | 44 } |
| 85 | 45 |
| 86 // Unregister patterns | 46 // Unregister patterns |
| 87 for (std::vector<const StringPattern*>::const_iterator i = | 47 for (std::vector<const StringPattern*>::const_iterator i = |
| 88 to_unregister.begin(); i != to_unregister.end(); ++i) { | 48 to_unregister.begin(); i != to_unregister.end(); ++i) { |
| 89 patterns_.erase((*i)->id()); | 49 patterns_.erase((*i)->id()); |
| 90 } | 50 } |
| 91 | 51 |
| 92 // Now we compute the total number of tree nodes needed. | 52 RebuildAhoCorasickTree(); |
| 93 SubstringPatternVector sorted_patterns; | |
| 94 sorted_patterns.resize(patterns_.size()); | |
| 95 | |
| 96 size_t next = 0; | |
| 97 for (SubstringPatternMap::const_iterator i = patterns_.begin(); | |
| 98 i != patterns_.end(); | |
| 99 ++i, ++next) { | |
| 100 sorted_patterns[next] = i->second; | |
| 101 } | |
| 102 | |
| 103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); | |
| 104 tree_.reserve(TreeSize(sorted_patterns)); | |
| 105 | |
| 106 RebuildAhoCorasickTree(sorted_patterns); | |
| 107 } | 53 } |
| 108 | 54 |
| 109 bool SubstringSetMatcher::Match(const std::string& text, | 55 bool SubstringSetMatcher::Match(const std::string& text, |
| 110 std::set<StringPattern::ID>* matches) const { | 56 std::set<StringPattern::ID>* matches) const { |
| 111 const size_t old_number_of_matches = matches->size(); | 57 size_t old_number_of_matches = matches->size(); |
| 112 | 58 |
| 113 // Handle patterns matching the empty string. | 59 // Handle patterns matching the empty string. |
| 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| 115 | 61 |
| 116 uint32 current_node = 0; | 62 int current_node = 0; |
| 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { | 63 size_t text_length = text.length(); |
| 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 64 for (size_t i = 0; i < text_length; ++i) { |
| 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && | 65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
| 120 current_node != 0) { | |
| 121 current_node = tree_[current_node].failure(); | 66 current_node = tree_[current_node].failure(); |
| 122 edge_from_current = tree_[current_node].GetEdge(*i); | 67 if (tree_[current_node].HasEdge(text[i])) { |
| 123 } | 68 current_node = tree_[current_node].GetEdge(text[i]); |
| 124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | |
| 125 current_node = edge_from_current; | |
| 126 matches->insert(tree_[current_node].matches().begin(), | 69 matches->insert(tree_[current_node].matches().begin(), |
| 127 tree_[current_node].matches().end()); | 70 tree_[current_node].matches().end()); |
| 128 } else { | 71 } else { |
| 129 DCHECK_EQ(0u, current_node); | 72 DCHECK_EQ(0, current_node); |
| 130 } | 73 } |
| 131 } | 74 } |
| 132 | 75 |
| 133 return old_number_of_matches != matches->size(); | 76 return old_number_of_matches != matches->size(); |
| 134 } | 77 } |
| 135 | 78 |
| 136 bool SubstringSetMatcher::IsEmpty() const { | 79 bool SubstringSetMatcher::IsEmpty() const { |
| 137 // An empty tree consists of only the root node. | 80 // An empty tree consists of only the root node. |
| 138 return patterns_.empty() && tree_.size() == 1u; | 81 return patterns_.empty() && tree_.size() == 1u; |
| 139 } | 82 } |
| 140 | 83 |
| 141 void SubstringSetMatcher::RebuildAhoCorasickTree( | 84 void SubstringSetMatcher::RebuildAhoCorasickTree() { |
| 142 const SubstringPatternVector& sorted_patterns) { | |
| 143 tree_.clear(); | 85 tree_.clear(); |
| 144 | 86 |
| 145 // Initialize root note of tree. | 87 // Initialize root note of tree. |
| 146 AhoCorasickNode root; | 88 AhoCorasickNode root; |
| 147 root.set_failure(0); | 89 root.set_failure(0); |
| 148 tree_.push_back(root); | 90 tree_.push_back(root); |
| 149 | 91 |
| 150 // Insert all patterns. | 92 // Insert all patterns. |
| 151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); | 93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
| 152 i != sorted_patterns.end(); | 94 i != patterns_.end(); ++i) { |
| 153 ++i) { | 95 InsertPatternIntoAhoCorasickTree(i->second); |
| 154 InsertPatternIntoAhoCorasickTree(*i); | |
| 155 } | 96 } |
| 156 | 97 |
| 157 CreateFailureEdges(); | 98 CreateFailureEdges(); |
| 158 } | 99 } |
| 159 | 100 |
| 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| 161 const StringPattern* pattern) { | 102 const StringPattern* pattern) { |
| 162 const std::string& text = pattern->pattern(); | 103 const std::string& text = pattern->pattern(); |
| 163 const std::string::const_iterator text_end = text.end(); | 104 size_t text_length = text.length(); |
| 164 | 105 |
| 165 // Iterators on the tree and the text. | 106 // Iterators on the tree and the text. |
| 166 uint32 current_node = 0; | 107 int current_node = 0; |
| 167 std::string::const_iterator i = text.begin(); | 108 size_t text_pos = 0; |
| 168 | 109 |
| 169 // Follow existing paths for as long as possible. | 110 // Follow existing paths for as long as possible. |
| 170 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 111 while (text_pos < text_length && |
| 171 while (i != text_end && | 112 tree_[current_node].HasEdge(text[text_pos])) { |
| 172 edge_from_current != AhoCorasickNode::kNoSuchEdge) { | 113 current_node = tree_[current_node].GetEdge(text[text_pos]); |
| 173 current_node = edge_from_current; | 114 ++text_pos; |
| 174 ++i; | |
| 175 edge_from_current = tree_[current_node].GetEdge(*i); | |
| 176 } | 115 } |
| 177 | 116 |
| 178 // Create new nodes if necessary. | 117 // Create new nodes if necessary. |
| 179 while (i != text_end) { | 118 while (text_pos < text_length) { |
| 180 tree_.push_back(AhoCorasickNode()); | 119 AhoCorasickNode new_node; |
| 181 tree_[current_node].SetEdge(*i, tree_.size() - 1); | 120 tree_.push_back(new_node); |
| 121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
| 182 current_node = tree_.size() - 1; | 122 current_node = tree_.size() - 1; |
| 183 ++i; | 123 ++text_pos; |
| 184 } | 124 } |
| 185 | 125 |
| 186 // Register match. | 126 // Register match. |
| 187 tree_[current_node].AddMatch(pattern->id()); | 127 tree_[current_node].AddMatch(pattern->id()); |
| 188 } | 128 } |
| 189 | 129 |
| 190 void SubstringSetMatcher::CreateFailureEdges() { | 130 void SubstringSetMatcher::CreateFailureEdges() { |
| 191 typedef AhoCorasickNode::Edges Edges; | 131 typedef AhoCorasickNode::Edges Edges; |
| 192 | 132 |
| 193 std::queue<uint32> queue; | 133 std::queue<int> queue; |
| 194 | 134 |
| 195 AhoCorasickNode& root = tree_[0]; | 135 AhoCorasickNode& root = tree_[0]; |
| 196 root.set_failure(0); | 136 root.set_failure(0); |
| 197 const Edges& root_edges = root.edges(); | 137 const AhoCorasickNode::Edges& root_edges = root.edges(); |
| 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| 199 ++e) { | 139 ++e) { |
| 200 const uint32& leads_to = e->second; | 140 int leads_to = e->second; |
| 201 tree_[leads_to].set_failure(0); | 141 tree_[leads_to].set_failure(0); |
| 202 queue.push(leads_to); | 142 queue.push(leads_to); |
| 203 } | 143 } |
| 204 | 144 |
| 205 while (!queue.empty()) { | 145 while (!queue.empty()) { |
| 206 AhoCorasickNode& current_node = tree_[queue.front()]; | 146 AhoCorasickNode& current_node = tree_[queue.front()]; |
| 207 queue.pop(); | 147 queue.pop(); |
| 208 for (Edges::const_iterator e = current_node.edges().begin(); | 148 for (Edges::const_iterator e = current_node.edges().begin(); |
| 209 e != current_node.edges().end(); ++e) { | 149 e != current_node.edges().end(); ++e) { |
| 210 const char& edge_label = e->first; | 150 char edge_label = e->first; |
| 211 const uint32& leads_to = e->second; | 151 int leads_to = e->second; |
| 212 queue.push(leads_to); | 152 queue.push(leads_to); |
| 213 | 153 |
| 214 uint32 failure = current_node.failure(); | 154 int failure = current_node.failure(); |
| 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); | 155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
| 216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | |
| 217 failure != 0) { | |
| 218 failure = tree_[failure].failure(); | 156 failure = tree_[failure].failure(); |
| 219 edge_from_failure = tree_[failure].GetEdge(edge_label); | |
| 220 } | |
| 221 | 157 |
| 222 const uint32 follow_in_case_of_failure = | 158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
| 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge | 159 tree_[failure].GetEdge(edge_label) : 0; |
| 224 ? edge_from_failure | |
| 225 : 0; | |
| 226 tree_[leads_to].set_failure(follow_in_case_of_failure); | 160 tree_[leads_to].set_failure(follow_in_case_of_failure); |
| 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| 228 } | 162 } |
| 229 } | 163 } |
| 230 } | 164 } |
| 231 | 165 |
| 232 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| 233 : failure_(kNoSuchEdge) {} | 167 : failure_(-1) {} |
| 234 | 168 |
| 235 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| 236 | 170 |
| 237 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
| 238 const SubstringSetMatcher::AhoCorasickNode& other) | 172 const SubstringSetMatcher::AhoCorasickNode& other) |
| 239 : edges_(other.edges_), | 173 : edges_(other.edges_), |
| 240 failure_(other.failure_), | 174 failure_(other.failure_), |
| 241 matches_(other.matches_) {} | 175 matches_(other.matches_) {} |
| 242 | 176 |
| 243 SubstringSetMatcher::AhoCorasickNode& | 177 SubstringSetMatcher::AhoCorasickNode& |
| 244 SubstringSetMatcher::AhoCorasickNode::operator=( | 178 SubstringSetMatcher::AhoCorasickNode::operator=( |
| 245 const SubstringSetMatcher::AhoCorasickNode& other) { | 179 const SubstringSetMatcher::AhoCorasickNode& other) { |
| 246 edges_ = other.edges_; | 180 edges_ = other.edges_; |
| 247 failure_ = other.failure_; | 181 failure_ = other.failure_; |
| 248 matches_ = other.matches_; | 182 matches_ = other.matches_; |
| 249 return *this; | 183 return *this; |
| 250 } | 184 } |
| 251 | 185 |
| 252 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
| 253 Edges::const_iterator i = edges_.find(c); | 187 return edges_.find(c) != edges_.end(); |
| 254 return i == edges_.end() ? kNoSuchEdge : i->second; | |
| 255 } | 188 } |
| 256 | 189 |
| 257 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { | 190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| 191 std::map<char, int>::const_iterator i = edges_.find(c); |
| 192 return i == edges_.end() ? -1 : i->second; |
| 193 } |
| 194 |
| 195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
| 258 edges_[c] = node; | 196 edges_[c] = node; |
| 259 } | 197 } |
| 260 | 198 |
| 261 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
| 262 matches_.insert(id); | 200 matches_.insert(id); |
| 263 } | 201 } |
| 264 | 202 |
| 265 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
| 266 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
| 267 matches_.insert(matches.begin(), matches.end()); | 205 matches_.insert(matches.begin(), matches.end()); |
| 268 } | 206 } |
| 269 | 207 |
| 270 } // namespace extensions | 208 } // namespace extensions |
| OLD | NEW |