| OLD | NEW |
| 1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "components/url_matcher/substring_set_matcher.h" | 5 #include "components/url_matcher/substring_set_matcher.h" |
| 6 | 6 |
| 7 #include <stddef.h> |
| 8 |
| 7 #include <algorithm> | 9 #include <algorithm> |
| 8 #include <queue> | 10 #include <queue> |
| 9 | 11 |
| 10 #include "base/logging.h" | 12 #include "base/logging.h" |
| 11 #include "base/stl_util.h" | 13 #include "base/stl_util.h" |
| 12 | 14 |
| 13 namespace url_matcher { | 15 namespace url_matcher { |
| 14 | 16 |
| 15 namespace { | 17 namespace { |
| 16 | 18 |
| 17 // Compare StringPattern instances based on their string patterns. | 19 // Compare StringPattern instances based on their string patterns. |
| 18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | 20 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
| 19 return a->pattern() < b->pattern(); | 21 return a->pattern() < b->pattern(); |
| 20 } | 22 } |
| 21 | 23 |
| 22 // Given the set of patterns, compute how many nodes will the corresponding | 24 // Given the set of patterns, compute how many nodes will the corresponding |
| 23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | 25 // Aho-Corasick tree have. Note that |patterns| need to be sorted. |
| 24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { | 26 uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) { |
| 25 uint32 result = 1u; // 1 for the root node. | 27 uint32_t result = 1u; // 1 for the root node. |
| 26 if (patterns.empty()) | 28 if (patterns.empty()) |
| 27 return result; | 29 return result; |
| 28 | 30 |
| 29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | 31 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
| 30 std::vector<const StringPattern*>::const_iterator current = last + 1; | 32 std::vector<const StringPattern*>::const_iterator current = last + 1; |
| 31 // For the first pattern, each letter is a label of an edge to a new node. | 33 // For the first pattern, each letter is a label of an edge to a new node. |
| 32 result += (*last)->pattern().size(); | 34 result += (*last)->pattern().size(); |
| 33 | 35 |
| 34 // For the subsequent patterns, only count the edges which were not counted | 36 // For the subsequent patterns, only count the edges which were not counted |
| 35 // yet. For this it suffices to test against the previous pattern, because the | 37 // yet. For this it suffices to test against the previous pattern, because the |
| 36 // patterns are sorted. | 38 // patterns are sorted. |
| 37 for (; current != patterns.end(); ++last, ++current) { | 39 for (; current != patterns.end(); ++last, ++current) { |
| 38 const std::string& last_pattern = (*last)->pattern(); | 40 const std::string& last_pattern = (*last)->pattern(); |
| 39 const std::string& current_pattern = (*current)->pattern(); | 41 const std::string& current_pattern = (*current)->pattern(); |
| 40 const uint32 prefix_bound = | 42 const uint32_t prefix_bound = |
| 41 std::min(last_pattern.size(), current_pattern.size()); | 43 std::min(last_pattern.size(), current_pattern.size()); |
| 42 | 44 |
| 43 uint32 common_prefix = 0; | 45 uint32_t common_prefix = 0; |
| 44 while (common_prefix < prefix_bound && | 46 while (common_prefix < prefix_bound && |
| 45 last_pattern[common_prefix] == current_pattern[common_prefix]) | 47 last_pattern[common_prefix] == current_pattern[common_prefix]) |
| 46 ++common_prefix; | 48 ++common_prefix; |
| 47 result += current_pattern.size() - common_prefix; | 49 result += current_pattern.size() - common_prefix; |
| 48 } | 50 } |
| 49 return result; | 51 return result; |
| 50 } | 52 } |
| 51 | 53 |
| 52 } // namespace | 54 } // namespace |
| 53 | 55 |
| (...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 106 RebuildAhoCorasickTree(sorted_patterns); | 108 RebuildAhoCorasickTree(sorted_patterns); |
| 107 } | 109 } |
| 108 | 110 |
| 109 bool SubstringSetMatcher::Match(const std::string& text, | 111 bool SubstringSetMatcher::Match(const std::string& text, |
| 110 std::set<StringPattern::ID>* matches) const { | 112 std::set<StringPattern::ID>* matches) const { |
| 111 const size_t old_number_of_matches = matches->size(); | 113 const size_t old_number_of_matches = matches->size(); |
| 112 | 114 |
| 113 // Handle patterns matching the empty string. | 115 // Handle patterns matching the empty string. |
| 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 116 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
| 115 | 117 |
| 116 uint32 current_node = 0; | 118 uint32_t current_node = 0; |
| 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { | 119 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
| 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 120 uint32_t edge_from_current = tree_[current_node].GetEdge(*i); |
| 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && | 121 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
| 120 current_node != 0) { | 122 current_node != 0) { |
| 121 current_node = tree_[current_node].failure(); | 123 current_node = tree_[current_node].failure(); |
| 122 edge_from_current = tree_[current_node].GetEdge(*i); | 124 edge_from_current = tree_[current_node].GetEdge(*i); |
| 123 } | 125 } |
| 124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | 126 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
| 125 current_node = edge_from_current; | 127 current_node = edge_from_current; |
| 126 matches->insert(tree_[current_node].matches().begin(), | 128 matches->insert(tree_[current_node].matches().begin(), |
| 127 tree_[current_node].matches().end()); | 129 tree_[current_node].matches().end()); |
| 128 } else { | 130 } else { |
| (...skipping 27 matching lines...) Expand all Loading... |
| 156 | 158 |
| 157 CreateFailureEdges(); | 159 CreateFailureEdges(); |
| 158 } | 160 } |
| 159 | 161 |
| 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 162 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
| 161 const StringPattern* pattern) { | 163 const StringPattern* pattern) { |
| 162 const std::string& text = pattern->pattern(); | 164 const std::string& text = pattern->pattern(); |
| 163 const std::string::const_iterator text_end = text.end(); | 165 const std::string::const_iterator text_end = text.end(); |
| 164 | 166 |
| 165 // Iterators on the tree and the text. | 167 // Iterators on the tree and the text. |
| 166 uint32 current_node = 0; | 168 uint32_t current_node = 0; |
| 167 std::string::const_iterator i = text.begin(); | 169 std::string::const_iterator i = text.begin(); |
| 168 | 170 |
| 169 // Follow existing paths for as long as possible. | 171 // Follow existing paths for as long as possible. |
| 170 while (i != text_end) { | 172 while (i != text_end) { |
| 171 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 173 uint32_t edge_from_current = tree_[current_node].GetEdge(*i); |
| 172 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) | 174 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) |
| 173 break; | 175 break; |
| 174 current_node = edge_from_current; | 176 current_node = edge_from_current; |
| 175 ++i; | 177 ++i; |
| 176 } | 178 } |
| 177 | 179 |
| 178 // Create new nodes if necessary. | 180 // Create new nodes if necessary. |
| 179 while (i != text_end) { | 181 while (i != text_end) { |
| 180 tree_.push_back(AhoCorasickNode()); | 182 tree_.push_back(AhoCorasickNode()); |
| 181 tree_[current_node].SetEdge(*i, tree_.size() - 1); | 183 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
| 182 current_node = tree_.size() - 1; | 184 current_node = tree_.size() - 1; |
| 183 ++i; | 185 ++i; |
| 184 } | 186 } |
| 185 | 187 |
| 186 // Register match. | 188 // Register match. |
| 187 tree_[current_node].AddMatch(pattern->id()); | 189 tree_[current_node].AddMatch(pattern->id()); |
| 188 } | 190 } |
| 189 | 191 |
| 190 void SubstringSetMatcher::CreateFailureEdges() { | 192 void SubstringSetMatcher::CreateFailureEdges() { |
| 191 typedef AhoCorasickNode::Edges Edges; | 193 typedef AhoCorasickNode::Edges Edges; |
| 192 | 194 |
| 193 std::queue<uint32> queue; | 195 std::queue<uint32_t> queue; |
| 194 | 196 |
| 195 AhoCorasickNode& root = tree_[0]; | 197 AhoCorasickNode& root = tree_[0]; |
| 196 root.set_failure(0); | 198 root.set_failure(0); |
| 197 const Edges& root_edges = root.edges(); | 199 const Edges& root_edges = root.edges(); |
| 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 200 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
| 199 ++e) { | 201 ++e) { |
| 200 const uint32& leads_to = e->second; | 202 const uint32_t& leads_to = e->second; |
| 201 tree_[leads_to].set_failure(0); | 203 tree_[leads_to].set_failure(0); |
| 202 queue.push(leads_to); | 204 queue.push(leads_to); |
| 203 } | 205 } |
| 204 | 206 |
| 205 while (!queue.empty()) { | 207 while (!queue.empty()) { |
| 206 AhoCorasickNode& current_node = tree_[queue.front()]; | 208 AhoCorasickNode& current_node = tree_[queue.front()]; |
| 207 queue.pop(); | 209 queue.pop(); |
| 208 for (Edges::const_iterator e = current_node.edges().begin(); | 210 for (Edges::const_iterator e = current_node.edges().begin(); |
| 209 e != current_node.edges().end(); ++e) { | 211 e != current_node.edges().end(); ++e) { |
| 210 const char& edge_label = e->first; | 212 const char& edge_label = e->first; |
| 211 const uint32& leads_to = e->second; | 213 const uint32_t& leads_to = e->second; |
| 212 queue.push(leads_to); | 214 queue.push(leads_to); |
| 213 | 215 |
| 214 uint32 failure = current_node.failure(); | 216 uint32_t failure = current_node.failure(); |
| 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); | 217 uint32_t edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | 218 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
| 217 failure != 0) { | 219 failure != 0) { |
| 218 failure = tree_[failure].failure(); | 220 failure = tree_[failure].failure(); |
| 219 edge_from_failure = tree_[failure].GetEdge(edge_label); | 221 edge_from_failure = tree_[failure].GetEdge(edge_label); |
| 220 } | 222 } |
| 221 | 223 |
| 222 const uint32 follow_in_case_of_failure = | 224 const uint32_t follow_in_case_of_failure = |
| 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge | 225 edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure |
| 224 ? edge_from_failure | 226 : 0; |
| 225 : 0; | |
| 226 tree_[leads_to].set_failure(follow_in_case_of_failure); | 227 tree_[leads_to].set_failure(follow_in_case_of_failure); |
| 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 228 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
| 228 } | 229 } |
| 229 } | 230 } |
| 230 } | 231 } |
| 231 | 232 |
| 232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; | 233 const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; |
| 233 | 234 |
| 234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 235 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
| 235 : failure_(kNoSuchEdge) {} | 236 : failure_(kNoSuchEdge) {} |
| 236 | 237 |
| 237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 238 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
| 238 | 239 |
| 239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 240 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
| 240 const SubstringSetMatcher::AhoCorasickNode& other) | 241 const SubstringSetMatcher::AhoCorasickNode& other) |
| 241 : edges_(other.edges_), | 242 : edges_(other.edges_), |
| 242 failure_(other.failure_), | 243 failure_(other.failure_), |
| 243 matches_(other.matches_) {} | 244 matches_(other.matches_) {} |
| 244 | 245 |
| 245 SubstringSetMatcher::AhoCorasickNode& | 246 SubstringSetMatcher::AhoCorasickNode& |
| 246 SubstringSetMatcher::AhoCorasickNode::operator=( | 247 SubstringSetMatcher::AhoCorasickNode::operator=( |
| 247 const SubstringSetMatcher::AhoCorasickNode& other) { | 248 const SubstringSetMatcher::AhoCorasickNode& other) { |
| 248 edges_ = other.edges_; | 249 edges_ = other.edges_; |
| 249 failure_ = other.failure_; | 250 failure_ = other.failure_; |
| 250 matches_ = other.matches_; | 251 matches_ = other.matches_; |
| 251 return *this; | 252 return *this; |
| 252 } | 253 } |
| 253 | 254 |
| 254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 255 uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| 255 Edges::const_iterator i = edges_.find(c); | 256 Edges::const_iterator i = edges_.find(c); |
| 256 return i == edges_.end() ? kNoSuchEdge : i->second; | 257 return i == edges_.end() ? kNoSuchEdge : i->second; |
| 257 } | 258 } |
| 258 | 259 |
| 259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { | 260 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32_t node) { |
| 260 edges_[c] = node; | 261 edges_[c] = node; |
| 261 } | 262 } |
| 262 | 263 |
| 263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 264 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
| 264 matches_.insert(id); | 265 matches_.insert(id); |
| 265 } | 266 } |
| 266 | 267 |
| 267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 268 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
| 268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 269 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
| 269 matches_.insert(matches.begin(), matches.end()); | 270 matches_.insert(matches.begin(), matches.end()); |
| 270 } | 271 } |
| 271 | 272 |
| 272 } // namespace url_matcher | 273 } // namespace url_matcher |
| OLD | NEW |