OLD | NEW |
1 // Copyright 2013 The Chromium Authors. All rights reserved. | 1 // Copyright 2013 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "components/url_matcher/substring_set_matcher.h" | 5 #include "components/url_matcher/substring_set_matcher.h" |
6 | 6 |
| 7 #include <stddef.h> |
| 8 |
7 #include <algorithm> | 9 #include <algorithm> |
8 #include <queue> | 10 #include <queue> |
9 | 11 |
10 #include "base/logging.h" | 12 #include "base/logging.h" |
11 #include "base/stl_util.h" | 13 #include "base/stl_util.h" |
12 | 14 |
13 namespace url_matcher { | 15 namespace url_matcher { |
14 | 16 |
15 namespace { | 17 namespace { |
16 | 18 |
17 // Compare StringPattern instances based on their string patterns. | 19 // Compare StringPattern instances based on their string patterns. |
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | 20 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { |
19 return a->pattern() < b->pattern(); | 21 return a->pattern() < b->pattern(); |
20 } | 22 } |
21 | 23 |
22 // Given the set of patterns, compute how many nodes will the corresponding | 24 // Given the set of patterns, compute how many nodes will the corresponding |
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | 25 // Aho-Corasick tree have. Note that |patterns| need to be sorted. |
24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { | 26 uint32_t TreeSize(const std::vector<const StringPattern*>& patterns) { |
25 uint32 result = 1u; // 1 for the root node. | 27 uint32_t result = 1u; // 1 for the root node. |
26 if (patterns.empty()) | 28 if (patterns.empty()) |
27 return result; | 29 return result; |
28 | 30 |
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | 31 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); |
30 std::vector<const StringPattern*>::const_iterator current = last + 1; | 32 std::vector<const StringPattern*>::const_iterator current = last + 1; |
31 // For the first pattern, each letter is a label of an edge to a new node. | 33 // For the first pattern, each letter is a label of an edge to a new node. |
32 result += (*last)->pattern().size(); | 34 result += (*last)->pattern().size(); |
33 | 35 |
34 // For the subsequent patterns, only count the edges which were not counted | 36 // For the subsequent patterns, only count the edges which were not counted |
35 // yet. For this it suffices to test against the previous pattern, because the | 37 // yet. For this it suffices to test against the previous pattern, because the |
36 // patterns are sorted. | 38 // patterns are sorted. |
37 for (; current != patterns.end(); ++last, ++current) { | 39 for (; current != patterns.end(); ++last, ++current) { |
38 const std::string& last_pattern = (*last)->pattern(); | 40 const std::string& last_pattern = (*last)->pattern(); |
39 const std::string& current_pattern = (*current)->pattern(); | 41 const std::string& current_pattern = (*current)->pattern(); |
40 const uint32 prefix_bound = | 42 const uint32_t prefix_bound = |
41 std::min(last_pattern.size(), current_pattern.size()); | 43 std::min(last_pattern.size(), current_pattern.size()); |
42 | 44 |
43 uint32 common_prefix = 0; | 45 uint32_t common_prefix = 0; |
44 while (common_prefix < prefix_bound && | 46 while (common_prefix < prefix_bound && |
45 last_pattern[common_prefix] == current_pattern[common_prefix]) | 47 last_pattern[common_prefix] == current_pattern[common_prefix]) |
46 ++common_prefix; | 48 ++common_prefix; |
47 result += current_pattern.size() - common_prefix; | 49 result += current_pattern.size() - common_prefix; |
48 } | 50 } |
49 return result; | 51 return result; |
50 } | 52 } |
51 | 53 |
52 } // namespace | 54 } // namespace |
53 | 55 |
(...skipping 52 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
106 RebuildAhoCorasickTree(sorted_patterns); | 108 RebuildAhoCorasickTree(sorted_patterns); |
107 } | 109 } |
108 | 110 |
109 bool SubstringSetMatcher::Match(const std::string& text, | 111 bool SubstringSetMatcher::Match(const std::string& text, |
110 std::set<StringPattern::ID>* matches) const { | 112 std::set<StringPattern::ID>* matches) const { |
111 const size_t old_number_of_matches = matches->size(); | 113 const size_t old_number_of_matches = matches->size(); |
112 | 114 |
113 // Handle patterns matching the empty string. | 115 // Handle patterns matching the empty string. |
114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 116 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
115 | 117 |
116 uint32 current_node = 0; | 118 uint32_t current_node = 0; |
117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { | 119 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 120 uint32_t edge_from_current = tree_[current_node].GetEdge(*i); |
119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && | 121 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
120 current_node != 0) { | 122 current_node != 0) { |
121 current_node = tree_[current_node].failure(); | 123 current_node = tree_[current_node].failure(); |
122 edge_from_current = tree_[current_node].GetEdge(*i); | 124 edge_from_current = tree_[current_node].GetEdge(*i); |
123 } | 125 } |
124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | 126 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
125 current_node = edge_from_current; | 127 current_node = edge_from_current; |
126 matches->insert(tree_[current_node].matches().begin(), | 128 matches->insert(tree_[current_node].matches().begin(), |
127 tree_[current_node].matches().end()); | 129 tree_[current_node].matches().end()); |
128 } else { | 130 } else { |
(...skipping 27 matching lines...) Expand all Loading... |
156 | 158 |
157 CreateFailureEdges(); | 159 CreateFailureEdges(); |
158 } | 160 } |
159 | 161 |
160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 162 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
161 const StringPattern* pattern) { | 163 const StringPattern* pattern) { |
162 const std::string& text = pattern->pattern(); | 164 const std::string& text = pattern->pattern(); |
163 const std::string::const_iterator text_end = text.end(); | 165 const std::string::const_iterator text_end = text.end(); |
164 | 166 |
165 // Iterators on the tree and the text. | 167 // Iterators on the tree and the text. |
166 uint32 current_node = 0; | 168 uint32_t current_node = 0; |
167 std::string::const_iterator i = text.begin(); | 169 std::string::const_iterator i = text.begin(); |
168 | 170 |
169 // Follow existing paths for as long as possible. | 171 // Follow existing paths for as long as possible. |
170 while (i != text_end) { | 172 while (i != text_end) { |
171 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 173 uint32_t edge_from_current = tree_[current_node].GetEdge(*i); |
172 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) | 174 if (edge_from_current == AhoCorasickNode::kNoSuchEdge) |
173 break; | 175 break; |
174 current_node = edge_from_current; | 176 current_node = edge_from_current; |
175 ++i; | 177 ++i; |
176 } | 178 } |
177 | 179 |
178 // Create new nodes if necessary. | 180 // Create new nodes if necessary. |
179 while (i != text_end) { | 181 while (i != text_end) { |
180 tree_.push_back(AhoCorasickNode()); | 182 tree_.push_back(AhoCorasickNode()); |
181 tree_[current_node].SetEdge(*i, tree_.size() - 1); | 183 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
182 current_node = tree_.size() - 1; | 184 current_node = tree_.size() - 1; |
183 ++i; | 185 ++i; |
184 } | 186 } |
185 | 187 |
186 // Register match. | 188 // Register match. |
187 tree_[current_node].AddMatch(pattern->id()); | 189 tree_[current_node].AddMatch(pattern->id()); |
188 } | 190 } |
189 | 191 |
190 void SubstringSetMatcher::CreateFailureEdges() { | 192 void SubstringSetMatcher::CreateFailureEdges() { |
191 typedef AhoCorasickNode::Edges Edges; | 193 typedef AhoCorasickNode::Edges Edges; |
192 | 194 |
193 std::queue<uint32> queue; | 195 std::queue<uint32_t> queue; |
194 | 196 |
195 AhoCorasickNode& root = tree_[0]; | 197 AhoCorasickNode& root = tree_[0]; |
196 root.set_failure(0); | 198 root.set_failure(0); |
197 const Edges& root_edges = root.edges(); | 199 const Edges& root_edges = root.edges(); |
198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 200 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
199 ++e) { | 201 ++e) { |
200 const uint32& leads_to = e->second; | 202 const uint32_t& leads_to = e->second; |
201 tree_[leads_to].set_failure(0); | 203 tree_[leads_to].set_failure(0); |
202 queue.push(leads_to); | 204 queue.push(leads_to); |
203 } | 205 } |
204 | 206 |
205 while (!queue.empty()) { | 207 while (!queue.empty()) { |
206 AhoCorasickNode& current_node = tree_[queue.front()]; | 208 AhoCorasickNode& current_node = tree_[queue.front()]; |
207 queue.pop(); | 209 queue.pop(); |
208 for (Edges::const_iterator e = current_node.edges().begin(); | 210 for (Edges::const_iterator e = current_node.edges().begin(); |
209 e != current_node.edges().end(); ++e) { | 211 e != current_node.edges().end(); ++e) { |
210 const char& edge_label = e->first; | 212 const char& edge_label = e->first; |
211 const uint32& leads_to = e->second; | 213 const uint32_t& leads_to = e->second; |
212 queue.push(leads_to); | 214 queue.push(leads_to); |
213 | 215 |
214 uint32 failure = current_node.failure(); | 216 uint32_t failure = current_node.failure(); |
215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); | 217 uint32_t edge_from_failure = tree_[failure].GetEdge(edge_label); |
216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | 218 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && |
217 failure != 0) { | 219 failure != 0) { |
218 failure = tree_[failure].failure(); | 220 failure = tree_[failure].failure(); |
219 edge_from_failure = tree_[failure].GetEdge(edge_label); | 221 edge_from_failure = tree_[failure].GetEdge(edge_label); |
220 } | 222 } |
221 | 223 |
222 const uint32 follow_in_case_of_failure = | 224 const uint32_t follow_in_case_of_failure = |
223 edge_from_failure != AhoCorasickNode::kNoSuchEdge | 225 edge_from_failure != AhoCorasickNode::kNoSuchEdge ? edge_from_failure |
224 ? edge_from_failure | 226 : 0; |
225 : 0; | |
226 tree_[leads_to].set_failure(follow_in_case_of_failure); | 227 tree_[leads_to].set_failure(follow_in_case_of_failure); |
227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 228 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
228 } | 229 } |
229 } | 230 } |
230 } | 231 } |
231 | 232 |
232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; | 233 const uint32_t SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = 0xFFFFFFFF; |
233 | 234 |
234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 235 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
235 : failure_(kNoSuchEdge) {} | 236 : failure_(kNoSuchEdge) {} |
236 | 237 |
237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 238 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
238 | 239 |
239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 240 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
240 const SubstringSetMatcher::AhoCorasickNode& other) | 241 const SubstringSetMatcher::AhoCorasickNode& other) |
241 : edges_(other.edges_), | 242 : edges_(other.edges_), |
242 failure_(other.failure_), | 243 failure_(other.failure_), |
243 matches_(other.matches_) {} | 244 matches_(other.matches_) {} |
244 | 245 |
245 SubstringSetMatcher::AhoCorasickNode& | 246 SubstringSetMatcher::AhoCorasickNode& |
246 SubstringSetMatcher::AhoCorasickNode::operator=( | 247 SubstringSetMatcher::AhoCorasickNode::operator=( |
247 const SubstringSetMatcher::AhoCorasickNode& other) { | 248 const SubstringSetMatcher::AhoCorasickNode& other) { |
248 edges_ = other.edges_; | 249 edges_ = other.edges_; |
249 failure_ = other.failure_; | 250 failure_ = other.failure_; |
250 matches_ = other.matches_; | 251 matches_ = other.matches_; |
251 return *this; | 252 return *this; |
252 } | 253 } |
253 | 254 |
254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 255 uint32_t SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
255 Edges::const_iterator i = edges_.find(c); | 256 Edges::const_iterator i = edges_.find(c); |
256 return i == edges_.end() ? kNoSuchEdge : i->second; | 257 return i == edges_.end() ? kNoSuchEdge : i->second; |
257 } | 258 } |
258 | 259 |
259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { | 260 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32_t node) { |
260 edges_[c] = node; | 261 edges_[c] = node; |
261 } | 262 } |
262 | 263 |
263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 264 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
264 matches_.insert(id); | 265 matches_.insert(id); |
265 } | 266 } |
266 | 267 |
267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 268 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 269 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
269 matches_.insert(matches.begin(), matches.end()); | 270 matches_.insert(matches.begin(), matches.end()); |
270 } | 271 } |
271 | 272 |
272 } // namespace url_matcher | 273 } // namespace url_matcher |
OLD | NEW |