OLD | NEW |
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
6 | 6 |
7 #include <algorithm> | |
8 #include <queue> | 7 #include <queue> |
9 | 8 |
10 #include "base/logging.h" | 9 #include "base/logging.h" |
11 #include "base/stl_util.h" | 10 #include "base/stl_util.h" |
12 | 11 |
13 namespace extensions { | 12 namespace extensions { |
14 | 13 |
15 namespace { | |
16 | |
17 // Compare StringPattern instances based on their string patterns. | |
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | |
19 return a->pattern() <= b->pattern(); | |
20 } | |
21 | |
22 // Given the set of patterns, compute how many nodes will the corresponding | |
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | |
24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { | |
25 uint32 result = 1u; // 1 for the root node. | |
26 if (patterns.empty()) | |
27 return result; | |
28 | |
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | |
30 std::vector<const StringPattern*>::const_iterator current = last + 1; | |
31 // For the first pattern, each letter is a label of an edge to a new node. | |
32 result += (*last)->pattern().size(); | |
33 | |
34 // For the subsequent patterns, only count the edges which were not counted | |
35 // yet. For this it suffices to test against the previous pattern, because the | |
36 // patterns are sorted. | |
37 for (; current != patterns.end(); ++last, ++current) { | |
38 const std::string& last_pattern = (*last)->pattern(); | |
39 const std::string& current_pattern = (*current)->pattern(); | |
40 const uint32 prefix_bound = | |
41 std::min(last_pattern.size(), current_pattern.size()); | |
42 | |
43 uint32 common_prefix = 0; | |
44 while (common_prefix < prefix_bound && | |
45 last_pattern[common_prefix] == current_pattern[common_prefix]) | |
46 ++common_prefix; | |
47 result += current_pattern.size() - common_prefix; | |
48 } | |
49 return result; | |
50 } | |
51 | |
52 } // namespace | |
53 | |
54 // | 14 // |
55 // SubstringSetMatcher | 15 // SubstringSetMatcher |
56 // | 16 // |
57 | 17 |
58 SubstringSetMatcher::SubstringSetMatcher() { | 18 SubstringSetMatcher::SubstringSetMatcher() { |
59 RebuildAhoCorasickTree(SubstringPatternVector()); | 19 RebuildAhoCorasickTree(); |
60 } | 20 } |
61 | 21 |
62 SubstringSetMatcher::~SubstringSetMatcher() {} | 22 SubstringSetMatcher::~SubstringSetMatcher() {} |
63 | 23 |
64 void SubstringSetMatcher::RegisterPatterns( | 24 void SubstringSetMatcher::RegisterPatterns( |
65 const std::vector<const StringPattern*>& patterns) { | 25 const std::vector<const StringPattern*>& patterns) { |
66 RegisterAndUnregisterPatterns(patterns, | 26 RegisterAndUnregisterPatterns(patterns, |
67 std::vector<const StringPattern*>()); | 27 std::vector<const StringPattern*>()); |
68 } | 28 } |
69 | 29 |
(...skipping 12 matching lines...) Expand all Loading... |
82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
83 patterns_[(*i)->id()] = *i; | 43 patterns_[(*i)->id()] = *i; |
84 } | 44 } |
85 | 45 |
86 // Unregister patterns | 46 // Unregister patterns |
87 for (std::vector<const StringPattern*>::const_iterator i = | 47 for (std::vector<const StringPattern*>::const_iterator i = |
88 to_unregister.begin(); i != to_unregister.end(); ++i) { | 48 to_unregister.begin(); i != to_unregister.end(); ++i) { |
89 patterns_.erase((*i)->id()); | 49 patterns_.erase((*i)->id()); |
90 } | 50 } |
91 | 51 |
92 // Now we compute the total number of tree nodes needed. | 52 RebuildAhoCorasickTree(); |
93 SubstringPatternVector sorted_patterns; | |
94 sorted_patterns.resize(patterns_.size()); | |
95 | |
96 size_t next = 0; | |
97 for (SubstringPatternMap::const_iterator i = patterns_.begin(); | |
98 i != patterns_.end(); | |
99 ++i, ++next) { | |
100 sorted_patterns[next] = i->second; | |
101 } | |
102 | |
103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); | |
104 tree_.reserve(TreeSize(sorted_patterns)); | |
105 | |
106 RebuildAhoCorasickTree(sorted_patterns); | |
107 } | 53 } |
108 | 54 |
109 bool SubstringSetMatcher::Match(const std::string& text, | 55 bool SubstringSetMatcher::Match(const std::string& text, |
110 std::set<StringPattern::ID>* matches) const { | 56 std::set<StringPattern::ID>* matches) const { |
111 const size_t old_number_of_matches = matches->size(); | 57 size_t old_number_of_matches = matches->size(); |
112 | 58 |
113 // Handle patterns matching the empty string. | 59 // Handle patterns matching the empty string. |
114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
115 | 61 |
116 uint32 current_node = 0; | 62 int current_node = 0; |
117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { | 63 size_t text_length = text.length(); |
118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 64 for (size_t i = 0; i < text_length; ++i) { |
119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && | 65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) |
120 current_node != 0) { | |
121 current_node = tree_[current_node].failure(); | 66 current_node = tree_[current_node].failure(); |
122 edge_from_current = tree_[current_node].GetEdge(*i); | 67 if (tree_[current_node].HasEdge(text[i])) { |
123 } | 68 current_node = tree_[current_node].GetEdge(text[i]); |
124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | |
125 current_node = edge_from_current; | |
126 matches->insert(tree_[current_node].matches().begin(), | 69 matches->insert(tree_[current_node].matches().begin(), |
127 tree_[current_node].matches().end()); | 70 tree_[current_node].matches().end()); |
128 } else { | 71 } else { |
129 DCHECK_EQ(0u, current_node); | 72 DCHECK_EQ(0, current_node); |
130 } | 73 } |
131 } | 74 } |
132 | 75 |
133 return old_number_of_matches != matches->size(); | 76 return old_number_of_matches != matches->size(); |
134 } | 77 } |
135 | 78 |
136 bool SubstringSetMatcher::IsEmpty() const { | 79 bool SubstringSetMatcher::IsEmpty() const { |
137 // An empty tree consists of only the root node. | 80 // An empty tree consists of only the root node. |
138 return patterns_.empty() && tree_.size() == 1u; | 81 return patterns_.empty() && tree_.size() == 1u; |
139 } | 82 } |
140 | 83 |
141 void SubstringSetMatcher::RebuildAhoCorasickTree( | 84 void SubstringSetMatcher::RebuildAhoCorasickTree() { |
142 const SubstringPatternVector& sorted_patterns) { | |
143 tree_.clear(); | 85 tree_.clear(); |
144 | 86 |
145 // Initialize root note of tree. | 87 // Initialize root note of tree. |
146 AhoCorasickNode root; | 88 AhoCorasickNode root; |
147 root.set_failure(0); | 89 root.set_failure(0); |
148 tree_.push_back(root); | 90 tree_.push_back(root); |
149 | 91 |
150 // Insert all patterns. | 92 // Insert all patterns. |
151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin(); | 93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); |
152 i != sorted_patterns.end(); | 94 i != patterns_.end(); ++i) { |
153 ++i) { | 95 InsertPatternIntoAhoCorasickTree(i->second); |
154 InsertPatternIntoAhoCorasickTree(*i); | |
155 } | 96 } |
156 | 97 |
157 CreateFailureEdges(); | 98 CreateFailureEdges(); |
158 } | 99 } |
159 | 100 |
160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
161 const StringPattern* pattern) { | 102 const StringPattern* pattern) { |
162 const std::string& text = pattern->pattern(); | 103 const std::string& text = pattern->pattern(); |
163 const std::string::const_iterator text_end = text.end(); | 104 size_t text_length = text.length(); |
164 | 105 |
165 // Iterators on the tree and the text. | 106 // Iterators on the tree and the text. |
166 uint32 current_node = 0; | 107 int current_node = 0; |
167 std::string::const_iterator i = text.begin(); | 108 size_t text_pos = 0; |
168 | 109 |
169 // Follow existing paths for as long as possible. | 110 // Follow existing paths for as long as possible. |
170 uint32 edge_from_current = tree_[current_node].GetEdge(*i); | 111 while (text_pos < text_length && |
171 while (i != text_end && | 112 tree_[current_node].HasEdge(text[text_pos])) { |
172 edge_from_current != AhoCorasickNode::kNoSuchEdge) { | 113 current_node = tree_[current_node].GetEdge(text[text_pos]); |
173 current_node = edge_from_current; | 114 ++text_pos; |
174 ++i; | |
175 edge_from_current = tree_[current_node].GetEdge(*i); | |
176 } | 115 } |
177 | 116 |
178 // Create new nodes if necessary. | 117 // Create new nodes if necessary. |
179 while (i != text_end) { | 118 while (text_pos < text_length) { |
180 tree_.push_back(AhoCorasickNode()); | 119 AhoCorasickNode new_node; |
181 tree_[current_node].SetEdge(*i, tree_.size() - 1); | 120 tree_.push_back(new_node); |
| 121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); |
182 current_node = tree_.size() - 1; | 122 current_node = tree_.size() - 1; |
183 ++i; | 123 ++text_pos; |
184 } | 124 } |
185 | 125 |
186 // Register match. | 126 // Register match. |
187 tree_[current_node].AddMatch(pattern->id()); | 127 tree_[current_node].AddMatch(pattern->id()); |
188 } | 128 } |
189 | 129 |
190 void SubstringSetMatcher::CreateFailureEdges() { | 130 void SubstringSetMatcher::CreateFailureEdges() { |
191 typedef AhoCorasickNode::Edges Edges; | 131 typedef AhoCorasickNode::Edges Edges; |
192 | 132 |
193 std::queue<uint32> queue; | 133 std::queue<int> queue; |
194 | 134 |
195 AhoCorasickNode& root = tree_[0]; | 135 AhoCorasickNode& root = tree_[0]; |
196 root.set_failure(0); | 136 root.set_failure(0); |
197 const Edges& root_edges = root.edges(); | 137 const AhoCorasickNode::Edges& root_edges = root.edges(); |
198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
199 ++e) { | 139 ++e) { |
200 const uint32& leads_to = e->second; | 140 int leads_to = e->second; |
201 tree_[leads_to].set_failure(0); | 141 tree_[leads_to].set_failure(0); |
202 queue.push(leads_to); | 142 queue.push(leads_to); |
203 } | 143 } |
204 | 144 |
205 while (!queue.empty()) { | 145 while (!queue.empty()) { |
206 AhoCorasickNode& current_node = tree_[queue.front()]; | 146 AhoCorasickNode& current_node = tree_[queue.front()]; |
207 queue.pop(); | 147 queue.pop(); |
208 for (Edges::const_iterator e = current_node.edges().begin(); | 148 for (Edges::const_iterator e = current_node.edges().begin(); |
209 e != current_node.edges().end(); ++e) { | 149 e != current_node.edges().end(); ++e) { |
210 const char& edge_label = e->first; | 150 char edge_label = e->first; |
211 const uint32& leads_to = e->second; | 151 int leads_to = e->second; |
212 queue.push(leads_to); | 152 queue.push(leads_to); |
213 | 153 |
214 uint32 failure = current_node.failure(); | 154 int failure = current_node.failure(); |
215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); | 155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) |
216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | |
217 failure != 0) { | |
218 failure = tree_[failure].failure(); | 156 failure = tree_[failure].failure(); |
219 edge_from_failure = tree_[failure].GetEdge(edge_label); | |
220 } | |
221 | 157 |
222 const uint32 follow_in_case_of_failure = | 158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? |
223 edge_from_failure != AhoCorasickNode::kNoSuchEdge | 159 tree_[failure].GetEdge(edge_label) : 0; |
224 ? edge_from_failure | |
225 : 0; | |
226 tree_[leads_to].set_failure(follow_in_case_of_failure); | 160 tree_[leads_to].set_failure(follow_in_case_of_failure); |
227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
228 } | 162 } |
229 } | 163 } |
230 } | 164 } |
231 | 165 |
232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0; | |
233 | |
234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
235 : failure_(kNoSuchEdge) {} | 167 : failure_(-1) {} |
236 | 168 |
237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
238 | 170 |
239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
240 const SubstringSetMatcher::AhoCorasickNode& other) | 172 const SubstringSetMatcher::AhoCorasickNode& other) |
241 : edges_(other.edges_), | 173 : edges_(other.edges_), |
242 failure_(other.failure_), | 174 failure_(other.failure_), |
243 matches_(other.matches_) {} | 175 matches_(other.matches_) {} |
244 | 176 |
245 SubstringSetMatcher::AhoCorasickNode& | 177 SubstringSetMatcher::AhoCorasickNode& |
246 SubstringSetMatcher::AhoCorasickNode::operator=( | 178 SubstringSetMatcher::AhoCorasickNode::operator=( |
247 const SubstringSetMatcher::AhoCorasickNode& other) { | 179 const SubstringSetMatcher::AhoCorasickNode& other) { |
248 edges_ = other.edges_; | 180 edges_ = other.edges_; |
249 failure_ = other.failure_; | 181 failure_ = other.failure_; |
250 matches_ = other.matches_; | 182 matches_ = other.matches_; |
251 return *this; | 183 return *this; |
252 } | 184 } |
253 | 185 |
254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { |
255 Edges::const_iterator i = edges_.find(c); | 187 return edges_.find(c) != edges_.end(); |
256 return i == edges_.end() ? kNoSuchEdge : i->second; | |
257 } | 188 } |
258 | 189 |
259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { | 190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
| 191 std::map<char, int>::const_iterator i = edges_.find(c); |
| 192 return i == edges_.end() ? -1 : i->second; |
| 193 } |
| 194 |
| 195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { |
260 edges_[c] = node; | 196 edges_[c] = node; |
261 } | 197 } |
262 | 198 |
263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
264 matches_.insert(id); | 200 matches_.insert(id); |
265 } | 201 } |
266 | 202 |
267 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
269 matches_.insert(matches.begin(), matches.end()); | 205 matches_.insert(matches.begin(), matches.end()); |
270 } | 206 } |
271 | 207 |
272 } // namespace extensions | 208 } // namespace extensions |
OLD | NEW |