OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
6 | 6 |
7 #include <algorithm> | |
7 #include <queue> | 8 #include <queue> |
8 | 9 |
9 #include "base/logging.h" | 10 #include "base/logging.h" |
10 #include "base/stl_util.h" | 11 #include "base/stl_util.h" |
11 | 12 |
12 namespace extensions { | 13 namespace extensions { |
13 | 14 |
15 namespace { | |
16 | |
17 // Compare StringPattern instances based on their string patterns. | |
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | |
19 return a->pattern() <= b->pattern(); | |
20 } | |
21 | |
22 // Given the set of patterns, compute how many nodes will the corresponding | |
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | |
24 unsigned TreeSize(const std::vector<const StringPattern*>& patterns) { | |
battre
2013/05/03 21:59:12
This should be size_t; also in all other places.
vabr (Chromium)
2013/05/05 23:36:49
On my machine, size_t is 8 bytes, unsigned is only
| |
25 unsigned result = 1u; // 1 for the root node. | |
26 if (patterns.empty()) | |
27 return result; | |
28 | |
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | |
30 std::vector<const StringPattern*>::const_iterator current = last + 1; | |
31 // For the first pattern, each letter is a label of an edge to a new node. | |
32 result += (*last)->pattern().size(); | |
33 | |
34 // For the subsequent patterns, only count the edges which were not counted | |
35 // yet. For this it suffices to test against the previous pattern, because the | |
36 // patterns are sorted. | |
37 for (; current != patterns.end(); ++last, ++current) { | |
38 const std::string& last_s = (*last)->pattern(); | |
battre
2013/05/03 21:59:12
what does "_s" stand for? Can you pick a name with
vabr (Chromium)
2013/05/05 23:36:49
Done.
| |
39 const std::string& current_s = (*current)->pattern(); | |
40 const unsigned prefix_bound = std::min(last_s.size(), current_s.size()); | |
41 | |
42 unsigned common_prefix = 0; | |
43 while (common_prefix < prefix_bound && | |
44 last_s[common_prefix] == current_s[common_prefix]) | |
45 ++common_prefix; | |
46 result += current_s.size() - common_prefix; | |
47 } | |
48 return result; | |
49 } | |
50 | |
51 } // namespace | |
52 | |
14 // | 53 // |
15 // SubstringSetMatcher | 54 // SubstringSetMatcher |
16 // | 55 // |
17 | 56 |
18 SubstringSetMatcher::SubstringSetMatcher() { | 57 SubstringSetMatcher::SubstringSetMatcher() { |
19 RebuildAhoCorasickTree(); | 58 RebuildAhoCorasickTree(); |
20 } | 59 } |
21 | 60 |
22 SubstringSetMatcher::~SubstringSetMatcher() {} | 61 SubstringSetMatcher::~SubstringSetMatcher() {} |
23 | 62 |
(...skipping 18 matching lines...) Expand all Loading... | |
42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 81 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
43 patterns_[(*i)->id()] = *i; | 82 patterns_[(*i)->id()] = *i; |
44 } | 83 } |
45 | 84 |
46 // Unregister patterns | 85 // Unregister patterns |
47 for (std::vector<const StringPattern*>::const_iterator i = | 86 for (std::vector<const StringPattern*>::const_iterator i = |
48 to_unregister.begin(); i != to_unregister.end(); ++i) { | 87 to_unregister.begin(); i != to_unregister.end(); ++i) { |
49 patterns_.erase((*i)->id()); | 88 patterns_.erase((*i)->id()); |
50 } | 89 } |
51 | 90 |
91 // We compute the total number of tree nodes needed by using this vector. | |
92 std::vector<const StringPattern*> sorted_patterns(patterns_.size()); | |
93 // First, fill it from |patterns_|. | |
94 size_t next = 0; | |
95 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | |
96 i != patterns_.end(); | |
97 ++i, ++next) { | |
98 sorted_patterns[next] = i->second; | |
99 } | |
100 | |
101 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns); | |
102 tree_.reserve(TreeSize(sorted_patterns)); | |
103 | |
52 RebuildAhoCorasickTree(); | 104 RebuildAhoCorasickTree(); |
53 } | 105 } |
54 | 106 |
55 bool SubstringSetMatcher::Match(const std::string& text, | 107 bool SubstringSetMatcher::Match(const std::string& text, |
56 std::set<StringPattern::ID>* matches) const { | 108 std::set<StringPattern::ID>* matches) const { |
57 size_t old_number_of_matches = matches->size(); | 109 const size_t old_number_of_matches = matches->size(); |
58 | 110 |
59 // Handle patterns matching the empty string. | 111 // Handle patterns matching the empty string. |
60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 112 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
61 | 113 |
62 int current_node = 0; | 114 unsigned current_node = 0; |
63 size_t text_length = text.length(); | 115 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
64 for (size_t i = 0; i < text_length; ++i) { | 116 unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) | 117 while (edge_from_current == AhoCorasickNode::kInvalidIndex && |
118 current_node != 0) { | |
66 current_node = tree_[current_node].failure(); | 119 current_node = tree_[current_node].failure(); |
67 if (tree_[current_node].HasEdge(text[i])) { | 120 edge_from_current = tree_[current_node].GetEdge(*i); |
68 current_node = tree_[current_node].GetEdge(text[i]); | 121 } |
122 if (edge_from_current != AhoCorasickNode::kInvalidIndex) { | |
123 current_node = edge_from_current; | |
69 matches->insert(tree_[current_node].matches().begin(), | 124 matches->insert(tree_[current_node].matches().begin(), |
70 tree_[current_node].matches().end()); | 125 tree_[current_node].matches().end()); |
71 } else { | 126 } else { |
72 DCHECK_EQ(0, current_node); | 127 DCHECK_EQ(0u, current_node); |
73 } | 128 } |
74 } | 129 } |
75 | 130 |
76 return old_number_of_matches != matches->size(); | 131 return old_number_of_matches != matches->size(); |
77 } | 132 } |
78 | 133 |
79 bool SubstringSetMatcher::IsEmpty() const { | 134 bool SubstringSetMatcher::IsEmpty() const { |
80 // An empty tree consists of only the root node. | 135 // An empty tree consists of only the root node. |
81 return patterns_.empty() && tree_.size() == 1u; | 136 return patterns_.empty() && tree_.size() == 1u; |
82 } | 137 } |
(...skipping 11 matching lines...) Expand all Loading... | |
94 i != patterns_.end(); ++i) { | 149 i != patterns_.end(); ++i) { |
95 InsertPatternIntoAhoCorasickTree(i->second); | 150 InsertPatternIntoAhoCorasickTree(i->second); |
96 } | 151 } |
97 | 152 |
98 CreateFailureEdges(); | 153 CreateFailureEdges(); |
99 } | 154 } |
100 | 155 |
101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 156 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
102 const StringPattern* pattern) { | 157 const StringPattern* pattern) { |
103 const std::string& text = pattern->pattern(); | 158 const std::string& text = pattern->pattern(); |
104 size_t text_length = text.length(); | |
105 | 159 |
106 // Iterators on the tree and the text. | 160 // Iterators on the tree and the text. |
107 int current_node = 0; | 161 unsigned current_node = 0; |
108 size_t text_pos = 0; | 162 std::string::const_iterator i = text.begin(); |
109 | 163 |
110 // Follow existing paths for as long as possible. | 164 // Follow existing paths for as long as possible. |
111 while (text_pos < text_length && | 165 unsigned edge_from_current = tree_[current_node].GetEdge(*i); |
112 tree_[current_node].HasEdge(text[text_pos])) { | 166 while (i != text.end() && |
battre
2013/05/03 21:59:12
text is immutable. Should we have a const std::str
vabr (Chromium)
2013/05/05 23:36:49
It does not seem to make a difference performance-
| |
113 current_node = tree_[current_node].GetEdge(text[text_pos]); | 167 edge_from_current != AhoCorasickNode::kInvalidIndex) { |
114 ++text_pos; | 168 current_node = edge_from_current; |
169 ++i; | |
170 edge_from_current = tree_[current_node].GetEdge(*i); | |
115 } | 171 } |
116 | 172 |
117 // Create new nodes if necessary. | 173 // Create new nodes if necessary. |
118 while (text_pos < text_length) { | 174 while (i != text.end()) { |
119 AhoCorasickNode new_node; | 175 tree_.push_back(AhoCorasickNode()); |
120 tree_.push_back(new_node); | 176 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); | |
122 current_node = tree_.size() - 1; | 177 current_node = tree_.size() - 1; |
123 ++text_pos; | 178 ++i; |
124 } | 179 } |
125 | 180 |
126 // Register match. | 181 // Register match. |
127 tree_[current_node].AddMatch(pattern->id()); | 182 tree_[current_node].AddMatch(pattern->id()); |
128 } | 183 } |
129 | 184 |
130 void SubstringSetMatcher::CreateFailureEdges() { | 185 void SubstringSetMatcher::CreateFailureEdges() { |
131 typedef AhoCorasickNode::Edges Edges; | 186 typedef AhoCorasickNode::Edges Edges; |
132 | 187 |
133 std::queue<int> queue; | 188 std::queue<unsigned> queue; |
134 | 189 |
135 AhoCorasickNode& root = tree_[0]; | 190 AhoCorasickNode& root = tree_[0]; |
136 root.set_failure(0); | 191 root.set_failure(0); |
137 const AhoCorasickNode::Edges& root_edges = root.edges(); | 192 const Edges& root_edges = root.edges(); |
138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 193 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
139 ++e) { | 194 ++e) { |
140 int leads_to = e->second; | 195 const unsigned& leads_to = e->second; |
vabr (Chromium)
2013/05/03 20:37:27
I know this is uncommon. However, this method is a
battre
2013/05/03 21:59:12
Have you checked what happens if you used "const i
vabr (Chromium)
2013/05/05 23:36:49
Are you asking about removing the reference, or mo
| |
141 tree_[leads_to].set_failure(0); | 196 tree_[leads_to].set_failure(0); |
142 queue.push(leads_to); | 197 queue.push(leads_to); |
143 } | 198 } |
144 | 199 |
145 while (!queue.empty()) { | 200 while (!queue.empty()) { |
146 AhoCorasickNode& current_node = tree_[queue.front()]; | 201 AhoCorasickNode& current_node = tree_[queue.front()]; |
147 queue.pop(); | 202 queue.pop(); |
148 for (Edges::const_iterator e = current_node.edges().begin(); | 203 for (Edges::const_iterator e = current_node.edges().begin(); |
149 e != current_node.edges().end(); ++e) { | 204 e != current_node.edges().end(); ++e) { |
150 char edge_label = e->first; | 205 const char& edge_label = e->first; |
151 int leads_to = e->second; | 206 const unsigned& leads_to = e->second; |
152 queue.push(leads_to); | 207 queue.push(leads_to); |
153 | 208 |
154 int failure = current_node.failure(); | 209 unsigned failure = current_node.failure(); |
155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) | 210 unsigned edge_from_failure = tree_[failure].GetEdge(edge_label); |
211 while (edge_from_failure == AhoCorasickNode::kInvalidIndex && | |
212 failure != 0) { | |
156 failure = tree_[failure].failure(); | 213 failure = tree_[failure].failure(); |
214 edge_from_failure = tree_[failure].GetEdge(edge_label); | |
215 } | |
157 | 216 |
158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? | 217 const unsigned follow_in_case_of_failure = |
159 tree_[failure].GetEdge(edge_label) : 0; | 218 edge_from_failure != AhoCorasickNode::kInvalidIndex |
219 ? edge_from_failure | |
220 : 0; | |
160 tree_[leads_to].set_failure(follow_in_case_of_failure); | 221 tree_[leads_to].set_failure(follow_in_case_of_failure); |
161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 222 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
162 } | 223 } |
163 } | 224 } |
164 } | 225 } |
165 | 226 |
166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 227 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
167 : failure_(-1) {} | 228 : failure_(kInvalidIndex) {} |
168 | 229 |
169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 230 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
170 | 231 |
171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 232 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
172 const SubstringSetMatcher::AhoCorasickNode& other) | 233 const SubstringSetMatcher::AhoCorasickNode& other) |
173 : edges_(other.edges_), | 234 : edges_(other.edges_), |
174 failure_(other.failure_), | 235 failure_(other.failure_), |
175 matches_(other.matches_) {} | 236 matches_(other.matches_) {} |
176 | 237 |
177 SubstringSetMatcher::AhoCorasickNode& | 238 SubstringSetMatcher::AhoCorasickNode& |
178 SubstringSetMatcher::AhoCorasickNode::operator=( | 239 SubstringSetMatcher::AhoCorasickNode::operator=( |
179 const SubstringSetMatcher::AhoCorasickNode& other) { | 240 const SubstringSetMatcher::AhoCorasickNode& other) { |
180 edges_ = other.edges_; | 241 edges_ = other.edges_; |
181 failure_ = other.failure_; | 242 failure_ = other.failure_; |
182 matches_ = other.matches_; | 243 matches_ = other.matches_; |
183 return *this; | 244 return *this; |
184 } | 245 } |
185 | 246 |
186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { | 247 unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge( |
187 return edges_.find(c) != edges_.end(); | 248 char c) const { |
249 Edges::const_iterator i = edges_.find(c); | |
250 return i == edges_.end() ? kInvalidIndex : i->second; | |
188 } | 251 } |
189 | 252 |
190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 253 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) { |
191 std::map<char, int>::const_iterator i = edges_.find(c); | |
192 return i == edges_.end() ? -1 : i->second; | |
193 } | |
194 | |
195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { | |
196 edges_[c] = node; | 254 edges_[c] = node; |
197 } | 255 } |
198 | 256 |
199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 257 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
200 matches_.insert(id); | 258 matches_.insert(id); |
201 } | 259 } |
202 | 260 |
203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 261 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 262 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
205 matches_.insert(matches.begin(), matches.end()); | 263 matches_.insert(matches.begin(), matches.end()); |
206 } | 264 } |
207 | 265 |
208 } // namespace extensions | 266 } // namespace extensions |
OLD | NEW |