OLD | NEW |
---|---|
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. | 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "extensions/common/matcher/substring_set_matcher.h" | 5 #include "extensions/common/matcher/substring_set_matcher.h" |
6 | 6 |
7 #include <algorithm> | |
7 #include <queue> | 8 #include <queue> |
8 | 9 |
9 #include "base/logging.h" | 10 #include "base/logging.h" |
10 #include "base/stl_util.h" | 11 #include "base/stl_util.h" |
11 | 12 |
12 namespace extensions { | 13 namespace extensions { |
13 | 14 |
15 namespace { | |
16 | |
17 // Compare StringPattern instances based on their string patterns. | |
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) { | |
19 return a->pattern() <= b->pattern(); | |
20 } | |
21 | |
22 // Given the set of patterns, compute how many nodes will the corresponding | |
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted. | |
24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) { | |
25 uint32 result = 1u; // 1 for the root node. | |
26 if (patterns.empty()) | |
27 return result; | |
28 | |
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin(); | |
30 std::vector<const StringPattern*>::const_iterator current = last + 1; | |
31 // For the first pattern, each letter is a label of an edge to a new node. | |
32 result += (*last)->pattern().size(); | |
33 | |
34 // For the subsequent patterns, only count the edges which were not counted | |
35 // yet. For this it suffices to test against the previous pattern, because the | |
36 // patterns are sorted. | |
37 for (; current != patterns.end(); ++last, ++current) { | |
38 const std::string& last_pattern = (*last)->pattern(); | |
39 const std::string& current_pattern = (*current)->pattern(); | |
40 const uint32 prefix_bound = | |
41 std::min(last_pattern.size(), current_pattern.size()); | |
42 | |
43 uint32 common_prefix = 0; | |
44 while (common_prefix < prefix_bound && | |
45 last_pattern[common_prefix] == current_pattern[common_prefix]) | |
46 ++common_prefix; | |
47 result += current_pattern.size() - common_prefix; | |
48 } | |
49 return result; | |
50 } | |
51 | |
52 } // namespace | |
53 | |
14 // | 54 // |
15 // SubstringSetMatcher | 55 // SubstringSetMatcher |
16 // | 56 // |
17 | 57 |
18 SubstringSetMatcher::SubstringSetMatcher() { | 58 SubstringSetMatcher::SubstringSetMatcher() { |
19 RebuildAhoCorasickTree(); | 59 RebuildAhoCorasickTree(); |
20 } | 60 } |
21 | 61 |
22 SubstringSetMatcher::~SubstringSetMatcher() {} | 62 SubstringSetMatcher::~SubstringSetMatcher() {} |
23 | 63 |
(...skipping 18 matching lines...) Expand all Loading... | |
42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); | 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); |
43 patterns_[(*i)->id()] = *i; | 83 patterns_[(*i)->id()] = *i; |
44 } | 84 } |
45 | 85 |
46 // Unregister patterns | 86 // Unregister patterns |
47 for (std::vector<const StringPattern*>::const_iterator i = | 87 for (std::vector<const StringPattern*>::const_iterator i = |
48 to_unregister.begin(); i != to_unregister.end(); ++i) { | 88 to_unregister.begin(); i != to_unregister.end(); ++i) { |
49 patterns_.erase((*i)->id()); | 89 patterns_.erase((*i)->id()); |
50 } | 90 } |
51 | 91 |
92 // Now we compute the total number of tree nodes needed. | |
93 sorted_patterns_.clear(); | |
94 sorted_patterns_.resize(patterns_.size()); | |
battre
2013/05/07 14:30:03
I think if you use it this way, sorted_patterns sh
vabr (Chromium)
2013/05/07 14:59:16
Done.
Originally I hoped to replace |patterns_| w
| |
95 | |
96 size_t next = 0; | |
97 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | |
98 i != patterns_.end(); | |
99 ++i, ++next) { | |
100 sorted_patterns_[next] = i->second; | |
101 } | |
102 | |
103 std::sort(sorted_patterns_.begin(), sorted_patterns_.end(), ComparePatterns); | |
104 tree_.reserve(TreeSize(sorted_patterns_)); | |
105 | |
52 RebuildAhoCorasickTree(); | 106 RebuildAhoCorasickTree(); |
53 } | 107 } |
54 | 108 |
55 bool SubstringSetMatcher::Match(const std::string& text, | 109 bool SubstringSetMatcher::Match(const std::string& text, |
56 std::set<StringPattern::ID>* matches) const { | 110 std::set<StringPattern::ID>* matches) const { |
57 size_t old_number_of_matches = matches->size(); | 111 const size_t old_number_of_matches = matches->size(); |
58 | 112 |
59 // Handle patterns matching the empty string. | 113 // Handle patterns matching the empty string. |
60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); | 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); |
61 | 115 |
62 int current_node = 0; | 116 uint32 current_node = 0; |
63 size_t text_length = text.length(); | 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) { |
64 for (size_t i = 0; i < text_length; ++i) { | 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) | 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge && |
120 current_node != 0) { | |
66 current_node = tree_[current_node].failure(); | 121 current_node = tree_[current_node].failure(); |
67 if (tree_[current_node].HasEdge(text[i])) { | 122 edge_from_current = tree_[current_node].GetEdge(*i); |
68 current_node = tree_[current_node].GetEdge(text[i]); | 123 } |
124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) { | |
125 current_node = edge_from_current; | |
69 matches->insert(tree_[current_node].matches().begin(), | 126 matches->insert(tree_[current_node].matches().begin(), |
70 tree_[current_node].matches().end()); | 127 tree_[current_node].matches().end()); |
71 } else { | 128 } else { |
72 DCHECK_EQ(0, current_node); | 129 DCHECK_EQ(0u, current_node); |
73 } | 130 } |
74 } | 131 } |
75 | 132 |
76 return old_number_of_matches != matches->size(); | 133 return old_number_of_matches != matches->size(); |
77 } | 134 } |
78 | 135 |
79 bool SubstringSetMatcher::IsEmpty() const { | 136 bool SubstringSetMatcher::IsEmpty() const { |
80 // An empty tree consists of only the root node. | 137 // An empty tree consists of only the root node. |
81 return patterns_.empty() && tree_.size() == 1u; | 138 return patterns_.empty() && tree_.size() == 1u; |
82 } | 139 } |
83 | 140 |
84 void SubstringSetMatcher::RebuildAhoCorasickTree() { | 141 void SubstringSetMatcher::RebuildAhoCorasickTree() { |
85 tree_.clear(); | 142 tree_.clear(); |
86 | 143 |
87 // Initialize root note of tree. | 144 // Initialize root note of tree. |
88 AhoCorasickNode root; | 145 AhoCorasickNode root; |
89 root.set_failure(0); | 146 root.set_failure(0); |
90 tree_.push_back(root); | 147 tree_.push_back(root); |
91 | 148 |
92 // Insert all patterns. | 149 // Insert all patterns. |
93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); | 150 for (SubstringPatternVector::const_iterator i = sorted_patterns_.begin(); |
94 i != patterns_.end(); ++i) { | 151 i != sorted_patterns_.end(); |
95 InsertPatternIntoAhoCorasickTree(i->second); | 152 ++i) { |
153 InsertPatternIntoAhoCorasickTree(*i); | |
96 } | 154 } |
97 | 155 |
98 CreateFailureEdges(); | 156 CreateFailureEdges(); |
99 } | 157 } |
100 | 158 |
101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( | 159 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( |
102 const StringPattern* pattern) { | 160 const StringPattern* pattern) { |
103 const std::string& text = pattern->pattern(); | 161 const std::string& text = pattern->pattern(); |
104 size_t text_length = text.length(); | 162 const std::string::const_iterator text_end = text.end(); |
105 | 163 |
106 // Iterators on the tree and the text. | 164 // Iterators on the tree and the text. |
107 int current_node = 0; | 165 uint32 current_node = 0; |
108 size_t text_pos = 0; | 166 std::string::const_iterator i = text.begin(); |
109 | 167 |
110 // Follow existing paths for as long as possible. | 168 // Follow existing paths for as long as possible. |
111 while (text_pos < text_length && | 169 uint32 edge_from_current = tree_[current_node].GetEdge(*i); |
112 tree_[current_node].HasEdge(text[text_pos])) { | 170 while (i != text_end && |
113 current_node = tree_[current_node].GetEdge(text[text_pos]); | 171 edge_from_current != AhoCorasickNode::kNoSuchEdge) { |
114 ++text_pos; | 172 current_node = edge_from_current; |
173 ++i; | |
174 edge_from_current = tree_[current_node].GetEdge(*i); | |
115 } | 175 } |
116 | 176 |
117 // Create new nodes if necessary. | 177 // Create new nodes if necessary. |
118 while (text_pos < text_length) { | 178 while (i != text_end) { |
119 AhoCorasickNode new_node; | 179 tree_.push_back(AhoCorasickNode()); |
120 tree_.push_back(new_node); | 180 tree_[current_node].SetEdge(*i, tree_.size() - 1); |
121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1); | |
122 current_node = tree_.size() - 1; | 181 current_node = tree_.size() - 1; |
123 ++text_pos; | 182 ++i; |
124 } | 183 } |
125 | 184 |
126 // Register match. | 185 // Register match. |
127 tree_[current_node].AddMatch(pattern->id()); | 186 tree_[current_node].AddMatch(pattern->id()); |
128 } | 187 } |
129 | 188 |
130 void SubstringSetMatcher::CreateFailureEdges() { | 189 void SubstringSetMatcher::CreateFailureEdges() { |
131 typedef AhoCorasickNode::Edges Edges; | 190 typedef AhoCorasickNode::Edges Edges; |
132 | 191 |
133 std::queue<int> queue; | 192 std::queue<uint32> queue; |
134 | 193 |
135 AhoCorasickNode& root = tree_[0]; | 194 AhoCorasickNode& root = tree_[0]; |
136 root.set_failure(0); | 195 root.set_failure(0); |
137 const AhoCorasickNode::Edges& root_edges = root.edges(); | 196 const Edges& root_edges = root.edges(); |
138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); | 197 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); |
139 ++e) { | 198 ++e) { |
140 int leads_to = e->second; | 199 const uint32& leads_to = e->second; |
141 tree_[leads_to].set_failure(0); | 200 tree_[leads_to].set_failure(0); |
142 queue.push(leads_to); | 201 queue.push(leads_to); |
143 } | 202 } |
144 | 203 |
145 while (!queue.empty()) { | 204 while (!queue.empty()) { |
146 AhoCorasickNode& current_node = tree_[queue.front()]; | 205 AhoCorasickNode& current_node = tree_[queue.front()]; |
147 queue.pop(); | 206 queue.pop(); |
148 for (Edges::const_iterator e = current_node.edges().begin(); | 207 for (Edges::const_iterator e = current_node.edges().begin(); |
149 e != current_node.edges().end(); ++e) { | 208 e != current_node.edges().end(); ++e) { |
150 char edge_label = e->first; | 209 const char& edge_label = e->first; |
151 int leads_to = e->second; | 210 const uint32& leads_to = e->second; |
152 queue.push(leads_to); | 211 queue.push(leads_to); |
153 | 212 |
154 int failure = current_node.failure(); | 213 uint32 failure = current_node.failure(); |
155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) | 214 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label); |
215 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge && | |
216 failure != 0) { | |
156 failure = tree_[failure].failure(); | 217 failure = tree_[failure].failure(); |
218 edge_from_failure = tree_[failure].GetEdge(edge_label); | |
219 } | |
157 | 220 |
158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? | 221 const uint32 follow_in_case_of_failure = |
159 tree_[failure].GetEdge(edge_label) : 0; | 222 edge_from_failure != AhoCorasickNode::kNoSuchEdge |
223 ? edge_from_failure | |
224 : 0; | |
160 tree_[leads_to].set_failure(follow_in_case_of_failure); | 225 tree_[leads_to].set_failure(follow_in_case_of_failure); |
161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); | 226 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); |
162 } | 227 } |
163 } | 228 } |
164 } | 229 } |
165 | 230 |
166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() | 231 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() |
167 : failure_(-1) {} | 232 : failure_(kNoSuchEdge) {} |
168 | 233 |
169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} | 234 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} |
170 | 235 |
171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( | 236 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( |
172 const SubstringSetMatcher::AhoCorasickNode& other) | 237 const SubstringSetMatcher::AhoCorasickNode& other) |
173 : edges_(other.edges_), | 238 : edges_(other.edges_), |
174 failure_(other.failure_), | 239 failure_(other.failure_), |
175 matches_(other.matches_) {} | 240 matches_(other.matches_) {} |
176 | 241 |
177 SubstringSetMatcher::AhoCorasickNode& | 242 SubstringSetMatcher::AhoCorasickNode& |
178 SubstringSetMatcher::AhoCorasickNode::operator=( | 243 SubstringSetMatcher::AhoCorasickNode::operator=( |
179 const SubstringSetMatcher::AhoCorasickNode& other) { | 244 const SubstringSetMatcher::AhoCorasickNode& other) { |
180 edges_ = other.edges_; | 245 edges_ = other.edges_; |
181 failure_ = other.failure_; | 246 failure_ = other.failure_; |
182 matches_ = other.matches_; | 247 matches_ = other.matches_; |
183 return *this; | 248 return *this; |
184 } | 249 } |
185 | 250 |
186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { | 251 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { |
187 return edges_.find(c) != edges_.end(); | 252 Edges::const_iterator i = edges_.find(c); |
253 return i == edges_.end() ? kNoSuchEdge : i->second; | |
188 } | 254 } |
189 | 255 |
190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { | 256 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) { |
191 std::map<char, int>::const_iterator i = edges_.find(c); | |
192 return i == edges_.end() ? -1 : i->second; | |
193 } | |
194 | |
195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) { | |
196 edges_[c] = node; | 257 edges_[c] = node; |
197 } | 258 } |
198 | 259 |
199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { | 260 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { |
200 matches_.insert(id); | 261 matches_.insert(id); |
201 } | 262 } |
202 | 263 |
203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( | 264 void SubstringSetMatcher::AhoCorasickNode::AddMatches( |
204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { | 265 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { |
205 matches_.insert(matches.begin(), matches.end()); | 266 matches_.insert(matches.begin(), matches.end()); |
206 } | 267 } |
207 | 268 |
208 } // namespace extensions | 269 } // namespace extensions |
OLD | NEW |