Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(454)

Side by Side Diff: extensions/common/matcher/substring_set_matcher.cc

Issue 15027008: Speed improvements to SubstringSetMatcher (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
« no previous file with comments | « extensions/common/matcher/substring_set_matcher.h ('k') | no next file » | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "extensions/common/matcher/substring_set_matcher.h" 5 #include "extensions/common/matcher/substring_set_matcher.h"
6 6
7 #include <algorithm>
7 #include <queue> 8 #include <queue>
8 9
9 #include "base/logging.h" 10 #include "base/logging.h"
10 #include "base/stl_util.h" 11 #include "base/stl_util.h"
11 12
12 namespace extensions { 13 namespace extensions {
13 14
15 namespace {
16
17 // Compare StringPattern instances based on their string patterns.
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
19 return a->pattern() <= b->pattern();
vabr (Chromium) 2013/05/15 07:04:17 This needs to be strict, otherwise the sort is not
20 }
21
22 // Given the set of patterns, compute how many nodes will the corresponding
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
24 uint32 TreeSize(const std::vector<const StringPattern*>& patterns) {
25 uint32 result = 1u; // 1 for the root node.
26 if (patterns.empty())
27 return result;
28
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin();
30 std::vector<const StringPattern*>::const_iterator current = last + 1;
31 // For the first pattern, each letter is a label of an edge to a new node.
32 result += (*last)->pattern().size();
33
34 // For the subsequent patterns, only count the edges which were not counted
35 // yet. For this it suffices to test against the previous pattern, because the
36 // patterns are sorted.
37 for (; current != patterns.end(); ++last, ++current) {
38 const std::string& last_pattern = (*last)->pattern();
39 const std::string& current_pattern = (*current)->pattern();
40 const uint32 prefix_bound =
41 std::min(last_pattern.size(), current_pattern.size());
42
43 uint32 common_prefix = 0;
44 while (common_prefix < prefix_bound &&
45 last_pattern[common_prefix] == current_pattern[common_prefix])
46 ++common_prefix;
47 result += current_pattern.size() - common_prefix;
48 }
49 return result;
50 }
51
52 } // namespace
53
14 // 54 //
15 // SubstringSetMatcher 55 // SubstringSetMatcher
16 // 56 //
17 57
18 SubstringSetMatcher::SubstringSetMatcher() { 58 SubstringSetMatcher::SubstringSetMatcher() {
19 RebuildAhoCorasickTree(); 59 RebuildAhoCorasickTree(SubstringPatternVector());
20 } 60 }
21 61
22 SubstringSetMatcher::~SubstringSetMatcher() {} 62 SubstringSetMatcher::~SubstringSetMatcher() {}
23 63
24 void SubstringSetMatcher::RegisterPatterns( 64 void SubstringSetMatcher::RegisterPatterns(
25 const std::vector<const StringPattern*>& patterns) { 65 const std::vector<const StringPattern*>& patterns) {
26 RegisterAndUnregisterPatterns(patterns, 66 RegisterAndUnregisterPatterns(patterns,
27 std::vector<const StringPattern*>()); 67 std::vector<const StringPattern*>());
28 } 68 }
29 69
(...skipping 12 matching lines...) Expand all
42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); 82 DCHECK(patterns_.find((*i)->id()) == patterns_.end());
43 patterns_[(*i)->id()] = *i; 83 patterns_[(*i)->id()] = *i;
44 } 84 }
45 85
46 // Unregister patterns 86 // Unregister patterns
47 for (std::vector<const StringPattern*>::const_iterator i = 87 for (std::vector<const StringPattern*>::const_iterator i =
48 to_unregister.begin(); i != to_unregister.end(); ++i) { 88 to_unregister.begin(); i != to_unregister.end(); ++i) {
49 patterns_.erase((*i)->id()); 89 patterns_.erase((*i)->id());
50 } 90 }
51 91
52 RebuildAhoCorasickTree(); 92 // Now we compute the total number of tree nodes needed.
93 SubstringPatternVector sorted_patterns;
94 sorted_patterns.resize(patterns_.size());
95
96 size_t next = 0;
97 for (SubstringPatternMap::const_iterator i = patterns_.begin();
98 i != patterns_.end();
99 ++i, ++next) {
100 sorted_patterns[next] = i->second;
101 }
102
103 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
104 tree_.reserve(TreeSize(sorted_patterns));
105
106 RebuildAhoCorasickTree(sorted_patterns);
53 } 107 }
54 108
55 bool SubstringSetMatcher::Match(const std::string& text, 109 bool SubstringSetMatcher::Match(const std::string& text,
56 std::set<StringPattern::ID>* matches) const { 110 std::set<StringPattern::ID>* matches) const {
57 size_t old_number_of_matches = matches->size(); 111 const size_t old_number_of_matches = matches->size();
58 112
59 // Handle patterns matching the empty string. 113 // Handle patterns matching the empty string.
60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); 114 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
61 115
62 int current_node = 0; 116 uint32 current_node = 0;
63 size_t text_length = text.length(); 117 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
64 for (size_t i = 0; i < text_length; ++i) { 118 uint32 edge_from_current = tree_[current_node].GetEdge(*i);
65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) 119 while (edge_from_current == AhoCorasickNode::kNoSuchEdge &&
120 current_node != 0) {
66 current_node = tree_[current_node].failure(); 121 current_node = tree_[current_node].failure();
67 if (tree_[current_node].HasEdge(text[i])) { 122 edge_from_current = tree_[current_node].GetEdge(*i);
68 current_node = tree_[current_node].GetEdge(text[i]); 123 }
124 if (edge_from_current != AhoCorasickNode::kNoSuchEdge) {
125 current_node = edge_from_current;
69 matches->insert(tree_[current_node].matches().begin(), 126 matches->insert(tree_[current_node].matches().begin(),
70 tree_[current_node].matches().end()); 127 tree_[current_node].matches().end());
71 } else { 128 } else {
72 DCHECK_EQ(0, current_node); 129 DCHECK_EQ(0u, current_node);
73 } 130 }
74 } 131 }
75 132
76 return old_number_of_matches != matches->size(); 133 return old_number_of_matches != matches->size();
77 } 134 }
78 135
79 bool SubstringSetMatcher::IsEmpty() const { 136 bool SubstringSetMatcher::IsEmpty() const {
80 // An empty tree consists of only the root node. 137 // An empty tree consists of only the root node.
81 return patterns_.empty() && tree_.size() == 1u; 138 return patterns_.empty() && tree_.size() == 1u;
82 } 139 }
83 140
84 void SubstringSetMatcher::RebuildAhoCorasickTree() { 141 void SubstringSetMatcher::RebuildAhoCorasickTree(
142 const SubstringPatternVector& sorted_patterns) {
85 tree_.clear(); 143 tree_.clear();
86 144
87 // Initialize root note of tree. 145 // Initialize root note of tree.
88 AhoCorasickNode root; 146 AhoCorasickNode root;
89 root.set_failure(0); 147 root.set_failure(0);
90 tree_.push_back(root); 148 tree_.push_back(root);
91 149
92 // Insert all patterns. 150 // Insert all patterns.
93 for (SubstringPatternSet::const_iterator i = patterns_.begin(); 151 for (SubstringPatternVector::const_iterator i = sorted_patterns.begin();
94 i != patterns_.end(); ++i) { 152 i != sorted_patterns.end();
95 InsertPatternIntoAhoCorasickTree(i->second); 153 ++i) {
154 InsertPatternIntoAhoCorasickTree(*i);
96 } 155 }
97 156
98 CreateFailureEdges(); 157 CreateFailureEdges();
99 } 158 }
100 159
101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( 160 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
102 const StringPattern* pattern) { 161 const StringPattern* pattern) {
103 const std::string& text = pattern->pattern(); 162 const std::string& text = pattern->pattern();
104 size_t text_length = text.length(); 163 const std::string::const_iterator text_end = text.end();
105 164
106 // Iterators on the tree and the text. 165 // Iterators on the tree and the text.
107 int current_node = 0; 166 uint32 current_node = 0;
108 size_t text_pos = 0; 167 std::string::const_iterator i = text.begin();
109 168
110 // Follow existing paths for as long as possible. 169 // Follow existing paths for as long as possible.
111 while (text_pos < text_length && 170 uint32 edge_from_current = tree_[current_node].GetEdge(*i);
vabr (Chromium) 2013/05/15 07:04:17 Here I indeed dereferenced an invalid iterator if
112 tree_[current_node].HasEdge(text[text_pos])) { 171 while (i != text_end &&
113 current_node = tree_[current_node].GetEdge(text[text_pos]); 172 edge_from_current != AhoCorasickNode::kNoSuchEdge) {
114 ++text_pos; 173 current_node = edge_from_current;
174 ++i;
175 edge_from_current = tree_[current_node].GetEdge(*i);
115 } 176 }
116 177
117 // Create new nodes if necessary. 178 // Create new nodes if necessary.
118 while (text_pos < text_length) { 179 while (i != text_end) {
119 AhoCorasickNode new_node; 180 tree_.push_back(AhoCorasickNode());
120 tree_.push_back(new_node); 181 tree_[current_node].SetEdge(*i, tree_.size() - 1);
121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1);
122 current_node = tree_.size() - 1; 182 current_node = tree_.size() - 1;
123 ++text_pos; 183 ++i;
124 } 184 }
125 185
126 // Register match. 186 // Register match.
127 tree_[current_node].AddMatch(pattern->id()); 187 tree_[current_node].AddMatch(pattern->id());
128 } 188 }
129 189
130 void SubstringSetMatcher::CreateFailureEdges() { 190 void SubstringSetMatcher::CreateFailureEdges() {
131 typedef AhoCorasickNode::Edges Edges; 191 typedef AhoCorasickNode::Edges Edges;
132 192
133 std::queue<int> queue; 193 std::queue<uint32> queue;
134 194
135 AhoCorasickNode& root = tree_[0]; 195 AhoCorasickNode& root = tree_[0];
136 root.set_failure(0); 196 root.set_failure(0);
137 const AhoCorasickNode::Edges& root_edges = root.edges(); 197 const Edges& root_edges = root.edges();
138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); 198 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
139 ++e) { 199 ++e) {
140 int leads_to = e->second; 200 const uint32& leads_to = e->second;
141 tree_[leads_to].set_failure(0); 201 tree_[leads_to].set_failure(0);
142 queue.push(leads_to); 202 queue.push(leads_to);
143 } 203 }
144 204
145 while (!queue.empty()) { 205 while (!queue.empty()) {
146 AhoCorasickNode& current_node = tree_[queue.front()]; 206 AhoCorasickNode& current_node = tree_[queue.front()];
147 queue.pop(); 207 queue.pop();
148 for (Edges::const_iterator e = current_node.edges().begin(); 208 for (Edges::const_iterator e = current_node.edges().begin();
149 e != current_node.edges().end(); ++e) { 209 e != current_node.edges().end(); ++e) {
150 char edge_label = e->first; 210 const char& edge_label = e->first;
151 int leads_to = e->second; 211 const uint32& leads_to = e->second;
152 queue.push(leads_to); 212 queue.push(leads_to);
153 213
154 int failure = current_node.failure(); 214 uint32 failure = current_node.failure();
155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) 215 uint32 edge_from_failure = tree_[failure].GetEdge(edge_label);
216 while (edge_from_failure == AhoCorasickNode::kNoSuchEdge &&
217 failure != 0) {
156 failure = tree_[failure].failure(); 218 failure = tree_[failure].failure();
219 edge_from_failure = tree_[failure].GetEdge(edge_label);
220 }
157 221
158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? 222 const uint32 follow_in_case_of_failure =
159 tree_[failure].GetEdge(edge_label) : 0; 223 edge_from_failure != AhoCorasickNode::kNoSuchEdge
224 ? edge_from_failure
225 : 0;
160 tree_[leads_to].set_failure(follow_in_case_of_failure); 226 tree_[leads_to].set_failure(follow_in_case_of_failure);
161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); 227 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
162 } 228 }
163 } 229 }
164 } 230 }
165 231
232 const uint32 SubstringSetMatcher::AhoCorasickNode::kNoSuchEdge = ~0;
233
166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() 234 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
167 : failure_(-1) {} 235 : failure_(kNoSuchEdge) {}
168 236
169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} 237 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
170 238
171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( 239 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
172 const SubstringSetMatcher::AhoCorasickNode& other) 240 const SubstringSetMatcher::AhoCorasickNode& other)
173 : edges_(other.edges_), 241 : edges_(other.edges_),
174 failure_(other.failure_), 242 failure_(other.failure_),
175 matches_(other.matches_) {} 243 matches_(other.matches_) {}
176 244
177 SubstringSetMatcher::AhoCorasickNode& 245 SubstringSetMatcher::AhoCorasickNode&
178 SubstringSetMatcher::AhoCorasickNode::operator=( 246 SubstringSetMatcher::AhoCorasickNode::operator=(
179 const SubstringSetMatcher::AhoCorasickNode& other) { 247 const SubstringSetMatcher::AhoCorasickNode& other) {
180 edges_ = other.edges_; 248 edges_ = other.edges_;
181 failure_ = other.failure_; 249 failure_ = other.failure_;
182 matches_ = other.matches_; 250 matches_ = other.matches_;
183 return *this; 251 return *this;
184 } 252 }
185 253
186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { 254 uint32 SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const {
187 return edges_.find(c) != edges_.end(); 255 Edges::const_iterator i = edges_.find(c);
256 return i == edges_.end() ? kNoSuchEdge : i->second;
188 } 257 }
189 258
190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { 259 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, uint32 node) {
191 std::map<char, int>::const_iterator i = edges_.find(c);
192 return i == edges_.end() ? -1 : i->second;
193 }
194
195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) {
196 edges_[c] = node; 260 edges_[c] = node;
197 } 261 }
198 262
199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { 263 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
200 matches_.insert(id); 264 matches_.insert(id);
201 } 265 }
202 266
203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( 267 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { 268 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
205 matches_.insert(matches.begin(), matches.end()); 269 matches_.insert(matches.begin(), matches.end());
206 } 270 }
207 271
208 } // namespace extensions 272 } // namespace extensions
OLDNEW
« no previous file with comments | « extensions/common/matcher/substring_set_matcher.h ('k') | no next file » | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698