Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(230)

Side by Side Diff: extensions/common/matcher/substring_set_matcher.cc

Issue 14780003: Speed improvements to SubstringSetMatcher (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Created 7 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
1 // Copyright (c) 2012 The Chromium Authors. All rights reserved. 1 // Copyright (c) 2012 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "extensions/common/matcher/substring_set_matcher.h" 5 #include "extensions/common/matcher/substring_set_matcher.h"
6 6
7 #include <algorithm>
7 #include <queue> 8 #include <queue>
8 9
9 #include "base/logging.h" 10 #include "base/logging.h"
10 #include "base/stl_util.h" 11 #include "base/stl_util.h"
11 12
12 namespace extensions { 13 namespace extensions {
13 14
15 namespace {
16
17 // Compare StringPattern instances based on their string patterns.
18 bool ComparePatterns(const StringPattern* a, const StringPattern* b) {
19 return a->pattern() <= b->pattern();
20 }
21
22 // Given the set of patterns, compute how many nodes will the corresponding
23 // Aho-Corasick tree have. Note that |patterns| need to be sorted.
24 unsigned TreeSize(const std::vector<const StringPattern*>& patterns) {
battre 2013/05/03 21:59:12 This should be size_t; also in all other places.
vabr (Chromium) 2013/05/05 23:36:49 On my machine, size_t is 8 bytes, unsigned is only
25 unsigned result = 1u; // 1 for the root node.
26 if (patterns.empty())
27 return result;
28
29 std::vector<const StringPattern*>::const_iterator last = patterns.begin();
30 std::vector<const StringPattern*>::const_iterator current = last + 1;
31 // For the first pattern, each letter is a label of an edge to a new node.
32 result += (*last)->pattern().size();
33
34 // For the subsequent patterns, only count the edges which were not counted
35 // yet. For this it suffices to test against the previous pattern, because the
36 // patterns are sorted.
37 for (; current != patterns.end(); ++last, ++current) {
38 const std::string& last_s = (*last)->pattern();
battre 2013/05/03 21:59:12 what does "_s" stand for? Can you pick a name with
vabr (Chromium) 2013/05/05 23:36:49 Done.
39 const std::string& current_s = (*current)->pattern();
40 const unsigned prefix_bound = std::min(last_s.size(), current_s.size());
41
42 unsigned common_prefix = 0;
43 while (common_prefix < prefix_bound &&
44 last_s[common_prefix] == current_s[common_prefix])
45 ++common_prefix;
46 result += current_s.size() - common_prefix;
47 }
48 return result;
49 }
50
51 } // namespace
52
14 // 53 //
15 // SubstringSetMatcher 54 // SubstringSetMatcher
16 // 55 //
17 56
18 SubstringSetMatcher::SubstringSetMatcher() { 57 SubstringSetMatcher::SubstringSetMatcher() {
19 RebuildAhoCorasickTree(); 58 RebuildAhoCorasickTree();
20 } 59 }
21 60
22 SubstringSetMatcher::~SubstringSetMatcher() {} 61 SubstringSetMatcher::~SubstringSetMatcher() {}
23 62
(...skipping 18 matching lines...) Expand all
42 DCHECK(patterns_.find((*i)->id()) == patterns_.end()); 81 DCHECK(patterns_.find((*i)->id()) == patterns_.end());
43 patterns_[(*i)->id()] = *i; 82 patterns_[(*i)->id()] = *i;
44 } 83 }
45 84
46 // Unregister patterns 85 // Unregister patterns
47 for (std::vector<const StringPattern*>::const_iterator i = 86 for (std::vector<const StringPattern*>::const_iterator i =
48 to_unregister.begin(); i != to_unregister.end(); ++i) { 87 to_unregister.begin(); i != to_unregister.end(); ++i) {
49 patterns_.erase((*i)->id()); 88 patterns_.erase((*i)->id());
50 } 89 }
51 90
91 // We compute the total number of tree nodes needed by using this vector.
92 std::vector<const StringPattern*> sorted_patterns(patterns_.size());
93 // First, fill it from |patterns_|.
94 size_t next = 0;
95 for (SubstringPatternSet::const_iterator i = patterns_.begin();
96 i != patterns_.end();
97 ++i, ++next) {
98 sorted_patterns[next] = i->second;
99 }
100
101 std::sort(sorted_patterns.begin(), sorted_patterns.end(), ComparePatterns);
102 tree_.reserve(TreeSize(sorted_patterns));
103
52 RebuildAhoCorasickTree(); 104 RebuildAhoCorasickTree();
53 } 105 }
54 106
55 bool SubstringSetMatcher::Match(const std::string& text, 107 bool SubstringSetMatcher::Match(const std::string& text,
56 std::set<StringPattern::ID>* matches) const { 108 std::set<StringPattern::ID>* matches) const {
57 size_t old_number_of_matches = matches->size(); 109 const size_t old_number_of_matches = matches->size();
58 110
59 // Handle patterns matching the empty string. 111 // Handle patterns matching the empty string.
60 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end()); 112 matches->insert(tree_[0].matches().begin(), tree_[0].matches().end());
61 113
62 int current_node = 0; 114 unsigned current_node = 0;
63 size_t text_length = text.length(); 115 for (std::string::const_iterator i = text.begin(); i != text.end(); ++i) {
64 for (size_t i = 0; i < text_length; ++i) { 116 unsigned edge_from_current = tree_[current_node].GetEdge(*i);
65 while (!tree_[current_node].HasEdge(text[i]) && current_node != 0) 117 while (edge_from_current == AhoCorasickNode::kInvalidIndex &&
118 current_node != 0) {
66 current_node = tree_[current_node].failure(); 119 current_node = tree_[current_node].failure();
67 if (tree_[current_node].HasEdge(text[i])) { 120 edge_from_current = tree_[current_node].GetEdge(*i);
68 current_node = tree_[current_node].GetEdge(text[i]); 121 }
122 if (edge_from_current != AhoCorasickNode::kInvalidIndex) {
123 current_node = edge_from_current;
69 matches->insert(tree_[current_node].matches().begin(), 124 matches->insert(tree_[current_node].matches().begin(),
70 tree_[current_node].matches().end()); 125 tree_[current_node].matches().end());
71 } else { 126 } else {
72 DCHECK_EQ(0, current_node); 127 DCHECK_EQ(0u, current_node);
73 } 128 }
74 } 129 }
75 130
76 return old_number_of_matches != matches->size(); 131 return old_number_of_matches != matches->size();
77 } 132 }
78 133
79 bool SubstringSetMatcher::IsEmpty() const { 134 bool SubstringSetMatcher::IsEmpty() const {
80 // An empty tree consists of only the root node. 135 // An empty tree consists of only the root node.
81 return patterns_.empty() && tree_.size() == 1u; 136 return patterns_.empty() && tree_.size() == 1u;
82 } 137 }
(...skipping 11 matching lines...) Expand all
94 i != patterns_.end(); ++i) { 149 i != patterns_.end(); ++i) {
95 InsertPatternIntoAhoCorasickTree(i->second); 150 InsertPatternIntoAhoCorasickTree(i->second);
96 } 151 }
97 152
98 CreateFailureEdges(); 153 CreateFailureEdges();
99 } 154 }
100 155
101 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree( 156 void SubstringSetMatcher::InsertPatternIntoAhoCorasickTree(
102 const StringPattern* pattern) { 157 const StringPattern* pattern) {
103 const std::string& text = pattern->pattern(); 158 const std::string& text = pattern->pattern();
104 size_t text_length = text.length();
105 159
106 // Iterators on the tree and the text. 160 // Iterators on the tree and the text.
107 int current_node = 0; 161 unsigned current_node = 0;
108 size_t text_pos = 0; 162 std::string::const_iterator i = text.begin();
109 163
110 // Follow existing paths for as long as possible. 164 // Follow existing paths for as long as possible.
111 while (text_pos < text_length && 165 unsigned edge_from_current = tree_[current_node].GetEdge(*i);
112 tree_[current_node].HasEdge(text[text_pos])) { 166 while (i != text.end() &&
battre 2013/05/03 21:59:12 text is immutable. Should we have a const std::str
vabr (Chromium) 2013/05/05 23:36:49 It does not seem to make a difference performance-
113 current_node = tree_[current_node].GetEdge(text[text_pos]); 167 edge_from_current != AhoCorasickNode::kInvalidIndex) {
114 ++text_pos; 168 current_node = edge_from_current;
169 ++i;
170 edge_from_current = tree_[current_node].GetEdge(*i);
115 } 171 }
116 172
117 // Create new nodes if necessary. 173 // Create new nodes if necessary.
118 while (text_pos < text_length) { 174 while (i != text.end()) {
119 AhoCorasickNode new_node; 175 tree_.push_back(AhoCorasickNode());
120 tree_.push_back(new_node); 176 tree_[current_node].SetEdge(*i, tree_.size() - 1);
121 tree_[current_node].SetEdge(text[text_pos], tree_.size() - 1);
122 current_node = tree_.size() - 1; 177 current_node = tree_.size() - 1;
123 ++text_pos; 178 ++i;
124 } 179 }
125 180
126 // Register match. 181 // Register match.
127 tree_[current_node].AddMatch(pattern->id()); 182 tree_[current_node].AddMatch(pattern->id());
128 } 183 }
129 184
130 void SubstringSetMatcher::CreateFailureEdges() { 185 void SubstringSetMatcher::CreateFailureEdges() {
131 typedef AhoCorasickNode::Edges Edges; 186 typedef AhoCorasickNode::Edges Edges;
132 187
133 std::queue<int> queue; 188 std::queue<unsigned> queue;
134 189
135 AhoCorasickNode& root = tree_[0]; 190 AhoCorasickNode& root = tree_[0];
136 root.set_failure(0); 191 root.set_failure(0);
137 const AhoCorasickNode::Edges& root_edges = root.edges(); 192 const Edges& root_edges = root.edges();
138 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end(); 193 for (Edges::const_iterator e = root_edges.begin(); e != root_edges.end();
139 ++e) { 194 ++e) {
140 int leads_to = e->second; 195 const unsigned& leads_to = e->second;
vabr (Chromium) 2013/05/03 20:37:27 I know this is uncommon. However, this method is a
battre 2013/05/03 21:59:12 Have you checked what happens if you used "const i
vabr (Chromium) 2013/05/05 23:36:49 Are you asking about removing the reference, or mo
141 tree_[leads_to].set_failure(0); 196 tree_[leads_to].set_failure(0);
142 queue.push(leads_to); 197 queue.push(leads_to);
143 } 198 }
144 199
145 while (!queue.empty()) { 200 while (!queue.empty()) {
146 AhoCorasickNode& current_node = tree_[queue.front()]; 201 AhoCorasickNode& current_node = tree_[queue.front()];
147 queue.pop(); 202 queue.pop();
148 for (Edges::const_iterator e = current_node.edges().begin(); 203 for (Edges::const_iterator e = current_node.edges().begin();
149 e != current_node.edges().end(); ++e) { 204 e != current_node.edges().end(); ++e) {
150 char edge_label = e->first; 205 const char& edge_label = e->first;
151 int leads_to = e->second; 206 const unsigned& leads_to = e->second;
152 queue.push(leads_to); 207 queue.push(leads_to);
153 208
154 int failure = current_node.failure(); 209 unsigned failure = current_node.failure();
155 while (!tree_[failure].HasEdge(edge_label) && failure != 0) 210 unsigned edge_from_failure = tree_[failure].GetEdge(edge_label);
211 while (edge_from_failure == AhoCorasickNode::kInvalidIndex &&
212 failure != 0) {
156 failure = tree_[failure].failure(); 213 failure = tree_[failure].failure();
214 edge_from_failure = tree_[failure].GetEdge(edge_label);
215 }
157 216
158 int follow_in_case_of_failure = tree_[failure].HasEdge(e->first) ? 217 const unsigned follow_in_case_of_failure =
159 tree_[failure].GetEdge(edge_label) : 0; 218 edge_from_failure != AhoCorasickNode::kInvalidIndex
219 ? edge_from_failure
220 : 0;
160 tree_[leads_to].set_failure(follow_in_case_of_failure); 221 tree_[leads_to].set_failure(follow_in_case_of_failure);
161 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches()); 222 tree_[leads_to].AddMatches(tree_[follow_in_case_of_failure].matches());
162 } 223 }
163 } 224 }
164 } 225 }
165 226
166 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode() 227 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode()
167 : failure_(-1) {} 228 : failure_(kInvalidIndex) {}
168 229
169 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {} 230 SubstringSetMatcher::AhoCorasickNode::~AhoCorasickNode() {}
170 231
171 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode( 232 SubstringSetMatcher::AhoCorasickNode::AhoCorasickNode(
172 const SubstringSetMatcher::AhoCorasickNode& other) 233 const SubstringSetMatcher::AhoCorasickNode& other)
173 : edges_(other.edges_), 234 : edges_(other.edges_),
174 failure_(other.failure_), 235 failure_(other.failure_),
175 matches_(other.matches_) {} 236 matches_(other.matches_) {}
176 237
177 SubstringSetMatcher::AhoCorasickNode& 238 SubstringSetMatcher::AhoCorasickNode&
178 SubstringSetMatcher::AhoCorasickNode::operator=( 239 SubstringSetMatcher::AhoCorasickNode::operator=(
179 const SubstringSetMatcher::AhoCorasickNode& other) { 240 const SubstringSetMatcher::AhoCorasickNode& other) {
180 edges_ = other.edges_; 241 edges_ = other.edges_;
181 failure_ = other.failure_; 242 failure_ = other.failure_;
182 matches_ = other.matches_; 243 matches_ = other.matches_;
183 return *this; 244 return *this;
184 } 245 }
185 246
186 bool SubstringSetMatcher::AhoCorasickNode::HasEdge(char c) const { 247 unsigned SubstringSetMatcher::AhoCorasickNode::GetEdge(
187 return edges_.find(c) != edges_.end(); 248 char c) const {
249 Edges::const_iterator i = edges_.find(c);
250 return i == edges_.end() ? kInvalidIndex : i->second;
188 } 251 }
189 252
190 int SubstringSetMatcher::AhoCorasickNode::GetEdge(char c) const { 253 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, unsigned node) {
191 std::map<char, int>::const_iterator i = edges_.find(c);
192 return i == edges_.end() ? -1 : i->second;
193 }
194
195 void SubstringSetMatcher::AhoCorasickNode::SetEdge(char c, int node) {
196 edges_[c] = node; 254 edges_[c] = node;
197 } 255 }
198 256
199 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) { 257 void SubstringSetMatcher::AhoCorasickNode::AddMatch(StringPattern::ID id) {
200 matches_.insert(id); 258 matches_.insert(id);
201 } 259 }
202 260
203 void SubstringSetMatcher::AhoCorasickNode::AddMatches( 261 void SubstringSetMatcher::AhoCorasickNode::AddMatches(
204 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) { 262 const SubstringSetMatcher::AhoCorasickNode::Matches& matches) {
205 matches_.insert(matches.begin(), matches.end()); 263 matches_.insert(matches.begin(), matches.end());
206 } 264 }
207 265
208 } // namespace extensions 266 } // namespace extensions
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698