Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(43)

Side by Side Diff: third_party/re2/re2/prefilter_tree.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/prefilter_tree.h ('k') | third_party/re2/re2/prog.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/util.h"
6 #include "util/flags.h"
7 #include "re2/prefilter.h"
8 #include "re2/prefilter_tree.h"
9 #include "re2/re2.h"
10
11 DEFINE_int32(filtered_re2_min_atom_len,
12 3,
13 "Strings less than this length are not stored as atoms");
14
15 namespace re2 {
16
17 PrefilterTree::PrefilterTree()
18 : compiled_(false) {
19 }
20
21 PrefilterTree::~PrefilterTree() {
22 for (size_t i = 0; i < prefilter_vec_.size(); i++)
23 delete prefilter_vec_[i];
24
25 for (size_t i = 0; i < entries_.size(); i++)
26 delete entries_[i].parents;
27 }
28
29 // Functions used for adding and Compiling prefilters to the
30 // PrefilterTree.
31 static bool KeepPart(Prefilter* prefilter, int level) {
32 if (prefilter == NULL)
33 return false;
34
35 switch (prefilter->op()) {
36 default:
37 LOG(DFATAL) << "Unexpected op in KeepPart: "
38 << prefilter->op();
39 return false;
40
41 case Prefilter::ALL:
42 return false;
43
44 case Prefilter::ATOM:
45 return prefilter->atom().size() >=
46 static_cast<size_t>(FLAGS_filtered_re2_min_atom_len);
47
48 case Prefilter::AND: {
49 int j = 0;
50 vector<Prefilter*>* subs = prefilter->subs();
51 for (size_t i = 0; i < subs->size(); i++)
52 if (KeepPart((*subs)[i], level + 1))
53 (*subs)[j++] = (*subs)[i];
54 else
55 delete (*subs)[i];
56
57 subs->resize(j);
58 return j > 0;
59 }
60
61 case Prefilter::OR:
62 for (size_t i = 0; i < prefilter->subs()->size(); i++)
63 if (!KeepPart((*prefilter->subs())[i], level + 1))
64 return false;
65 return true;
66 }
67 }
68
69 void PrefilterTree::Add(Prefilter *f) {
70 if (compiled_) {
71 LOG(DFATAL) << "Add after Compile.";
72 return;
73 }
74 if (f != NULL && !KeepPart(f, 0)) {
75 delete f;
76 f = NULL;
77 }
78
79 prefilter_vec_.push_back(f);
80 }
81
82 void PrefilterTree::Compile(vector<string>* atom_vec) {
83 if (compiled_) {
84 LOG(DFATAL) << "Compile after Compile.";
85 return;
86 }
87
88 // We do this check to support some legacy uses of
89 // PrefilterTree that call Compile before adding any regexps,
90 // and expect Compile not to have effect.
91 if (prefilter_vec_.empty())
92 return;
93
94 compiled_ = true;
95
96 AssignUniqueIds(atom_vec);
97
98 // Identify nodes that are too common among prefilters and are
99 // triggering too many parents. Then get rid of them if possible.
100 // Note that getting rid of a prefilter node simply means they are
101 // no longer necessary for their parent to trigger; that is, we do
102 // not miss out on any regexps triggering by getting rid of a
103 // prefilter node.
104 for (size_t i = 0; i < entries_.size(); i++) {
105 StdIntMap* parents = entries_[i].parents;
106 if (parents->size() > 8) {
107 // This one triggers too many things. If all the parents are AND
108 // nodes and have other things guarding them, then get rid of
109 // this trigger. TODO(vsri): Adjust the threshold appropriately,
110 // make it a function of total number of nodes?
111 bool have_other_guard = true;
112 for (StdIntMap::iterator it = parents->begin();
113 it != parents->end(); ++it) {
114 have_other_guard = have_other_guard &&
115 (entries_[it->first].propagate_up_at_count > 1);
116 }
117
118 if (have_other_guard) {
119 for (StdIntMap::iterator it = parents->begin();
120 it != parents->end(); ++it)
121 entries_[it->first].propagate_up_at_count -= 1;
122
123 parents->clear(); // Forget the parents
124 }
125 }
126 }
127
128 PrintDebugInfo();
129 }
130
131 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) {
132 string node_string = NodeString(node);
133 map<string, Prefilter*>::iterator iter = node_map_.find(node_string);
134 if (iter == node_map_.end())
135 return NULL;
136 return (*iter).second;
137 }
138
139 static string Itoa(int n) {
140 char buf[100];
141 snprintf(buf, sizeof buf, "%d", n);
142 return string(buf);
143 }
144
145 string PrefilterTree::NodeString(Prefilter* node) const {
146 // Adding the operation disambiguates AND/OR/atom nodes.
147 string s = Itoa(node->op()) + ":";
148 if (node->op() == Prefilter::ATOM) {
149 s += node->atom();
150 } else {
151 for (size_t i = 0; i < node->subs()->size(); i++) {
152 if (i > 0)
153 s += ',';
154 s += Itoa((*node->subs())[i]->unique_id());
155 }
156 }
157 return s;
158 }
159
160 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) {
161 atom_vec->clear();
162
163 // Build vector of all filter nodes, sorted topologically
164 // from top to bottom in v.
165 vector<Prefilter*> v;
166
167 // Add the top level nodes of each regexp prefilter.
168 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
169 Prefilter* f = prefilter_vec_[i];
170 if (f == NULL)
171 unfiltered_.push_back(static_cast<int>(i));
172
173 // We push NULL also on to v, so that we maintain the
174 // mapping of index==regexpid for level=0 prefilter nodes.
175 v.push_back(f);
176 }
177
178 // Now add all the descendant nodes.
179 for (size_t i = 0; i < v.size(); i++) {
180 Prefilter* f = v[i];
181 if (f == NULL)
182 continue;
183 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) {
184 const vector<Prefilter*>& subs = *f->subs();
185 for (size_t j = 0; j < subs.size(); j++)
186 v.push_back(subs[j]);
187 }
188 }
189
190 // Identify unique nodes.
191 int unique_id = 0;
192 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
193 Prefilter *node = v[i];
194 if (node == NULL)
195 continue;
196 node->set_unique_id(-1);
197 Prefilter* canonical = CanonicalNode(node);
198 if (canonical == NULL) {
199 // Any further nodes that have the same node string
200 // will find this node as the canonical node.
201 node_map_[NodeString(node)] = node;
202 if (node->op() == Prefilter::ATOM) {
203 atom_vec->push_back(node->atom());
204 atom_index_to_id_.push_back(unique_id);
205 }
206 node->set_unique_id(unique_id++);
207 } else {
208 node->set_unique_id(canonical->unique_id());
209 }
210 }
211 entries_.resize(node_map_.size());
212
213 // Create parent StdIntMap for the entries.
214 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
215 Prefilter* prefilter = v[i];
216 if (prefilter == NULL)
217 continue;
218
219 if (CanonicalNode(prefilter) != prefilter)
220 continue;
221
222 Entry* entry = &entries_[prefilter->unique_id()];
223 entry->parents = new StdIntMap();
224 }
225
226 // Fill the entries.
227 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) {
228 Prefilter* prefilter = v[i];
229 if (prefilter == NULL)
230 continue;
231
232 if (CanonicalNode(prefilter) != prefilter)
233 continue;
234
235 Entry* entry = &entries_[prefilter->unique_id()];
236
237 switch (prefilter->op()) {
238 default:
239 case Prefilter::ALL:
240 LOG(DFATAL) << "Unexpected op: " << prefilter->op();
241 return;
242
243 case Prefilter::ATOM:
244 entry->propagate_up_at_count = 1;
245 break;
246
247 case Prefilter::OR:
248 case Prefilter::AND: {
249 set<int> uniq_child;
250 for (size_t j = 0; j < prefilter->subs()->size(); j++) {
251 Prefilter* child = (*prefilter->subs())[j];
252 Prefilter* canonical = CanonicalNode(child);
253 if (canonical == NULL) {
254 LOG(DFATAL) << "Null canonical node";
255 return;
256 }
257 int child_id = canonical->unique_id();
258 uniq_child.insert(child_id);
259 // To the child, we want to add to parent indices.
260 Entry* child_entry = &entries_[child_id];
261 if (child_entry->parents->find(prefilter->unique_id()) ==
262 child_entry->parents->end()) {
263 (*child_entry->parents)[prefilter->unique_id()] = 1;
264 }
265 }
266 entry->propagate_up_at_count = prefilter->op() == Prefilter::AND
267 ? static_cast<int>(uniq_child.size())
268 : 1;
269
270 break;
271 }
272 }
273 }
274
275 // For top level nodes, populate regexp id.
276 for (size_t i = 0; i < prefilter_vec_.size(); i++) {
277 if (prefilter_vec_[i] == NULL)
278 continue;
279 int id = CanonicalNode(prefilter_vec_[i])->unique_id();
280 DCHECK_LE(0, id);
281 Entry* entry = &entries_[id];
282 entry->regexps.push_back(static_cast<int>(i));
283 }
284 }
285
286 // Functions for triggering during search.
287 void PrefilterTree::RegexpsGivenStrings(
288 const vector<int>& matched_atoms,
289 vector<int>* regexps) const {
290 regexps->clear();
291 if (!compiled_) {
292 LOG(WARNING) << "Compile() not called";
293 for (size_t i = 0; i < prefilter_vec_.size(); ++i)
294 regexps->push_back(static_cast<int>(i));
295 } else {
296 if (!prefilter_vec_.empty()) {
297 IntMap regexps_map(static_cast<int>(prefilter_vec_.size()));
298 vector<int> matched_atom_ids;
299 for (size_t j = 0; j < matched_atoms.size(); j++) {
300 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]);
301 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]];
302 }
303 PropagateMatch(matched_atom_ids, &regexps_map);
304 for (IntMap::iterator it = regexps_map.begin();
305 it != regexps_map.end();
306 ++it)
307 regexps->push_back(it->index());
308
309 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end());
310 }
311 }
312 sort(regexps->begin(), regexps->end());
313 }
314
315 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids,
316 IntMap* regexps) const {
317 IntMap count(static_cast<int>(entries_.size()));
318 IntMap work(static_cast<int>(entries_.size()));
319 for (size_t i = 0; i < atom_ids.size(); i++)
320 work.set(atom_ids[i], 1);
321 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) {
322 const Entry& entry = entries_[it->index()];
323 VLOG(10) << "Processing: " << it->index();
324 // Record regexps triggered.
325 for (size_t i = 0; i < entry.regexps.size(); i++) {
326 VLOG(10) << "Regexp triggered: " << entry.regexps[i];
327 regexps->set(entry.regexps[i], 1);
328 }
329 int c;
330 // Pass trigger up to parents.
331 for (StdIntMap::iterator it = entry.parents->begin();
332 it != entry.parents->end();
333 ++it) {
334 int j = it->first;
335 const Entry& parent = entries_[j];
336 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count;
337 // Delay until all the children have succeeded.
338 if (parent.propagate_up_at_count > 1) {
339 if (count.has_index(j)) {
340 c = count.get_existing(j) + 1;
341 count.set_existing(j, c);
342 } else {
343 c = 1;
344 count.set_new(j, c);
345 }
346 if (c < parent.propagate_up_at_count)
347 continue;
348 }
349 VLOG(10) << "Triggering: " << j;
350 // Trigger the parent.
351 work.set(j, 1);
352 }
353 }
354 }
355
356 // Debugging help.
357 void PrefilterTree::PrintPrefilter(int regexpid) {
358 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]);
359 }
360
361 void PrefilterTree::PrintDebugInfo() {
362 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size();
363 VLOG(10) << "#Unique Nodes: " << entries_.size();
364
365 for (size_t i = 0; i < entries_.size(); ++i) {
366 StdIntMap* parents = entries_[i].parents;
367 const vector<int>& regexps = entries_[i].regexps;
368 VLOG(10) << "EntryId: " << i
369 << " N: " << parents->size() << " R: " << regexps.size();
370 for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it)
371 VLOG(10) << it->first;
372 }
373 VLOG(10) << "Map:";
374 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin();
375 iter != node_map_.end(); ++iter)
376 VLOG(10) << "NodeId: " << (*iter).second->unique_id()
377 << " Str: " << (*iter).first;
378 }
379
380 string PrefilterTree::DebugNodeString(Prefilter* node) const {
381 string node_string = "";
382
383 if (node->op() == Prefilter::ATOM) {
384 DCHECK(!node->atom().empty());
385 node_string += node->atom();
386 } else {
387 // Adding the operation disambiguates AND and OR nodes.
388 node_string += node->op() == Prefilter::AND ? "AND" : "OR";
389 node_string += "(";
390 for (size_t i = 0; i < node->subs()->size(); i++) {
391 if (i > 0)
392 node_string += ',';
393 node_string += Itoa((*node->subs())[i]->unique_id());
394 node_string += ":";
395 node_string += DebugNodeString((*node->subs())[i]);
396 }
397 node_string += ")";
398 }
399 return node_string;
400 }
401
402 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/prefilter_tree.h ('k') | third_party/re2/re2/prog.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698