OLD | NEW |
| (Empty) |
1 // Copyright 2009 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 #include "util/util.h" | |
6 #include "util/flags.h" | |
7 #include "re2/prefilter.h" | |
8 #include "re2/prefilter_tree.h" | |
9 #include "re2/re2.h" | |
10 | |
11 DEFINE_int32(filtered_re2_min_atom_len, | |
12 3, | |
13 "Strings less than this length are not stored as atoms"); | |
14 | |
15 namespace re2 { | |
16 | |
17 PrefilterTree::PrefilterTree() | |
18 : compiled_(false) { | |
19 } | |
20 | |
21 PrefilterTree::~PrefilterTree() { | |
22 for (size_t i = 0; i < prefilter_vec_.size(); i++) | |
23 delete prefilter_vec_[i]; | |
24 | |
25 for (size_t i = 0; i < entries_.size(); i++) | |
26 delete entries_[i].parents; | |
27 } | |
28 | |
29 // Functions used for adding and Compiling prefilters to the | |
30 // PrefilterTree. | |
31 static bool KeepPart(Prefilter* prefilter, int level) { | |
32 if (prefilter == NULL) | |
33 return false; | |
34 | |
35 switch (prefilter->op()) { | |
36 default: | |
37 LOG(DFATAL) << "Unexpected op in KeepPart: " | |
38 << prefilter->op(); | |
39 return false; | |
40 | |
41 case Prefilter::ALL: | |
42 return false; | |
43 | |
44 case Prefilter::ATOM: | |
45 return prefilter->atom().size() >= | |
46 static_cast<size_t>(FLAGS_filtered_re2_min_atom_len); | |
47 | |
48 case Prefilter::AND: { | |
49 int j = 0; | |
50 vector<Prefilter*>* subs = prefilter->subs(); | |
51 for (size_t i = 0; i < subs->size(); i++) | |
52 if (KeepPart((*subs)[i], level + 1)) | |
53 (*subs)[j++] = (*subs)[i]; | |
54 else | |
55 delete (*subs)[i]; | |
56 | |
57 subs->resize(j); | |
58 return j > 0; | |
59 } | |
60 | |
61 case Prefilter::OR: | |
62 for (size_t i = 0; i < prefilter->subs()->size(); i++) | |
63 if (!KeepPart((*prefilter->subs())[i], level + 1)) | |
64 return false; | |
65 return true; | |
66 } | |
67 } | |
68 | |
69 void PrefilterTree::Add(Prefilter *f) { | |
70 if (compiled_) { | |
71 LOG(DFATAL) << "Add after Compile."; | |
72 return; | |
73 } | |
74 if (f != NULL && !KeepPart(f, 0)) { | |
75 delete f; | |
76 f = NULL; | |
77 } | |
78 | |
79 prefilter_vec_.push_back(f); | |
80 } | |
81 | |
82 void PrefilterTree::Compile(vector<string>* atom_vec) { | |
83 if (compiled_) { | |
84 LOG(DFATAL) << "Compile after Compile."; | |
85 return; | |
86 } | |
87 | |
88 // We do this check to support some legacy uses of | |
89 // PrefilterTree that call Compile before adding any regexps, | |
90 // and expect Compile not to have effect. | |
91 if (prefilter_vec_.empty()) | |
92 return; | |
93 | |
94 compiled_ = true; | |
95 | |
96 AssignUniqueIds(atom_vec); | |
97 | |
98 // Identify nodes that are too common among prefilters and are | |
99 // triggering too many parents. Then get rid of them if possible. | |
100 // Note that getting rid of a prefilter node simply means they are | |
101 // no longer necessary for their parent to trigger; that is, we do | |
102 // not miss out on any regexps triggering by getting rid of a | |
103 // prefilter node. | |
104 for (size_t i = 0; i < entries_.size(); i++) { | |
105 StdIntMap* parents = entries_[i].parents; | |
106 if (parents->size() > 8) { | |
107 // This one triggers too many things. If all the parents are AND | |
108 // nodes and have other things guarding them, then get rid of | |
109 // this trigger. TODO(vsri): Adjust the threshold appropriately, | |
110 // make it a function of total number of nodes? | |
111 bool have_other_guard = true; | |
112 for (StdIntMap::iterator it = parents->begin(); | |
113 it != parents->end(); ++it) { | |
114 have_other_guard = have_other_guard && | |
115 (entries_[it->first].propagate_up_at_count > 1); | |
116 } | |
117 | |
118 if (have_other_guard) { | |
119 for (StdIntMap::iterator it = parents->begin(); | |
120 it != parents->end(); ++it) | |
121 entries_[it->first].propagate_up_at_count -= 1; | |
122 | |
123 parents->clear(); // Forget the parents | |
124 } | |
125 } | |
126 } | |
127 | |
128 PrintDebugInfo(); | |
129 } | |
130 | |
131 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { | |
132 string node_string = NodeString(node); | |
133 map<string, Prefilter*>::iterator iter = node_map_.find(node_string); | |
134 if (iter == node_map_.end()) | |
135 return NULL; | |
136 return (*iter).second; | |
137 } | |
138 | |
139 static string Itoa(int n) { | |
140 char buf[100]; | |
141 snprintf(buf, sizeof buf, "%d", n); | |
142 return string(buf); | |
143 } | |
144 | |
145 string PrefilterTree::NodeString(Prefilter* node) const { | |
146 // Adding the operation disambiguates AND/OR/atom nodes. | |
147 string s = Itoa(node->op()) + ":"; | |
148 if (node->op() == Prefilter::ATOM) { | |
149 s += node->atom(); | |
150 } else { | |
151 for (size_t i = 0; i < node->subs()->size(); i++) { | |
152 if (i > 0) | |
153 s += ','; | |
154 s += Itoa((*node->subs())[i]->unique_id()); | |
155 } | |
156 } | |
157 return s; | |
158 } | |
159 | |
160 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { | |
161 atom_vec->clear(); | |
162 | |
163 // Build vector of all filter nodes, sorted topologically | |
164 // from top to bottom in v. | |
165 vector<Prefilter*> v; | |
166 | |
167 // Add the top level nodes of each regexp prefilter. | |
168 for (size_t i = 0; i < prefilter_vec_.size(); i++) { | |
169 Prefilter* f = prefilter_vec_[i]; | |
170 if (f == NULL) | |
171 unfiltered_.push_back(static_cast<int>(i)); | |
172 | |
173 // We push NULL also on to v, so that we maintain the | |
174 // mapping of index==regexpid for level=0 prefilter nodes. | |
175 v.push_back(f); | |
176 } | |
177 | |
178 // Now add all the descendant nodes. | |
179 for (size_t i = 0; i < v.size(); i++) { | |
180 Prefilter* f = v[i]; | |
181 if (f == NULL) | |
182 continue; | |
183 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { | |
184 const vector<Prefilter*>& subs = *f->subs(); | |
185 for (size_t j = 0; j < subs.size(); j++) | |
186 v.push_back(subs[j]); | |
187 } | |
188 } | |
189 | |
190 // Identify unique nodes. | |
191 int unique_id = 0; | |
192 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
193 Prefilter *node = v[i]; | |
194 if (node == NULL) | |
195 continue; | |
196 node->set_unique_id(-1); | |
197 Prefilter* canonical = CanonicalNode(node); | |
198 if (canonical == NULL) { | |
199 // Any further nodes that have the same node string | |
200 // will find this node as the canonical node. | |
201 node_map_[NodeString(node)] = node; | |
202 if (node->op() == Prefilter::ATOM) { | |
203 atom_vec->push_back(node->atom()); | |
204 atom_index_to_id_.push_back(unique_id); | |
205 } | |
206 node->set_unique_id(unique_id++); | |
207 } else { | |
208 node->set_unique_id(canonical->unique_id()); | |
209 } | |
210 } | |
211 entries_.resize(node_map_.size()); | |
212 | |
213 // Create parent StdIntMap for the entries. | |
214 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
215 Prefilter* prefilter = v[i]; | |
216 if (prefilter == NULL) | |
217 continue; | |
218 | |
219 if (CanonicalNode(prefilter) != prefilter) | |
220 continue; | |
221 | |
222 Entry* entry = &entries_[prefilter->unique_id()]; | |
223 entry->parents = new StdIntMap(); | |
224 } | |
225 | |
226 // Fill the entries. | |
227 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
228 Prefilter* prefilter = v[i]; | |
229 if (prefilter == NULL) | |
230 continue; | |
231 | |
232 if (CanonicalNode(prefilter) != prefilter) | |
233 continue; | |
234 | |
235 Entry* entry = &entries_[prefilter->unique_id()]; | |
236 | |
237 switch (prefilter->op()) { | |
238 default: | |
239 case Prefilter::ALL: | |
240 LOG(DFATAL) << "Unexpected op: " << prefilter->op(); | |
241 return; | |
242 | |
243 case Prefilter::ATOM: | |
244 entry->propagate_up_at_count = 1; | |
245 break; | |
246 | |
247 case Prefilter::OR: | |
248 case Prefilter::AND: { | |
249 set<int> uniq_child; | |
250 for (size_t j = 0; j < prefilter->subs()->size(); j++) { | |
251 Prefilter* child = (*prefilter->subs())[j]; | |
252 Prefilter* canonical = CanonicalNode(child); | |
253 if (canonical == NULL) { | |
254 LOG(DFATAL) << "Null canonical node"; | |
255 return; | |
256 } | |
257 int child_id = canonical->unique_id(); | |
258 uniq_child.insert(child_id); | |
259 // To the child, we want to add to parent indices. | |
260 Entry* child_entry = &entries_[child_id]; | |
261 if (child_entry->parents->find(prefilter->unique_id()) == | |
262 child_entry->parents->end()) { | |
263 (*child_entry->parents)[prefilter->unique_id()] = 1; | |
264 } | |
265 } | |
266 entry->propagate_up_at_count = prefilter->op() == Prefilter::AND | |
267 ? static_cast<int>(uniq_child.size()) | |
268 : 1; | |
269 | |
270 break; | |
271 } | |
272 } | |
273 } | |
274 | |
275 // For top level nodes, populate regexp id. | |
276 for (size_t i = 0; i < prefilter_vec_.size(); i++) { | |
277 if (prefilter_vec_[i] == NULL) | |
278 continue; | |
279 int id = CanonicalNode(prefilter_vec_[i])->unique_id(); | |
280 DCHECK_LE(0, id); | |
281 Entry* entry = &entries_[id]; | |
282 entry->regexps.push_back(static_cast<int>(i)); | |
283 } | |
284 } | |
285 | |
286 // Functions for triggering during search. | |
287 void PrefilterTree::RegexpsGivenStrings( | |
288 const vector<int>& matched_atoms, | |
289 vector<int>* regexps) const { | |
290 regexps->clear(); | |
291 if (!compiled_) { | |
292 LOG(WARNING) << "Compile() not called"; | |
293 for (size_t i = 0; i < prefilter_vec_.size(); ++i) | |
294 regexps->push_back(static_cast<int>(i)); | |
295 } else { | |
296 if (!prefilter_vec_.empty()) { | |
297 IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); | |
298 vector<int> matched_atom_ids; | |
299 for (size_t j = 0; j < matched_atoms.size(); j++) { | |
300 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); | |
301 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; | |
302 } | |
303 PropagateMatch(matched_atom_ids, ®exps_map); | |
304 for (IntMap::iterator it = regexps_map.begin(); | |
305 it != regexps_map.end(); | |
306 ++it) | |
307 regexps->push_back(it->index()); | |
308 | |
309 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); | |
310 } | |
311 } | |
312 sort(regexps->begin(), regexps->end()); | |
313 } | |
314 | |
315 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, | |
316 IntMap* regexps) const { | |
317 IntMap count(static_cast<int>(entries_.size())); | |
318 IntMap work(static_cast<int>(entries_.size())); | |
319 for (size_t i = 0; i < atom_ids.size(); i++) | |
320 work.set(atom_ids[i], 1); | |
321 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { | |
322 const Entry& entry = entries_[it->index()]; | |
323 VLOG(10) << "Processing: " << it->index(); | |
324 // Record regexps triggered. | |
325 for (size_t i = 0; i < entry.regexps.size(); i++) { | |
326 VLOG(10) << "Regexp triggered: " << entry.regexps[i]; | |
327 regexps->set(entry.regexps[i], 1); | |
328 } | |
329 int c; | |
330 // Pass trigger up to parents. | |
331 for (StdIntMap::iterator it = entry.parents->begin(); | |
332 it != entry.parents->end(); | |
333 ++it) { | |
334 int j = it->first; | |
335 const Entry& parent = entries_[j]; | |
336 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; | |
337 // Delay until all the children have succeeded. | |
338 if (parent.propagate_up_at_count > 1) { | |
339 if (count.has_index(j)) { | |
340 c = count.get_existing(j) + 1; | |
341 count.set_existing(j, c); | |
342 } else { | |
343 c = 1; | |
344 count.set_new(j, c); | |
345 } | |
346 if (c < parent.propagate_up_at_count) | |
347 continue; | |
348 } | |
349 VLOG(10) << "Triggering: " << j; | |
350 // Trigger the parent. | |
351 work.set(j, 1); | |
352 } | |
353 } | |
354 } | |
355 | |
356 // Debugging help. | |
357 void PrefilterTree::PrintPrefilter(int regexpid) { | |
358 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); | |
359 } | |
360 | |
361 void PrefilterTree::PrintDebugInfo() { | |
362 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); | |
363 VLOG(10) << "#Unique Nodes: " << entries_.size(); | |
364 | |
365 for (size_t i = 0; i < entries_.size(); ++i) { | |
366 StdIntMap* parents = entries_[i].parents; | |
367 const vector<int>& regexps = entries_[i].regexps; | |
368 VLOG(10) << "EntryId: " << i | |
369 << " N: " << parents->size() << " R: " << regexps.size(); | |
370 for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) | |
371 VLOG(10) << it->first; | |
372 } | |
373 VLOG(10) << "Map:"; | |
374 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); | |
375 iter != node_map_.end(); ++iter) | |
376 VLOG(10) << "NodeId: " << (*iter).second->unique_id() | |
377 << " Str: " << (*iter).first; | |
378 } | |
379 | |
380 string PrefilterTree::DebugNodeString(Prefilter* node) const { | |
381 string node_string = ""; | |
382 | |
383 if (node->op() == Prefilter::ATOM) { | |
384 DCHECK(!node->atom().empty()); | |
385 node_string += node->atom(); | |
386 } else { | |
387 // Adding the operation disambiguates AND and OR nodes. | |
388 node_string += node->op() == Prefilter::AND ? "AND" : "OR"; | |
389 node_string += "("; | |
390 for (size_t i = 0; i < node->subs()->size(); i++) { | |
391 if (i > 0) | |
392 node_string += ','; | |
393 node_string += Itoa((*node->subs())[i]->unique_id()); | |
394 node_string += ":"; | |
395 node_string += DebugNodeString((*node->subs())[i]); | |
396 } | |
397 node_string += ")"; | |
398 } | |
399 return node_string; | |
400 } | |
401 | |
402 } // namespace re2 | |
OLD | NEW |