| OLD | NEW |
| (Empty) |
| 1 // Copyright 2009 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 #include "util/util.h" | |
| 6 #include "util/flags.h" | |
| 7 #include "re2/prefilter.h" | |
| 8 #include "re2/prefilter_tree.h" | |
| 9 #include "re2/re2.h" | |
| 10 | |
| 11 DEFINE_int32(filtered_re2_min_atom_len, | |
| 12 3, | |
| 13 "Strings less than this length are not stored as atoms"); | |
| 14 | |
| 15 namespace re2 { | |
| 16 | |
| 17 PrefilterTree::PrefilterTree() | |
| 18 : compiled_(false) { | |
| 19 } | |
| 20 | |
| 21 PrefilterTree::~PrefilterTree() { | |
| 22 for (size_t i = 0; i < prefilter_vec_.size(); i++) | |
| 23 delete prefilter_vec_[i]; | |
| 24 | |
| 25 for (size_t i = 0; i < entries_.size(); i++) | |
| 26 delete entries_[i].parents; | |
| 27 } | |
| 28 | |
| 29 // Functions used for adding and Compiling prefilters to the | |
| 30 // PrefilterTree. | |
| 31 static bool KeepPart(Prefilter* prefilter, int level) { | |
| 32 if (prefilter == NULL) | |
| 33 return false; | |
| 34 | |
| 35 switch (prefilter->op()) { | |
| 36 default: | |
| 37 LOG(DFATAL) << "Unexpected op in KeepPart: " | |
| 38 << prefilter->op(); | |
| 39 return false; | |
| 40 | |
| 41 case Prefilter::ALL: | |
| 42 return false; | |
| 43 | |
| 44 case Prefilter::ATOM: | |
| 45 return prefilter->atom().size() >= | |
| 46 static_cast<size_t>(FLAGS_filtered_re2_min_atom_len); | |
| 47 | |
| 48 case Prefilter::AND: { | |
| 49 int j = 0; | |
| 50 vector<Prefilter*>* subs = prefilter->subs(); | |
| 51 for (size_t i = 0; i < subs->size(); i++) | |
| 52 if (KeepPart((*subs)[i], level + 1)) | |
| 53 (*subs)[j++] = (*subs)[i]; | |
| 54 else | |
| 55 delete (*subs)[i]; | |
| 56 | |
| 57 subs->resize(j); | |
| 58 return j > 0; | |
| 59 } | |
| 60 | |
| 61 case Prefilter::OR: | |
| 62 for (size_t i = 0; i < prefilter->subs()->size(); i++) | |
| 63 if (!KeepPart((*prefilter->subs())[i], level + 1)) | |
| 64 return false; | |
| 65 return true; | |
| 66 } | |
| 67 } | |
| 68 | |
| 69 void PrefilterTree::Add(Prefilter *f) { | |
| 70 if (compiled_) { | |
| 71 LOG(DFATAL) << "Add after Compile."; | |
| 72 return; | |
| 73 } | |
| 74 if (f != NULL && !KeepPart(f, 0)) { | |
| 75 delete f; | |
| 76 f = NULL; | |
| 77 } | |
| 78 | |
| 79 prefilter_vec_.push_back(f); | |
| 80 } | |
| 81 | |
| 82 void PrefilterTree::Compile(vector<string>* atom_vec) { | |
| 83 if (compiled_) { | |
| 84 LOG(DFATAL) << "Compile after Compile."; | |
| 85 return; | |
| 86 } | |
| 87 | |
| 88 // We do this check to support some legacy uses of | |
| 89 // PrefilterTree that call Compile before adding any regexps, | |
| 90 // and expect Compile not to have effect. | |
| 91 if (prefilter_vec_.empty()) | |
| 92 return; | |
| 93 | |
| 94 compiled_ = true; | |
| 95 | |
| 96 AssignUniqueIds(atom_vec); | |
| 97 | |
| 98 // Identify nodes that are too common among prefilters and are | |
| 99 // triggering too many parents. Then get rid of them if possible. | |
| 100 // Note that getting rid of a prefilter node simply means they are | |
| 101 // no longer necessary for their parent to trigger; that is, we do | |
| 102 // not miss out on any regexps triggering by getting rid of a | |
| 103 // prefilter node. | |
| 104 for (size_t i = 0; i < entries_.size(); i++) { | |
| 105 StdIntMap* parents = entries_[i].parents; | |
| 106 if (parents->size() > 8) { | |
| 107 // This one triggers too many things. If all the parents are AND | |
| 108 // nodes and have other things guarding them, then get rid of | |
| 109 // this trigger. TODO(vsri): Adjust the threshold appropriately, | |
| 110 // make it a function of total number of nodes? | |
| 111 bool have_other_guard = true; | |
| 112 for (StdIntMap::iterator it = parents->begin(); | |
| 113 it != parents->end(); ++it) { | |
| 114 have_other_guard = have_other_guard && | |
| 115 (entries_[it->first].propagate_up_at_count > 1); | |
| 116 } | |
| 117 | |
| 118 if (have_other_guard) { | |
| 119 for (StdIntMap::iterator it = parents->begin(); | |
| 120 it != parents->end(); ++it) | |
| 121 entries_[it->first].propagate_up_at_count -= 1; | |
| 122 | |
| 123 parents->clear(); // Forget the parents | |
| 124 } | |
| 125 } | |
| 126 } | |
| 127 | |
| 128 PrintDebugInfo(); | |
| 129 } | |
| 130 | |
| 131 Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { | |
| 132 string node_string = NodeString(node); | |
| 133 map<string, Prefilter*>::iterator iter = node_map_.find(node_string); | |
| 134 if (iter == node_map_.end()) | |
| 135 return NULL; | |
| 136 return (*iter).second; | |
| 137 } | |
| 138 | |
| 139 static string Itoa(int n) { | |
| 140 char buf[100]; | |
| 141 snprintf(buf, sizeof buf, "%d", n); | |
| 142 return string(buf); | |
| 143 } | |
| 144 | |
| 145 string PrefilterTree::NodeString(Prefilter* node) const { | |
| 146 // Adding the operation disambiguates AND/OR/atom nodes. | |
| 147 string s = Itoa(node->op()) + ":"; | |
| 148 if (node->op() == Prefilter::ATOM) { | |
| 149 s += node->atom(); | |
| 150 } else { | |
| 151 for (size_t i = 0; i < node->subs()->size(); i++) { | |
| 152 if (i > 0) | |
| 153 s += ','; | |
| 154 s += Itoa((*node->subs())[i]->unique_id()); | |
| 155 } | |
| 156 } | |
| 157 return s; | |
| 158 } | |
| 159 | |
| 160 void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { | |
| 161 atom_vec->clear(); | |
| 162 | |
| 163 // Build vector of all filter nodes, sorted topologically | |
| 164 // from top to bottom in v. | |
| 165 vector<Prefilter*> v; | |
| 166 | |
| 167 // Add the top level nodes of each regexp prefilter. | |
| 168 for (size_t i = 0; i < prefilter_vec_.size(); i++) { | |
| 169 Prefilter* f = prefilter_vec_[i]; | |
| 170 if (f == NULL) | |
| 171 unfiltered_.push_back(static_cast<int>(i)); | |
| 172 | |
| 173 // We push NULL also on to v, so that we maintain the | |
| 174 // mapping of index==regexpid for level=0 prefilter nodes. | |
| 175 v.push_back(f); | |
| 176 } | |
| 177 | |
| 178 // Now add all the descendant nodes. | |
| 179 for (size_t i = 0; i < v.size(); i++) { | |
| 180 Prefilter* f = v[i]; | |
| 181 if (f == NULL) | |
| 182 continue; | |
| 183 if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { | |
| 184 const vector<Prefilter*>& subs = *f->subs(); | |
| 185 for (size_t j = 0; j < subs.size(); j++) | |
| 186 v.push_back(subs[j]); | |
| 187 } | |
| 188 } | |
| 189 | |
| 190 // Identify unique nodes. | |
| 191 int unique_id = 0; | |
| 192 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
| 193 Prefilter *node = v[i]; | |
| 194 if (node == NULL) | |
| 195 continue; | |
| 196 node->set_unique_id(-1); | |
| 197 Prefilter* canonical = CanonicalNode(node); | |
| 198 if (canonical == NULL) { | |
| 199 // Any further nodes that have the same node string | |
| 200 // will find this node as the canonical node. | |
| 201 node_map_[NodeString(node)] = node; | |
| 202 if (node->op() == Prefilter::ATOM) { | |
| 203 atom_vec->push_back(node->atom()); | |
| 204 atom_index_to_id_.push_back(unique_id); | |
| 205 } | |
| 206 node->set_unique_id(unique_id++); | |
| 207 } else { | |
| 208 node->set_unique_id(canonical->unique_id()); | |
| 209 } | |
| 210 } | |
| 211 entries_.resize(node_map_.size()); | |
| 212 | |
| 213 // Create parent StdIntMap for the entries. | |
| 214 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
| 215 Prefilter* prefilter = v[i]; | |
| 216 if (prefilter == NULL) | |
| 217 continue; | |
| 218 | |
| 219 if (CanonicalNode(prefilter) != prefilter) | |
| 220 continue; | |
| 221 | |
| 222 Entry* entry = &entries_[prefilter->unique_id()]; | |
| 223 entry->parents = new StdIntMap(); | |
| 224 } | |
| 225 | |
| 226 // Fill the entries. | |
| 227 for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { | |
| 228 Prefilter* prefilter = v[i]; | |
| 229 if (prefilter == NULL) | |
| 230 continue; | |
| 231 | |
| 232 if (CanonicalNode(prefilter) != prefilter) | |
| 233 continue; | |
| 234 | |
| 235 Entry* entry = &entries_[prefilter->unique_id()]; | |
| 236 | |
| 237 switch (prefilter->op()) { | |
| 238 default: | |
| 239 case Prefilter::ALL: | |
| 240 LOG(DFATAL) << "Unexpected op: " << prefilter->op(); | |
| 241 return; | |
| 242 | |
| 243 case Prefilter::ATOM: | |
| 244 entry->propagate_up_at_count = 1; | |
| 245 break; | |
| 246 | |
| 247 case Prefilter::OR: | |
| 248 case Prefilter::AND: { | |
| 249 set<int> uniq_child; | |
| 250 for (size_t j = 0; j < prefilter->subs()->size(); j++) { | |
| 251 Prefilter* child = (*prefilter->subs())[j]; | |
| 252 Prefilter* canonical = CanonicalNode(child); | |
| 253 if (canonical == NULL) { | |
| 254 LOG(DFATAL) << "Null canonical node"; | |
| 255 return; | |
| 256 } | |
| 257 int child_id = canonical->unique_id(); | |
| 258 uniq_child.insert(child_id); | |
| 259 // To the child, we want to add to parent indices. | |
| 260 Entry* child_entry = &entries_[child_id]; | |
| 261 if (child_entry->parents->find(prefilter->unique_id()) == | |
| 262 child_entry->parents->end()) { | |
| 263 (*child_entry->parents)[prefilter->unique_id()] = 1; | |
| 264 } | |
| 265 } | |
| 266 entry->propagate_up_at_count = prefilter->op() == Prefilter::AND | |
| 267 ? static_cast<int>(uniq_child.size()) | |
| 268 : 1; | |
| 269 | |
| 270 break; | |
| 271 } | |
| 272 } | |
| 273 } | |
| 274 | |
| 275 // For top level nodes, populate regexp id. | |
| 276 for (size_t i = 0; i < prefilter_vec_.size(); i++) { | |
| 277 if (prefilter_vec_[i] == NULL) | |
| 278 continue; | |
| 279 int id = CanonicalNode(prefilter_vec_[i])->unique_id(); | |
| 280 DCHECK_LE(0, id); | |
| 281 Entry* entry = &entries_[id]; | |
| 282 entry->regexps.push_back(static_cast<int>(i)); | |
| 283 } | |
| 284 } | |
| 285 | |
| 286 // Functions for triggering during search. | |
| 287 void PrefilterTree::RegexpsGivenStrings( | |
| 288 const vector<int>& matched_atoms, | |
| 289 vector<int>* regexps) const { | |
| 290 regexps->clear(); | |
| 291 if (!compiled_) { | |
| 292 LOG(WARNING) << "Compile() not called"; | |
| 293 for (size_t i = 0; i < prefilter_vec_.size(); ++i) | |
| 294 regexps->push_back(static_cast<int>(i)); | |
| 295 } else { | |
| 296 if (!prefilter_vec_.empty()) { | |
| 297 IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); | |
| 298 vector<int> matched_atom_ids; | |
| 299 for (size_t j = 0; j < matched_atoms.size(); j++) { | |
| 300 matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); | |
| 301 VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; | |
| 302 } | |
| 303 PropagateMatch(matched_atom_ids, ®exps_map); | |
| 304 for (IntMap::iterator it = regexps_map.begin(); | |
| 305 it != regexps_map.end(); | |
| 306 ++it) | |
| 307 regexps->push_back(it->index()); | |
| 308 | |
| 309 regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); | |
| 310 } | |
| 311 } | |
| 312 sort(regexps->begin(), regexps->end()); | |
| 313 } | |
| 314 | |
| 315 void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, | |
| 316 IntMap* regexps) const { | |
| 317 IntMap count(static_cast<int>(entries_.size())); | |
| 318 IntMap work(static_cast<int>(entries_.size())); | |
| 319 for (size_t i = 0; i < atom_ids.size(); i++) | |
| 320 work.set(atom_ids[i], 1); | |
| 321 for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { | |
| 322 const Entry& entry = entries_[it->index()]; | |
| 323 VLOG(10) << "Processing: " << it->index(); | |
| 324 // Record regexps triggered. | |
| 325 for (size_t i = 0; i < entry.regexps.size(); i++) { | |
| 326 VLOG(10) << "Regexp triggered: " << entry.regexps[i]; | |
| 327 regexps->set(entry.regexps[i], 1); | |
| 328 } | |
| 329 int c; | |
| 330 // Pass trigger up to parents. | |
| 331 for (StdIntMap::iterator it = entry.parents->begin(); | |
| 332 it != entry.parents->end(); | |
| 333 ++it) { | |
| 334 int j = it->first; | |
| 335 const Entry& parent = entries_[j]; | |
| 336 VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; | |
| 337 // Delay until all the children have succeeded. | |
| 338 if (parent.propagate_up_at_count > 1) { | |
| 339 if (count.has_index(j)) { | |
| 340 c = count.get_existing(j) + 1; | |
| 341 count.set_existing(j, c); | |
| 342 } else { | |
| 343 c = 1; | |
| 344 count.set_new(j, c); | |
| 345 } | |
| 346 if (c < parent.propagate_up_at_count) | |
| 347 continue; | |
| 348 } | |
| 349 VLOG(10) << "Triggering: " << j; | |
| 350 // Trigger the parent. | |
| 351 work.set(j, 1); | |
| 352 } | |
| 353 } | |
| 354 } | |
| 355 | |
| 356 // Debugging help. | |
| 357 void PrefilterTree::PrintPrefilter(int regexpid) { | |
| 358 LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); | |
| 359 } | |
| 360 | |
| 361 void PrefilterTree::PrintDebugInfo() { | |
| 362 VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); | |
| 363 VLOG(10) << "#Unique Nodes: " << entries_.size(); | |
| 364 | |
| 365 for (size_t i = 0; i < entries_.size(); ++i) { | |
| 366 StdIntMap* parents = entries_[i].parents; | |
| 367 const vector<int>& regexps = entries_[i].regexps; | |
| 368 VLOG(10) << "EntryId: " << i | |
| 369 << " N: " << parents->size() << " R: " << regexps.size(); | |
| 370 for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) | |
| 371 VLOG(10) << it->first; | |
| 372 } | |
| 373 VLOG(10) << "Map:"; | |
| 374 for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); | |
| 375 iter != node_map_.end(); ++iter) | |
| 376 VLOG(10) << "NodeId: " << (*iter).second->unique_id() | |
| 377 << " Str: " << (*iter).first; | |
| 378 } | |
| 379 | |
| 380 string PrefilterTree::DebugNodeString(Prefilter* node) const { | |
| 381 string node_string = ""; | |
| 382 | |
| 383 if (node->op() == Prefilter::ATOM) { | |
| 384 DCHECK(!node->atom().empty()); | |
| 385 node_string += node->atom(); | |
| 386 } else { | |
| 387 // Adding the operation disambiguates AND and OR nodes. | |
| 388 node_string += node->op() == Prefilter::AND ? "AND" : "OR"; | |
| 389 node_string += "("; | |
| 390 for (size_t i = 0; i < node->subs()->size(); i++) { | |
| 391 if (i > 0) | |
| 392 node_string += ','; | |
| 393 node_string += Itoa((*node->subs())[i]->unique_id()); | |
| 394 node_string += ":"; | |
| 395 node_string += DebugNodeString((*node->subs())[i]); | |
| 396 } | |
| 397 node_string += ")"; | |
| 398 } | |
| 399 return node_string; | |
| 400 } | |
| 401 | |
| 402 } // namespace re2 | |
| OLD | NEW |