Index: third_party/re2/re2/prefilter_tree.cc |
diff --git a/third_party/re2/re2/prefilter_tree.cc b/third_party/re2/re2/prefilter_tree.cc |
deleted file mode 100644 |
index be9b5844b8f05718929305a388eccb1a117c100a..0000000000000000000000000000000000000000 |
--- a/third_party/re2/re2/prefilter_tree.cc |
+++ /dev/null |
@@ -1,402 +0,0 @@ |
-// Copyright 2009 The RE2 Authors. All Rights Reserved. |
-// Use of this source code is governed by a BSD-style |
-// license that can be found in the LICENSE file. |
- |
-#include "util/util.h" |
-#include "util/flags.h" |
-#include "re2/prefilter.h" |
-#include "re2/prefilter_tree.h" |
-#include "re2/re2.h" |
- |
-DEFINE_int32(filtered_re2_min_atom_len, |
- 3, |
- "Strings less than this length are not stored as atoms"); |
- |
-namespace re2 { |
- |
-PrefilterTree::PrefilterTree() |
- : compiled_(false) { |
-} |
- |
-PrefilterTree::~PrefilterTree() { |
- for (size_t i = 0; i < prefilter_vec_.size(); i++) |
- delete prefilter_vec_[i]; |
- |
- for (size_t i = 0; i < entries_.size(); i++) |
- delete entries_[i].parents; |
-} |
- |
-// Functions used for adding and Compiling prefilters to the |
-// PrefilterTree. |
-static bool KeepPart(Prefilter* prefilter, int level) { |
- if (prefilter == NULL) |
- return false; |
- |
- switch (prefilter->op()) { |
- default: |
- LOG(DFATAL) << "Unexpected op in KeepPart: " |
- << prefilter->op(); |
- return false; |
- |
- case Prefilter::ALL: |
- return false; |
- |
- case Prefilter::ATOM: |
- return prefilter->atom().size() >= |
- static_cast<size_t>(FLAGS_filtered_re2_min_atom_len); |
- |
- case Prefilter::AND: { |
- int j = 0; |
- vector<Prefilter*>* subs = prefilter->subs(); |
- for (size_t i = 0; i < subs->size(); i++) |
- if (KeepPart((*subs)[i], level + 1)) |
- (*subs)[j++] = (*subs)[i]; |
- else |
- delete (*subs)[i]; |
- |
- subs->resize(j); |
- return j > 0; |
- } |
- |
- case Prefilter::OR: |
- for (size_t i = 0; i < prefilter->subs()->size(); i++) |
- if (!KeepPart((*prefilter->subs())[i], level + 1)) |
- return false; |
- return true; |
- } |
-} |
- |
-void PrefilterTree::Add(Prefilter *f) { |
- if (compiled_) { |
- LOG(DFATAL) << "Add after Compile."; |
- return; |
- } |
- if (f != NULL && !KeepPart(f, 0)) { |
- delete f; |
- f = NULL; |
- } |
- |
- prefilter_vec_.push_back(f); |
-} |
- |
-void PrefilterTree::Compile(vector<string>* atom_vec) { |
- if (compiled_) { |
- LOG(DFATAL) << "Compile after Compile."; |
- return; |
- } |
- |
- // We do this check to support some legacy uses of |
- // PrefilterTree that call Compile before adding any regexps, |
- // and expect Compile not to have effect. |
- if (prefilter_vec_.empty()) |
- return; |
- |
- compiled_ = true; |
- |
- AssignUniqueIds(atom_vec); |
- |
- // Identify nodes that are too common among prefilters and are |
- // triggering too many parents. Then get rid of them if possible. |
- // Note that getting rid of a prefilter node simply means they are |
- // no longer necessary for their parent to trigger; that is, we do |
- // not miss out on any regexps triggering by getting rid of a |
- // prefilter node. |
- for (size_t i = 0; i < entries_.size(); i++) { |
- StdIntMap* parents = entries_[i].parents; |
- if (parents->size() > 8) { |
- // This one triggers too many things. If all the parents are AND |
- // nodes and have other things guarding them, then get rid of |
- // this trigger. TODO(vsri): Adjust the threshold appropriately, |
- // make it a function of total number of nodes? |
- bool have_other_guard = true; |
- for (StdIntMap::iterator it = parents->begin(); |
- it != parents->end(); ++it) { |
- have_other_guard = have_other_guard && |
- (entries_[it->first].propagate_up_at_count > 1); |
- } |
- |
- if (have_other_guard) { |
- for (StdIntMap::iterator it = parents->begin(); |
- it != parents->end(); ++it) |
- entries_[it->first].propagate_up_at_count -= 1; |
- |
- parents->clear(); // Forget the parents |
- } |
- } |
- } |
- |
- PrintDebugInfo(); |
-} |
- |
-Prefilter* PrefilterTree::CanonicalNode(Prefilter* node) { |
- string node_string = NodeString(node); |
- map<string, Prefilter*>::iterator iter = node_map_.find(node_string); |
- if (iter == node_map_.end()) |
- return NULL; |
- return (*iter).second; |
-} |
- |
-static string Itoa(int n) { |
- char buf[100]; |
- snprintf(buf, sizeof buf, "%d", n); |
- return string(buf); |
-} |
- |
-string PrefilterTree::NodeString(Prefilter* node) const { |
- // Adding the operation disambiguates AND/OR/atom nodes. |
- string s = Itoa(node->op()) + ":"; |
- if (node->op() == Prefilter::ATOM) { |
- s += node->atom(); |
- } else { |
- for (size_t i = 0; i < node->subs()->size(); i++) { |
- if (i > 0) |
- s += ','; |
- s += Itoa((*node->subs())[i]->unique_id()); |
- } |
- } |
- return s; |
-} |
- |
-void PrefilterTree::AssignUniqueIds(vector<string>* atom_vec) { |
- atom_vec->clear(); |
- |
- // Build vector of all filter nodes, sorted topologically |
- // from top to bottom in v. |
- vector<Prefilter*> v; |
- |
- // Add the top level nodes of each regexp prefilter. |
- for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
- Prefilter* f = prefilter_vec_[i]; |
- if (f == NULL) |
- unfiltered_.push_back(static_cast<int>(i)); |
- |
- // We push NULL also on to v, so that we maintain the |
- // mapping of index==regexpid for level=0 prefilter nodes. |
- v.push_back(f); |
- } |
- |
- // Now add all the descendant nodes. |
- for (size_t i = 0; i < v.size(); i++) { |
- Prefilter* f = v[i]; |
- if (f == NULL) |
- continue; |
- if (f->op() == Prefilter::AND || f->op() == Prefilter::OR) { |
- const vector<Prefilter*>& subs = *f->subs(); |
- for (size_t j = 0; j < subs.size(); j++) |
- v.push_back(subs[j]); |
- } |
- } |
- |
- // Identify unique nodes. |
- int unique_id = 0; |
- for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
- Prefilter *node = v[i]; |
- if (node == NULL) |
- continue; |
- node->set_unique_id(-1); |
- Prefilter* canonical = CanonicalNode(node); |
- if (canonical == NULL) { |
- // Any further nodes that have the same node string |
- // will find this node as the canonical node. |
- node_map_[NodeString(node)] = node; |
- if (node->op() == Prefilter::ATOM) { |
- atom_vec->push_back(node->atom()); |
- atom_index_to_id_.push_back(unique_id); |
- } |
- node->set_unique_id(unique_id++); |
- } else { |
- node->set_unique_id(canonical->unique_id()); |
- } |
- } |
- entries_.resize(node_map_.size()); |
- |
- // Create parent StdIntMap for the entries. |
- for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
- Prefilter* prefilter = v[i]; |
- if (prefilter == NULL) |
- continue; |
- |
- if (CanonicalNode(prefilter) != prefilter) |
- continue; |
- |
- Entry* entry = &entries_[prefilter->unique_id()]; |
- entry->parents = new StdIntMap(); |
- } |
- |
- // Fill the entries. |
- for (int i = static_cast<int>(v.size()) - 1; i >= 0; i--) { |
- Prefilter* prefilter = v[i]; |
- if (prefilter == NULL) |
- continue; |
- |
- if (CanonicalNode(prefilter) != prefilter) |
- continue; |
- |
- Entry* entry = &entries_[prefilter->unique_id()]; |
- |
- switch (prefilter->op()) { |
- default: |
- case Prefilter::ALL: |
- LOG(DFATAL) << "Unexpected op: " << prefilter->op(); |
- return; |
- |
- case Prefilter::ATOM: |
- entry->propagate_up_at_count = 1; |
- break; |
- |
- case Prefilter::OR: |
- case Prefilter::AND: { |
- set<int> uniq_child; |
- for (size_t j = 0; j < prefilter->subs()->size(); j++) { |
- Prefilter* child = (*prefilter->subs())[j]; |
- Prefilter* canonical = CanonicalNode(child); |
- if (canonical == NULL) { |
- LOG(DFATAL) << "Null canonical node"; |
- return; |
- } |
- int child_id = canonical->unique_id(); |
- uniq_child.insert(child_id); |
- // To the child, we want to add to parent indices. |
- Entry* child_entry = &entries_[child_id]; |
- if (child_entry->parents->find(prefilter->unique_id()) == |
- child_entry->parents->end()) { |
- (*child_entry->parents)[prefilter->unique_id()] = 1; |
- } |
- } |
- entry->propagate_up_at_count = prefilter->op() == Prefilter::AND |
- ? static_cast<int>(uniq_child.size()) |
- : 1; |
- |
- break; |
- } |
- } |
- } |
- |
- // For top level nodes, populate regexp id. |
- for (size_t i = 0; i < prefilter_vec_.size(); i++) { |
- if (prefilter_vec_[i] == NULL) |
- continue; |
- int id = CanonicalNode(prefilter_vec_[i])->unique_id(); |
- DCHECK_LE(0, id); |
- Entry* entry = &entries_[id]; |
- entry->regexps.push_back(static_cast<int>(i)); |
- } |
-} |
- |
-// Functions for triggering during search. |
-void PrefilterTree::RegexpsGivenStrings( |
- const vector<int>& matched_atoms, |
- vector<int>* regexps) const { |
- regexps->clear(); |
- if (!compiled_) { |
- LOG(WARNING) << "Compile() not called"; |
- for (size_t i = 0; i < prefilter_vec_.size(); ++i) |
- regexps->push_back(static_cast<int>(i)); |
- } else { |
- if (!prefilter_vec_.empty()) { |
- IntMap regexps_map(static_cast<int>(prefilter_vec_.size())); |
- vector<int> matched_atom_ids; |
- for (size_t j = 0; j < matched_atoms.size(); j++) { |
- matched_atom_ids.push_back(atom_index_to_id_[matched_atoms[j]]); |
- VLOG(10) << "Atom id:" << atom_index_to_id_[matched_atoms[j]]; |
- } |
- PropagateMatch(matched_atom_ids, ®exps_map); |
- for (IntMap::iterator it = regexps_map.begin(); |
- it != regexps_map.end(); |
- ++it) |
- regexps->push_back(it->index()); |
- |
- regexps->insert(regexps->end(), unfiltered_.begin(), unfiltered_.end()); |
- } |
- } |
- sort(regexps->begin(), regexps->end()); |
-} |
- |
-void PrefilterTree::PropagateMatch(const vector<int>& atom_ids, |
- IntMap* regexps) const { |
- IntMap count(static_cast<int>(entries_.size())); |
- IntMap work(static_cast<int>(entries_.size())); |
- for (size_t i = 0; i < atom_ids.size(); i++) |
- work.set(atom_ids[i], 1); |
- for (IntMap::iterator it = work.begin(); it != work.end(); ++it) { |
- const Entry& entry = entries_[it->index()]; |
- VLOG(10) << "Processing: " << it->index(); |
- // Record regexps triggered. |
- for (size_t i = 0; i < entry.regexps.size(); i++) { |
- VLOG(10) << "Regexp triggered: " << entry.regexps[i]; |
- regexps->set(entry.regexps[i], 1); |
- } |
- int c; |
- // Pass trigger up to parents. |
- for (StdIntMap::iterator it = entry.parents->begin(); |
- it != entry.parents->end(); |
- ++it) { |
- int j = it->first; |
- const Entry& parent = entries_[j]; |
- VLOG(10) << " parent= " << j << " trig= " << parent.propagate_up_at_count; |
- // Delay until all the children have succeeded. |
- if (parent.propagate_up_at_count > 1) { |
- if (count.has_index(j)) { |
- c = count.get_existing(j) + 1; |
- count.set_existing(j, c); |
- } else { |
- c = 1; |
- count.set_new(j, c); |
- } |
- if (c < parent.propagate_up_at_count) |
- continue; |
- } |
- VLOG(10) << "Triggering: " << j; |
- // Trigger the parent. |
- work.set(j, 1); |
- } |
- } |
-} |
- |
-// Debugging help. |
-void PrefilterTree::PrintPrefilter(int regexpid) { |
- LOG(INFO) << DebugNodeString(prefilter_vec_[regexpid]); |
-} |
- |
-void PrefilterTree::PrintDebugInfo() { |
- VLOG(10) << "#Unique Atoms: " << atom_index_to_id_.size(); |
- VLOG(10) << "#Unique Nodes: " << entries_.size(); |
- |
- for (size_t i = 0; i < entries_.size(); ++i) { |
- StdIntMap* parents = entries_[i].parents; |
- const vector<int>& regexps = entries_[i].regexps; |
- VLOG(10) << "EntryId: " << i |
- << " N: " << parents->size() << " R: " << regexps.size(); |
- for (StdIntMap::iterator it = parents->begin(); it != parents->end(); ++it) |
- VLOG(10) << it->first; |
- } |
- VLOG(10) << "Map:"; |
- for (map<string, Prefilter*>::const_iterator iter = node_map_.begin(); |
- iter != node_map_.end(); ++iter) |
- VLOG(10) << "NodeId: " << (*iter).second->unique_id() |
- << " Str: " << (*iter).first; |
-} |
- |
-string PrefilterTree::DebugNodeString(Prefilter* node) const { |
- string node_string = ""; |
- |
- if (node->op() == Prefilter::ATOM) { |
- DCHECK(!node->atom().empty()); |
- node_string += node->atom(); |
- } else { |
- // Adding the operation disambiguates AND and OR nodes. |
- node_string += node->op() == Prefilter::AND ? "AND" : "OR"; |
- node_string += "("; |
- for (size_t i = 0; i < node->subs()->size(); i++) { |
- if (i > 0) |
- node_string += ','; |
- node_string += Itoa((*node->subs())[i]->unique_id()); |
- node_string += ":"; |
- node_string += DebugNodeString((*node->subs())[i]); |
- } |
- node_string += ")"; |
- } |
- return node_string; |
-} |
- |
-} // namespace re2 |