OLD | NEW |
| (Empty) |
1 // Copyright 2009 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // The PrefilterTree class is used to form an AND-OR tree of strings | |
6 // that would trigger each regexp. The 'prefilter' of each regexp is | |
7 // added tp PrefilterTree, and then PrefilterTree is used to find all | |
8 // the unique strings across the prefilters. During search, by using | |
9 // matches from a string matching engine, PrefilterTree deduces the | |
10 // set of regexps that are to be triggered. The 'string matching | |
11 // engine' itself is outside of this class, and the caller can use any | |
12 // favorite engine. PrefilterTree provides a set of strings (called | |
13 // atoms) that the user of this class should use to do the string | |
14 // matching. | |
15 // | |
16 #ifndef RE2_PREFILTER_TREE_H_ | |
17 #define RE2_PREFILTER_TREE_H_ | |
18 | |
19 #include "util/util.h" | |
20 #include "util/sparse_array.h" | |
21 | |
22 namespace re2 { | |
23 | |
24 typedef SparseArray<int> IntMap; | |
25 typedef map<int, int> StdIntMap; | |
26 | |
27 class Prefilter; | |
28 | |
29 class PrefilterTree { | |
30 public: | |
31 PrefilterTree(); | |
32 ~PrefilterTree(); | |
33 | |
34 // Adds the prefilter for the next regexp. Note that we assume that | |
35 // Add called sequentially for all regexps. All Add calls | |
36 // must precede Compile. | |
37 void Add(Prefilter* prefilter); | |
38 | |
39 // The Compile returns a vector of string in atom_vec. | |
40 // Call this after all the prefilters are added through Add. | |
41 // No calls to Add after Compile are allowed. | |
42 // The caller should use the returned set of strings to do string matching. | |
43 // Each time a string matches, the corresponding index then has to be | |
44 // and passed to RegexpsGivenStrings below. | |
45 void Compile(vector<string>* atom_vec); | |
46 | |
47 // Given the indices of the atoms that matched, returns the indexes | |
48 // of regexps that should be searched. The matched_atoms should | |
49 // contain all the ids of string atoms that were found to match the | |
50 // content. The caller can use any string match engine to perform | |
51 // this function. This function is thread safe. | |
52 void RegexpsGivenStrings(const vector<int>& matched_atoms, | |
53 vector<int>* regexps) const; | |
54 | |
55 // Print debug prefilter. Also prints unique ids associated with | |
56 // nodes of the prefilter of the regexp. | |
57 void PrintPrefilter(int regexpid); | |
58 | |
59 | |
60 // Each unique node has a corresponding Entry that helps in | |
61 // passing the matching trigger information along the tree. | |
62 struct Entry { | |
63 public: | |
64 // How many children should match before this node triggers the | |
65 // parent. For an atom and an OR node, this is 1 and for an AND | |
66 // node, it is the number of unique children. | |
67 int propagate_up_at_count; | |
68 | |
69 // When this node is ready to trigger the parent, what are the indices | |
70 // of the parent nodes to trigger. The reason there may be more than | |
71 // one is because of sharing. For example (abc | def) and (xyz | def) | |
72 // are two different nodes, but they share the atom 'def'. So when | |
73 // 'def' matches, it triggers two parents, corresponding to the two | |
74 // different OR nodes. | |
75 StdIntMap* parents; | |
76 | |
77 // When this node is ready to trigger the parent, what are the | |
78 // regexps that are triggered. | |
79 vector<int> regexps; | |
80 }; | |
81 | |
82 private: | |
83 // This function assigns unique ids to various parts of the | |
84 // prefilter, by looking at if these nodes are already in the | |
85 // PrefilterTree. | |
86 void AssignUniqueIds(vector<string>* atom_vec); | |
87 | |
88 // Given the matching atoms, find the regexps to be triggered. | |
89 void PropagateMatch(const vector<int>& atom_ids, | |
90 IntMap* regexps) const; | |
91 | |
92 // Returns the prefilter node that has the same NodeString as this | |
93 // node. For the canonical node, returns node. | |
94 Prefilter* CanonicalNode(Prefilter* node); | |
95 | |
96 // A string that uniquely identifies the node. Assumes that the | |
97 // children of node has already been assigned unique ids. | |
98 string NodeString(Prefilter* node) const; | |
99 | |
100 // Recursively constructs a readable prefilter string. | |
101 string DebugNodeString(Prefilter* node) const; | |
102 | |
103 // Used for debugging. | |
104 void PrintDebugInfo(); | |
105 | |
106 // These are all the nodes formed by Compile. Essentially, there is | |
107 // one node for each unique atom and each unique AND/OR node. | |
108 vector<Entry> entries_; | |
109 | |
110 // Map node string to canonical Prefilter node. | |
111 map<string, Prefilter*> node_map_; | |
112 | |
113 // indices of regexps that always pass through the filter (since we | |
114 // found no required literals in these regexps). | |
115 vector<int> unfiltered_; | |
116 | |
117 // vector of Prefilter for all regexps. | |
118 vector<Prefilter*> prefilter_vec_; | |
119 | |
120 // Atom index in returned strings to entry id mapping. | |
121 vector<int> atom_index_to_id_; | |
122 | |
123 // Has the prefilter tree been compiled. | |
124 bool compiled_; | |
125 | |
126 DISALLOW_COPY_AND_ASSIGN(PrefilterTree); | |
127 }; | |
128 | |
129 } // namespace | |
130 | |
131 #endif // RE2_PREFILTER_TREE_H_ | |
OLD | NEW |