OLD | NEW |
| (Empty) |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Helper class for traversing Regexps without recursion. | |
6 // Clients should declare their own subclasses that override | |
7 // the PreVisit and PostVisit methods, which are called before | |
8 // and after visiting the subexpressions. | |
9 | |
10 // Not quite the Visitor pattern, because (among other things) | |
11 // the Visitor pattern is recursive. | |
12 | |
13 #ifndef RE2_WALKER_INL_H__ | |
14 #define RE2_WALKER_INL_H__ | |
15 | |
16 #include "re2/regexp.h" | |
17 | |
18 namespace re2 { | |
19 | |
20 template<typename T> struct WalkState; | |
21 | |
22 template<typename T> class Regexp::Walker { | |
23 public: | |
24 Walker(); | |
25 virtual ~Walker(); | |
26 | |
27 // Virtual method called before visiting re's children. | |
28 // PreVisit passes ownership of its return value to its caller. | |
29 // The Arg* that PreVisit returns will be passed to PostVisit as pre_arg | |
30 // and passed to the child PreVisits and PostVisits as parent_arg. | |
31 // At the top-most Regexp, parent_arg is arg passed to walk. | |
32 // If PreVisit sets *stop to true, the walk does not recurse | |
33 // into the children. Instead it behaves as though the return | |
34 // value from PreVisit is the return value from PostVisit. | |
35 // The default PreVisit returns parent_arg. | |
36 virtual T PreVisit(Regexp* re, T parent_arg, bool* stop); | |
37 | |
38 // Virtual method called after visiting re's children. | |
39 // The pre_arg is the T that PreVisit returned. | |
40 // The child_args is a vector of the T that the child PostVisits returned. | |
41 // PostVisit takes ownership of pre_arg. | |
42 // PostVisit takes ownership of the Ts | |
43 // in *child_args, but not the vector itself. | |
44 // PostVisit passes ownership of its return value | |
45 // to its caller. | |
46 // The default PostVisit simply returns pre_arg. | |
47 virtual T PostVisit(Regexp* re, T parent_arg, T pre_arg, | |
48 T* child_args, int nchild_args); | |
49 | |
50 // Virtual method called to copy a T, | |
51 // when Walk notices that more than one child is the same re. | |
52 virtual T Copy(T arg); | |
53 | |
54 // Virtual method called to do a "quick visit" of the re, | |
55 // but not its children. Only called once the visit budget | |
56 // has been used up and we're trying to abort the walk | |
57 // as quickly as possible. Should return a value that | |
58 // makes sense for the parent PostVisits still to be run. | |
59 // This function is (hopefully) only called by | |
60 // WalkExponential, but must be implemented by all clients, | |
61 // just in case. | |
62 virtual T ShortVisit(Regexp* re, T parent_arg) = 0; | |
63 | |
64 // Walks over a regular expression. | |
65 // Top_arg is passed as parent_arg to PreVisit and PostVisit of re. | |
66 // Returns the T returned by PostVisit on re. | |
67 T Walk(Regexp* re, T top_arg); | |
68 | |
69 // Like Walk, but doesn't use Copy. This can lead to | |
70 // exponential runtimes on cross-linked Regexps like the | |
71 // ones generated by Simplify. To help limit this, | |
72 // at most max_visits nodes will be visited and then | |
73 // the walk will be cut off early. | |
74 // If the walk *is* cut off early, ShortVisit(re) | |
75 // will be called on regexps that cannot be fully | |
76 // visited rather than calling PreVisit/PostVisit. | |
77 T WalkExponential(Regexp* re, T top_arg, int max_visits); | |
78 | |
79 // Clears the stack. Should never be necessary, since | |
80 // Walk always enters and exits with an empty stack. | |
81 // Logs DFATAL if stack is not already clear. | |
82 void Reset(); | |
83 | |
84 // Returns whether walk was cut off. | |
85 bool stopped_early() { return stopped_early_; } | |
86 | |
87 private: | |
88 // Walk state for the entire traversal. | |
89 stack<WalkState<T> >* stack_; | |
90 bool stopped_early_; | |
91 int max_visits_; | |
92 | |
93 T WalkInternal(Regexp* re, T top_arg, bool use_copy); | |
94 | |
95 DISALLOW_COPY_AND_ASSIGN(Walker); | |
96 }; | |
97 | |
98 template<typename T> T Regexp::Walker<T>::PreVisit(Regexp* re, | |
99 T parent_arg, | |
100 bool* stop) { | |
101 return parent_arg; | |
102 } | |
103 | |
104 template<typename T> T Regexp::Walker<T>::PostVisit(Regexp* re, | |
105 T parent_arg, | |
106 T pre_arg, | |
107 T* child_args, | |
108 int nchild_args) { | |
109 return pre_arg; | |
110 } | |
111 | |
112 template<typename T> T Regexp::Walker<T>::Copy(T arg) { | |
113 return arg; | |
114 } | |
115 | |
116 // State about a single level in the traversal. | |
117 template<typename T> struct WalkState { | |
118 WalkState<T>(Regexp* re, T parent) | |
119 : re(re), | |
120 n(-1), | |
121 parent_arg(parent), | |
122 child_args(NULL) { } | |
123 | |
124 Regexp* re; // The regexp | |
125 int n; // The index of the next child to process; -1 means need to PreVisit | |
126 T parent_arg; // Accumulated arguments. | |
127 T pre_arg; | |
128 T child_arg; // One-element buffer for child_args. | |
129 T* child_args; | |
130 }; | |
131 | |
132 template<typename T> Regexp::Walker<T>::Walker() { | |
133 stack_ = new stack<WalkState<T> >; | |
134 stopped_early_ = false; | |
135 } | |
136 | |
137 template<typename T> Regexp::Walker<T>::~Walker() { | |
138 Reset(); | |
139 delete stack_; | |
140 } | |
141 | |
142 // Clears the stack. Should never be necessary, since | |
143 // Walk always enters and exits with an empty stack. | |
144 // Logs DFATAL if stack is not already clear. | |
145 template<typename T> void Regexp::Walker<T>::Reset() { | |
146 if (stack_ && stack_->size() > 0) { | |
147 LOG(DFATAL) << "Stack not empty."; | |
148 while (stack_->size() > 0) { | |
149 delete stack_->top().child_args; | |
150 stack_->pop(); | |
151 } | |
152 } | |
153 } | |
154 | |
155 template<typename T> T Regexp::Walker<T>::WalkInternal(Regexp* re, T top_arg, | |
156 bool use_copy) { | |
157 Reset(); | |
158 | |
159 if (re == NULL) { | |
160 LOG(DFATAL) << "Walk NULL"; | |
161 return top_arg; | |
162 } | |
163 | |
164 stack_->push(WalkState<T>(re, top_arg)); | |
165 | |
166 WalkState<T>* s; | |
167 for (;;) { | |
168 T t; | |
169 s = &stack_->top(); | |
170 Regexp* re = s->re; | |
171 switch (s->n) { | |
172 case -1: { | |
173 if (--max_visits_ < 0) { | |
174 stopped_early_ = true; | |
175 t = ShortVisit(re, s->parent_arg); | |
176 break; | |
177 } | |
178 bool stop = false; | |
179 s->pre_arg = PreVisit(re, s->parent_arg, &stop); | |
180 if (stop) { | |
181 t = s->pre_arg; | |
182 break; | |
183 } | |
184 s->n = 0; | |
185 s->child_args = NULL; | |
186 if (re->nsub_ == 1) | |
187 s->child_args = &s->child_arg; | |
188 else if (re->nsub_ > 1) | |
189 s->child_args = new T[re->nsub_]; | |
190 // Fall through. | |
191 } | |
192 default: { | |
193 if (re->nsub_ > 0) { | |
194 Regexp** sub = re->sub(); | |
195 if (s->n < re->nsub_) { | |
196 if (use_copy && s->n > 0 && sub[s->n - 1] == sub[s->n]) { | |
197 s->child_args[s->n] = Copy(s->child_args[s->n - 1]); | |
198 s->n++; | |
199 } else { | |
200 stack_->push(WalkState<T>(sub[s->n], s->pre_arg)); | |
201 } | |
202 continue; | |
203 } | |
204 } | |
205 | |
206 t = PostVisit(re, s->parent_arg, s->pre_arg, s->child_args, s->n); | |
207 if (re->nsub_ > 1) | |
208 delete[] s->child_args; | |
209 break; | |
210 } | |
211 } | |
212 | |
213 // We've finished stack_->top(). | |
214 // Update next guy down. | |
215 stack_->pop(); | |
216 if (stack_->size() == 0) | |
217 return t; | |
218 s = &stack_->top(); | |
219 if (s->child_args != NULL) | |
220 s->child_args[s->n] = t; | |
221 else | |
222 s->child_arg = t; | |
223 s->n++; | |
224 } | |
225 } | |
226 | |
227 template<typename T> T Regexp::Walker<T>::Walk(Regexp* re, T top_arg) { | |
228 // Without the exponential walking behavior, | |
229 // this budget should be more than enough for any | |
230 // regexp, and yet not enough to get us in trouble | |
231 // as far as CPU time. | |
232 max_visits_ = 1000000; | |
233 return WalkInternal(re, top_arg, true); | |
234 } | |
235 | |
236 template<typename T> T Regexp::Walker<T>::WalkExponential(Regexp* re, T top_arg, | |
237 int max_visits) { | |
238 max_visits_ = max_visits; | |
239 return WalkInternal(re, top_arg, false); | |
240 } | |
241 | |
242 } // namespace re2 | |
243 | |
244 #endif // RE2_WALKER_INL_H__ | |
OLD | NEW |