OLD | NEW |
| (Empty) |
1 // Copyright 2007 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Compiled representation of regular expressions. | |
6 // See regexp.h for the Regexp class, which represents a regular | |
7 // expression symbolically. | |
8 | |
9 #ifndef RE2_PROG_H__ | |
10 #define RE2_PROG_H__ | |
11 | |
12 #include "util/util.h" | |
13 #include "util/sparse_array.h" | |
14 #include "re2/re2.h" | |
15 | |
16 namespace re2 { | |
17 | |
18 // Simple fixed-size bitmap. | |
19 template<int Bits> | |
20 class Bitmap { | |
21 public: | |
22 Bitmap() { Reset(); } | |
23 int Size() { return Bits; } | |
24 | |
25 void Reset() { | |
26 for (int i = 0; i < Words; i++) | |
27 w_[i] = 0; | |
28 } | |
29 bool Get(int k) const { | |
30 return w_[k >> WordLog] & (1<<(k & 31)); | |
31 } | |
32 void Set(int k) { | |
33 w_[k >> WordLog] |= 1<<(k & 31); | |
34 } | |
35 void Clear(int k) { | |
36 w_[k >> WordLog] &= ~(1<<(k & 31)); | |
37 } | |
38 uint32 Word(int i) const { | |
39 return w_[i]; | |
40 } | |
41 | |
42 private: | |
43 static const int WordLog = 5; | |
44 static const int Words = (Bits+31)/32; | |
45 uint32 w_[Words]; | |
46 DISALLOW_COPY_AND_ASSIGN(Bitmap); | |
47 }; | |
48 | |
49 | |
50 // Opcodes for Inst | |
51 enum InstOp { | |
52 kInstAlt = 0, // choose between out_ and out1_ | |
53 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice v
ersa. | |
54 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] | |
55 kInstCapture, // capturing parenthesis number cap_ | |
56 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ | |
57 kInstMatch, // found a match! | |
58 kInstNop, // no-op; occasionally unavoidable | |
59 kInstFail, // never match; occasionally unavoidable | |
60 }; | |
61 | |
62 // Bit flags for empty-width specials | |
63 enum EmptyOp { | |
64 kEmptyBeginLine = 1<<0, // ^ - beginning of line | |
65 kEmptyEndLine = 1<<1, // $ - end of line | |
66 kEmptyBeginText = 1<<2, // \A - beginning of text | |
67 kEmptyEndText = 1<<3, // \z - end of text | |
68 kEmptyWordBoundary = 1<<4, // \b - word boundary | |
69 kEmptyNonWordBoundary = 1<<5, // \B - not \b | |
70 kEmptyAllFlags = (1<<6)-1, | |
71 }; | |
72 | |
73 class Regexp; | |
74 | |
75 class DFA; | |
76 struct OneState; | |
77 | |
78 // Compiled form of regexp program. | |
79 class Prog { | |
80 public: | |
81 Prog(); | |
82 ~Prog(); | |
83 | |
84 // Single instruction in regexp program. | |
85 class Inst { | |
86 public: | |
87 Inst() : out_opcode_(0), out1_(0) { } | |
88 | |
89 // Constructors per opcode | |
90 void InitAlt(uint32 out, uint32 out1); | |
91 void InitByteRange(int lo, int hi, int foldcase, uint32 out); | |
92 void InitCapture(int cap, uint32 out); | |
93 void InitEmptyWidth(EmptyOp empty, uint32 out); | |
94 void InitMatch(int id); | |
95 void InitNop(uint32 out); | |
96 void InitFail(); | |
97 | |
98 // Getters | |
99 int id(Prog* p) { return static_cast<int>(this - p->inst_); } | |
100 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } | |
101 int out() { return out_opcode_>>3; } | |
102 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); r
eturn out1_; } | |
103 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } | |
104 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } | |
105 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } | |
106 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } | |
107 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } | |
108 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } | |
109 bool greedy(Prog *p) { | |
110 DCHECK_EQ(opcode(), kInstAltMatch); | |
111 return p->inst(out())->opcode() == kInstByteRange; | |
112 } | |
113 | |
114 // Does this inst (an kInstByteRange) match c? | |
115 inline bool Matches(int c) { | |
116 DCHECK_EQ(opcode(), kInstByteRange); | |
117 if (foldcase_ && 'A' <= c && c <= 'Z') | |
118 c += 'a' - 'A'; | |
119 return lo_ <= c && c <= hi_; | |
120 } | |
121 | |
122 // Returns string representation for debugging. | |
123 string Dump(); | |
124 | |
125 // Maximum instruction id. | |
126 // (Must fit in out_opcode_, and PatchList steals another bit.) | |
127 static const int kMaxInst = (1<<28) - 1; | |
128 | |
129 private: | |
130 void set_opcode(InstOp opcode) { | |
131 out_opcode_ = (out()<<3) | opcode; | |
132 } | |
133 | |
134 void set_out(int out) { | |
135 out_opcode_ = (out<<3) | opcode(); | |
136 } | |
137 | |
138 void set_out_opcode(int out, InstOp opcode) { | |
139 out_opcode_ = (out<<3) | opcode; | |
140 } | |
141 | |
142 uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode | |
143 union { // additional instruction arguments: | |
144 uint32 out1_; // opcode == kInstAlt | |
145 // alternate next instruction | |
146 | |
147 int32 cap_; // opcode == kInstCapture | |
148 // Index of capture register (holds text | |
149 // position recorded by capturing parentheses). | |
150 // For \n (the submatch for the nth parentheses), | |
151 // the left parenthesis captures into register 2*n | |
152 // and the right one captures into register 2*n+1. | |
153 | |
154 int32 match_id_; // opcode == kInstMatch | |
155 // Match ID to identify this match (for re2::Set). | |
156 | |
157 struct { // opcode == kInstByteRange | |
158 uint8 lo_; // byte range is lo_-hi_ inclusive | |
159 uint8 hi_; // | |
160 uint8 foldcase_; // convert A-Z to a-z before checking range. | |
161 }; | |
162 | |
163 EmptyOp empty_; // opcode == kInstEmptyWidth | |
164 // empty_ is bitwise OR of kEmpty* flags above. | |
165 }; | |
166 | |
167 friend class Compiler; | |
168 friend struct PatchList; | |
169 friend class Prog; | |
170 | |
171 DISALLOW_COPY_AND_ASSIGN(Inst); | |
172 }; | |
173 | |
174 // Whether to anchor the search. | |
175 enum Anchor { | |
176 kUnanchored, // match anywhere | |
177 kAnchored, // match only starting at beginning of text | |
178 }; | |
179 | |
180 // Kind of match to look for (for anchor != kFullMatch) | |
181 // | |
182 // kLongestMatch mode finds the overall longest | |
183 // match but still makes its submatch choices the way | |
184 // Perl would, not in the way prescribed by POSIX. | |
185 // The POSIX rules are much more expensive to implement, | |
186 // and no one has needed them. | |
187 // | |
188 // kFullMatch is not strictly necessary -- we could use | |
189 // kLongestMatch and then check the length of the match -- but | |
190 // the matching code can run faster if it knows to consider only | |
191 // full matches. | |
192 enum MatchKind { | |
193 kFirstMatch, // like Perl, PCRE | |
194 kLongestMatch, // like egrep or POSIX | |
195 kFullMatch, // match only entire text; implies anchor==kAnchored | |
196 kManyMatch // for SearchDFA, records set of matches | |
197 }; | |
198 | |
199 Inst *inst(int id) { return &inst_[id]; } | |
200 int start() { return start_; } | |
201 int start_unanchored() { return start_unanchored_; } | |
202 void set_start(int start) { start_ = start; } | |
203 void set_start_unanchored(int start) { start_unanchored_ = start; } | |
204 int size() { return size_; } | |
205 bool reversed() { return reversed_; } | |
206 void set_reversed(bool reversed) { reversed_ = reversed; } | |
207 int byte_inst_count() { return byte_inst_count_; } | |
208 const Bitmap<256>& byterange() { return byterange_; } | |
209 void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } | |
210 int64 dfa_mem() { return dfa_mem_; } | |
211 int flags() { return flags_; } | |
212 void set_flags(int flags) { flags_ = flags; } | |
213 bool anchor_start() { return anchor_start_; } | |
214 void set_anchor_start(bool b) { anchor_start_ = b; } | |
215 bool anchor_end() { return anchor_end_; } | |
216 void set_anchor_end(bool b) { anchor_end_ = b; } | |
217 int bytemap_range() { return bytemap_range_; } | |
218 const uint8* bytemap() { return bytemap_; } | |
219 | |
220 // Returns string representation of program for debugging. | |
221 string Dump(); | |
222 string DumpUnanchored(); | |
223 | |
224 // Record that at some point in the prog, the bytes in the range | |
225 // lo-hi (inclusive) are treated as different from bytes outside the range. | |
226 // Tracking this lets the DFA collapse commonly-treated byte ranges | |
227 // when recording state pointers, greatly reducing its memory footprint. | |
228 void MarkByteRange(int lo, int hi); | |
229 | |
230 // Returns the set of kEmpty flags that are in effect at | |
231 // position p within context. | |
232 static uint32 EmptyFlags(const StringPiece& context, const char* p); | |
233 | |
234 // Returns whether byte c is a word character: ASCII only. | |
235 // Used by the implementation of \b and \B. | |
236 // This is not right for Unicode, but: | |
237 // - it's hard to get right in a byte-at-a-time matching world | |
238 // (the DFA has only one-byte lookahead). | |
239 // - even if the lookahead were possible, the Progs would be huge. | |
240 // This crude approximation is the same one PCRE uses. | |
241 static bool IsWordChar(uint8 c) { | |
242 return ('A' <= c && c <= 'Z') || | |
243 ('a' <= c && c <= 'z') || | |
244 ('0' <= c && c <= '9') || | |
245 c == '_'; | |
246 } | |
247 | |
248 // Execution engines. They all search for the regexp (run the prog) | |
249 // in text, which is in the larger context (used for ^ $ \b etc). | |
250 // Anchor and kind control the kind of search. | |
251 // Returns true if match found, false if not. | |
252 // If match found, fills match[0..nmatch-1] with submatch info. | |
253 // match[0] is overall match, match[1] is first set of parens, etc. | |
254 // If a particular submatch is not matched during the regexp match, | |
255 // it is set to NULL. | |
256 // | |
257 // Matching text == StringPiece(NULL, 0) is treated as any other empty | |
258 // string, but note that on return, it will not be possible to distinguish | |
259 // submatches that matched that empty string from submatches that didn't | |
260 // match anything. Either way, match[i] == NULL. | |
261 | |
262 // Search using NFA: can find submatches but kind of slow. | |
263 bool SearchNFA(const StringPiece& text, const StringPiece& context, | |
264 Anchor anchor, MatchKind kind, | |
265 StringPiece* match, int nmatch); | |
266 | |
267 // Search using DFA: much faster than NFA but only finds | |
268 // end of match and can use a lot more memory. | |
269 // Returns whether a match was found. | |
270 // If the DFA runs out of memory, sets *failed to true and returns false. | |
271 // If matches != NULL and kind == kManyMatch and there is a match, | |
272 // SearchDFA fills matches with the match IDs of the final matching state. | |
273 bool SearchDFA(const StringPiece& text, const StringPiece& context, | |
274 Anchor anchor, MatchKind kind, | |
275 StringPiece* match0, bool* failed, | |
276 vector<int>* matches); | |
277 | |
278 // Build the entire DFA for the given match kind. FOR TESTING ONLY. | |
279 // Usually the DFA is built out incrementally, as needed, which | |
280 // avoids lots of unnecessary work. This function is useful only | |
281 // for testing purposes. Returns number of states. | |
282 int BuildEntireDFA(MatchKind kind); | |
283 | |
284 // Compute byte map. | |
285 void ComputeByteMap(); | |
286 | |
287 // Run peep-hole optimizer on program. | |
288 void Optimize(); | |
289 | |
290 // One-pass NFA: only correct if IsOnePass() is true, | |
291 // but much faster than NFA (competitive with PCRE) | |
292 // for those expressions. | |
293 bool IsOnePass(); | |
294 bool SearchOnePass(const StringPiece& text, const StringPiece& context, | |
295 Anchor anchor, MatchKind kind, | |
296 StringPiece* match, int nmatch); | |
297 | |
298 // Bit-state backtracking. Fast on small cases but uses memory | |
299 // proportional to the product of the program size and the text size. | |
300 bool SearchBitState(const StringPiece& text, const StringPiece& context, | |
301 Anchor anchor, MatchKind kind, | |
302 StringPiece* match, int nmatch); | |
303 | |
304 static const int kMaxOnePassCapture = 5; // $0 through $4 | |
305 | |
306 // Backtracking search: the gold standard against which the other | |
307 // implementations are checked. FOR TESTING ONLY. | |
308 // It allocates a ton of memory to avoid running forever. | |
309 // It is also recursive, so can't use in production (will overflow stacks). | |
310 // The name "Unsafe" here is supposed to be a flag that | |
311 // you should not be using this function. | |
312 bool UnsafeSearchBacktrack(const StringPiece& text, | |
313 const StringPiece& context, | |
314 Anchor anchor, MatchKind kind, | |
315 StringPiece* match, int nmatch); | |
316 | |
317 // Computes range for any strings matching regexp. The min and max can in | |
318 // some cases be arbitrarily precise, so the caller gets to specify the | |
319 // maximum desired length of string returned. | |
320 // | |
321 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any | |
322 // string s that is an anchored match for this regexp satisfies | |
323 // min <= s && s <= max. | |
324 // | |
325 // Note that PossibleMatchRange() will only consider the first copy of an | |
326 // infinitely repeated element (i.e., any regexp element followed by a '*' or | |
327 // '+' operator). Regexps with "{N}" constructions are not affected, as those | |
328 // do not compile down to infinite repetitions. | |
329 // | |
330 // Returns true on success, false on error. | |
331 bool PossibleMatchRange(string* min, string* max, int maxlen); | |
332 | |
333 // EXPERIMENTAL! SUBJECT TO CHANGE! | |
334 // Outputs the program fanout into the given sparse array. | |
335 void Fanout(SparseArray<int>* fanout); | |
336 | |
337 // Compiles a collection of regexps to Prog. Each regexp will have | |
338 // its own Match instruction recording the index in the vector. | |
339 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, | |
340 Regexp* re); | |
341 | |
342 private: | |
343 friend class Compiler; | |
344 | |
345 DFA* GetDFA(MatchKind kind); | |
346 | |
347 bool anchor_start_; // regexp has explicit start anchor | |
348 bool anchor_end_; // regexp has explicit end anchor | |
349 bool reversed_; // whether program runs backward over input | |
350 bool did_onepass_; // has IsOnePass been called? | |
351 | |
352 int start_; // entry point for program | |
353 int start_unanchored_; // unanchored entry point for program | |
354 int size_; // number of instructions | |
355 int byte_inst_count_; // number of kInstByteRange instructions | |
356 int bytemap_range_; // bytemap_[x] < bytemap_range_ | |
357 int flags_; // regexp parse flags | |
358 int onepass_statesize_; // byte size of each OneState* node | |
359 | |
360 Inst* inst_; // pointer to instruction array | |
361 | |
362 Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ | |
363 DFA* volatile dfa_first_; // DFA cached for kFirstMatch | |
364 DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch | |
365 int64 dfa_mem_; // Maximum memory for DFAs. | |
366 void (*delete_dfa_)(DFA* dfa); | |
367 | |
368 Bitmap<256> byterange_; // byterange.Get(x) true if x ends a | |
369 // commonly-treated byte range. | |
370 uint8 bytemap_[256]; // map from input bytes to byte classes | |
371 uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x | |
372 | |
373 uint8* onepass_nodes_; // data for OnePass nodes | |
374 OneState* onepass_start_; // start node for OnePass program | |
375 | |
376 DISALLOW_COPY_AND_ASSIGN(Prog); | |
377 }; | |
378 | |
379 } // namespace re2 | |
380 | |
381 #endif // RE2_PROG_H__ | |
OLD | NEW |