Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(235)

Side by Side Diff: third_party/re2/re2/prog.h

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src
Patch Set: Removed valgrind specific code that is in Chromium already Created 8 years, 5 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch | Annotate | Revision Log
OLDNEW
(Empty)
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compiled representation of regular expressions.
6 // See regexp.h for the Regexp class, which represents a regular
7 // expression symbolically.
8
9 #ifndef RE2_PROG_H__
10 #define RE2_PROG_H__
11
12 #include "util/util.h"
13 #include "re2/re2.h"
14
15 namespace re2 {
16
17 // Simple fixed-size bitmap.
18 template<int Bits>
19 class Bitmap {
20 public:
21 Bitmap() { Reset(); }
22 int Size() { return Bits; }
23
24 void Reset() {
25 for (int i = 0; i < Words; i++)
26 w_[i] = 0;
27 }
28 bool Get(int k) const {
29 return w_[k >> WordLog] & (1<<(k & 31));
30 }
31 void Set(int k) {
32 w_[k >> WordLog] |= 1<<(k & 31);
33 }
34 void Clear(int k) {
35 w_[k >> WordLog] &= ~(1<<(k & 31));
36 }
37 uint32 Word(int i) const {
38 return w_[i];
39 }
40
41 private:
42 static const int WordLog = 5;
43 static const int Words = (Bits+31)/32;
44 uint32 w_[Words];
45 DISALLOW_EVIL_CONSTRUCTORS(Bitmap);
46 };
47
48
49 // Opcodes for Inst
50 enum InstOp {
51 kInstAlt = 0, // choose between out_ and out1_
52 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice v ersa.
53 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
54 kInstCapture, // capturing parenthesis number cap_
55 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
56 kInstMatch, // found a match!
57 kInstNop, // no-op; occasionally unavoidable
58 kInstFail, // never match; occasionally unavoidable
59 };
60
61 // Bit flags for empty-width specials
62 enum EmptyOp {
63 kEmptyBeginLine = 1<<0, // ^ - beginning of line
64 kEmptyEndLine = 1<<1, // $ - end of line
65 kEmptyBeginText = 1<<2, // \A - beginning of text
66 kEmptyEndText = 1<<3, // \z - end of text
67 kEmptyWordBoundary = 1<<4, // \b - word boundary
68 kEmptyNonWordBoundary = 1<<5, // \B - not \b
69 kEmptyAllFlags = (1<<6)-1,
70 };
71
72 class Regexp;
73
74 class DFA;
75 struct OneState;
76
77 // Compiled form of regexp program.
78 class Prog {
79 public:
80 Prog();
81 ~Prog();
82
83 // Single instruction in regexp program.
84 class Inst {
85 public:
86 Inst() : out_opcode_(0), out1_(0) { }
87
88 // Constructors per opcode
89 void InitAlt(uint32 out, uint32 out1);
90 void InitByteRange(int lo, int hi, int foldcase, uint32 out);
91 void InitCapture(int cap, uint32 out);
92 void InitEmptyWidth(EmptyOp empty, uint32 out);
93 void InitMatch(int id);
94 void InitNop(uint32 out);
95 void InitFail();
96
97 // Getters
98 int id(Prog* p) { return this - p->inst_; }
99 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
100 int out() { return out_opcode_>>3; }
101 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); r eturn out1_; }
102 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
103 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
104 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
105 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
106 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
107 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
108 bool greedy(Prog *p) {
109 DCHECK_EQ(opcode(), kInstAltMatch);
110 return p->inst(out())->opcode() == kInstByteRange;
111 }
112
113 // Does this inst (an kInstByteRange) match c?
114 inline bool Matches(int c) {
115 DCHECK_EQ(opcode(), kInstByteRange);
116 if (foldcase_ && 'A' <= c && c <= 'Z')
117 c += 'a' - 'A';
118 return lo_ <= c && c <= hi_;
119 }
120
121 // Returns string representation for debugging.
122 string Dump();
123
124 // Maximum instruction id.
125 // (Must fit in out_opcode_, and PatchList steals another bit.)
126 static const int kMaxInst = (1<<28) - 1;
127
128 private:
129 void set_opcode(InstOp opcode) {
130 out_opcode_ = (out()<<3) | opcode;
131 }
132
133 void set_out(int out) {
134 out_opcode_ = (out<<3) | opcode();
135 }
136
137 void set_out_opcode(int out, InstOp opcode) {
138 out_opcode_ = (out<<3) | opcode;
139 }
140
141 uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
142 union { // additional instruction arguments:
143 uint32 out1_; // opcode == kInstAlt
144 // alternate next instruction
145
146 int32 cap_; // opcode == kInstCapture
147 // Index of capture register (holds text
148 // position recorded by capturing parentheses).
149 // For \n (the submatch for the nth parentheses),
150 // the left parenthesis captures into register 2*n
151 // and the right one captures into register 2*n+1.
152
153 int32 match_id_; // opcode == kInstMatch
154 // Match ID to identify this match (for re2::Set).
155
156 struct { // opcode == kInstByteRange
157 uint8 lo_; // byte range is lo_-hi_ inclusive
158 uint8 hi_; //
159 uint8 foldcase_; // convert A-Z to a-z before checking range.
160 };
161
162 EmptyOp empty_; // opcode == kInstEmptyWidth
163 // empty_ is bitwise OR of kEmpty* flags above.
164 };
165
166 friend class Compiler;
167 friend struct PatchList;
168 friend class Prog;
169
170 DISALLOW_EVIL_CONSTRUCTORS(Inst);
171 };
172
173 // Whether to anchor the search.
174 enum Anchor {
175 kUnanchored, // match anywhere
176 kAnchored, // match only starting at beginning of text
177 };
178
179 // Kind of match to look for (for anchor != kFullMatch)
180 //
181 // kLongestMatch mode finds the overall longest
182 // match but still makes its submatch choices the way
183 // Perl would, not in the way prescribed by POSIX.
184 // The POSIX rules are much more expensive to implement,
185 // and no one has needed them.
186 //
187 // kFullMatch is not strictly necessary -- we could use
188 // kLongestMatch and then check the length of the match -- but
189 // the matching code can run faster if it knows to consider only
190 // full matches.
191 enum MatchKind {
192 kFirstMatch, // like Perl, PCRE
193 kLongestMatch, // like egrep or POSIX
194 kFullMatch, // match only entire text; implies anchor==kAnchored
195 kManyMatch // for SearchDFA, records set of matches
196 };
197
198 Inst *inst(int id) { return &inst_[id]; }
199 int start() { return start_; }
200 int start_unanchored() { return start_unanchored_; }
201 void set_start(int start) { start_ = start; }
202 void set_start_unanchored(int start) { start_unanchored_ = start; }
203 int64 size() { return size_; }
204 bool reversed() { return reversed_; }
205 void set_reversed(bool reversed) { reversed_ = reversed; }
206 int64 byte_inst_count() { return byte_inst_count_; }
207 const Bitmap<256>& byterange() { return byterange_; }
208 void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
209 int64 dfa_mem() { return dfa_mem_; }
210 int flags() { return flags_; }
211 void set_flags(int flags) { flags_ = flags; }
212 bool anchor_start() { return anchor_start_; }
213 void set_anchor_start(bool b) { anchor_start_ = b; }
214 bool anchor_end() { return anchor_end_; }
215 void set_anchor_end(bool b) { anchor_end_ = b; }
216 int bytemap_range() { return bytemap_range_; }
217 const uint8* bytemap() { return bytemap_; }
218
219 // Returns string representation of program for debugging.
220 string Dump();
221 string DumpUnanchored();
222
223 // Record that at some point in the prog, the bytes in the range
224 // lo-hi (inclusive) are treated as different from bytes outside the range.
225 // Tracking this lets the DFA collapse commonly-treated byte ranges
226 // when recording state pointers, greatly reducing its memory footprint.
227 void MarkByteRange(int lo, int hi);
228
229 // Returns the set of kEmpty flags that are in effect at
230 // position p within context.
231 static uint32 EmptyFlags(const StringPiece& context, const char* p);
232
233 // Returns whether byte c is a word character: ASCII only.
234 // Used by the implementation of \b and \B.
235 // This is not right for Unicode, but:
236 // - it's hard to get right in a byte-at-a-time matching world
237 // (the DFA has only one-byte lookahead).
238 // - even if the lookahead were possible, the Progs would be huge.
239 // This crude approximation is the same one PCRE uses.
240 static bool IsWordChar(uint8 c) {
241 return ('A' <= c && c <= 'Z') ||
242 ('a' <= c && c <= 'z') ||
243 ('0' <= c && c <= '9') ||
244 c == '_';
245 }
246
247 // Execution engines. They all search for the regexp (run the prog)
248 // in text, which is in the larger context (used for ^ $ \b etc).
249 // Anchor and kind control the kind of search.
250 // Returns true if match found, false if not.
251 // If match found, fills match[0..nmatch-1] with submatch info.
252 // match[0] is overall match, match[1] is first set of parens, etc.
253 // If a particular submatch is not matched during the regexp match,
254 // it is set to NULL.
255 //
256 // Matching text == StringPiece(NULL, 0) is treated as any other empty
257 // string, but note that on return, it will not be possible to distinguish
258 // submatches that matched that empty string from submatches that didn't
259 // match anything. Either way, match[i] == NULL.
260
261 // Search using NFA: can find submatches but kind of slow.
262 bool SearchNFA(const StringPiece& text, const StringPiece& context,
263 Anchor anchor, MatchKind kind,
264 StringPiece* match, int nmatch);
265
266 // Search using DFA: much faster than NFA but only finds
267 // end of match and can use a lot more memory.
268 // Returns whether a match was found.
269 // If the DFA runs out of memory, sets *failed to true and returns false.
270 // If matches != NULL and kind == kManyMatch and there is a match,
271 // SearchDFA fills matches with the match IDs of the final matching state.
272 bool SearchDFA(const StringPiece& text, const StringPiece& context,
273 Anchor anchor, MatchKind kind,
274 StringPiece* match0, bool* failed,
275 vector<int>* matches);
276
277 // Build the entire DFA for the given match kind. FOR TESTING ONLY.
278 // Usually the DFA is built out incrementally, as needed, which
279 // avoids lots of unnecessary work. This function is useful only
280 // for testing purposes. Returns number of states.
281 int BuildEntireDFA(MatchKind kind);
282
283 // Compute byte map.
284 void ComputeByteMap();
285
286 // Run peep-hole optimizer on program.
287 void Optimize();
288
289 // One-pass NFA: only correct if IsOnePass() is true,
290 // but much faster than NFA (competitive with PCRE)
291 // for those expressions.
292 bool IsOnePass();
293 bool SearchOnePass(const StringPiece& text, const StringPiece& context,
294 Anchor anchor, MatchKind kind,
295 StringPiece* match, int nmatch);
296
297 // Bit-state backtracking. Fast on small cases but uses memory
298 // proportional to the product of the program size and the text size.
299 bool SearchBitState(const StringPiece& text, const StringPiece& context,
300 Anchor anchor, MatchKind kind,
301 StringPiece* match, int nmatch);
302
303 static const int kMaxOnePassCapture = 5; // $0 through $4
304
305 // Backtracking search: the gold standard against which the other
306 // implementations are checked. FOR TESTING ONLY.
307 // It allocates a ton of memory to avoid running forever.
308 // It is also recursive, so can't use in production (will overflow stacks).
309 // The name "Unsafe" here is supposed to be a flag that
310 // you should not be using this function.
311 bool UnsafeSearchBacktrack(const StringPiece& text,
312 const StringPiece& context,
313 Anchor anchor, MatchKind kind,
314 StringPiece* match, int nmatch);
315
316 // Computes range for any strings matching regexp. The min and max can in
317 // some cases be arbitrarily precise, so the caller gets to specify the
318 // maximum desired length of string returned.
319 //
320 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
321 // string s that is an anchored match for this regexp satisfies
322 // min <= s && s <= max.
323 //
324 // Note that PossibleMatchRange() will only consider the first copy of an
325 // infinitely repeated element (i.e., any regexp element followed by a '*' or
326 // '+' operator). Regexps with "{N}" constructions are not affected, as those
327 // do not compile down to infinite repetitions.
328 //
329 // Returns true on success, false on error.
330 bool PossibleMatchRange(string* min, string* max, int maxlen);
331
332 // Compiles a collection of regexps to Prog. Each regexp will have
333 // its own Match instruction recording the index in the vector.
334 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
335 Regexp* re);
336
337 private:
338 friend class Compiler;
339
340 DFA* GetDFA(MatchKind kind);
341
342 bool anchor_start_; // regexp has explicit start anchor
343 bool anchor_end_; // regexp has explicit end anchor
344 bool reversed_; // whether program runs backward over input
345 bool did_onepass_; // has IsOnePass been called?
346
347 int start_; // entry point for program
348 int start_unanchored_; // unanchored entry point for program
349 int size_; // number of instructions
350 int byte_inst_count_; // number of kInstByteRange instructions
351 int bytemap_range_; // bytemap_[x] < bytemap_range_
352 int flags_; // regexp parse flags
353 int onepass_statesize_; // byte size of each OneState* node
354
355 Inst* inst_; // pointer to instruction array
356
357 Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
358 DFA* volatile dfa_first_; // DFA cached for kFirstMatch
359 DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
360 int64 dfa_mem_; // Maximum memory for DFAs.
361 void (*delete_dfa_)(DFA* dfa);
362
363 Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
364 // commonly-treated byte range.
365 uint8 bytemap_[256]; // map from input bytes to byte classes
366 uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
367
368 uint8* onepass_nodes_; // data for OnePass nodes
369 OneState* onepass_start_; // start node for OnePass program
370
371 DISALLOW_EVIL_CONSTRUCTORS(Prog);
372 };
373
374 } // namespace re2
375
376 #endif // RE2_PROG_H__
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698