Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: third_party/re2/re2/prog.h

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/prefilter_tree.cc ('k') | third_party/re2/re2/prog.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Compiled representation of regular expressions.
6 // See regexp.h for the Regexp class, which represents a regular
7 // expression symbolically.
8
9 #ifndef RE2_PROG_H__
10 #define RE2_PROG_H__
11
12 #include "util/util.h"
13 #include "util/sparse_array.h"
14 #include "re2/re2.h"
15
16 namespace re2 {
17
18 // Simple fixed-size bitmap.
19 template<int Bits>
20 class Bitmap {
21 public:
22 Bitmap() { Reset(); }
23 int Size() { return Bits; }
24
25 void Reset() {
26 for (int i = 0; i < Words; i++)
27 w_[i] = 0;
28 }
29 bool Get(int k) const {
30 return w_[k >> WordLog] & (1<<(k & 31));
31 }
32 void Set(int k) {
33 w_[k >> WordLog] |= 1<<(k & 31);
34 }
35 void Clear(int k) {
36 w_[k >> WordLog] &= ~(1<<(k & 31));
37 }
38 uint32 Word(int i) const {
39 return w_[i];
40 }
41
42 private:
43 static const int WordLog = 5;
44 static const int Words = (Bits+31)/32;
45 uint32 w_[Words];
46 DISALLOW_COPY_AND_ASSIGN(Bitmap);
47 };
48
49
50 // Opcodes for Inst
51 enum InstOp {
52 kInstAlt = 0, // choose between out_ and out1_
53 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice v ersa.
54 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_]
55 kInstCapture, // capturing parenthesis number cap_
56 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_
57 kInstMatch, // found a match!
58 kInstNop, // no-op; occasionally unavoidable
59 kInstFail, // never match; occasionally unavoidable
60 };
61
62 // Bit flags for empty-width specials
63 enum EmptyOp {
64 kEmptyBeginLine = 1<<0, // ^ - beginning of line
65 kEmptyEndLine = 1<<1, // $ - end of line
66 kEmptyBeginText = 1<<2, // \A - beginning of text
67 kEmptyEndText = 1<<3, // \z - end of text
68 kEmptyWordBoundary = 1<<4, // \b - word boundary
69 kEmptyNonWordBoundary = 1<<5, // \B - not \b
70 kEmptyAllFlags = (1<<6)-1,
71 };
72
73 class Regexp;
74
75 class DFA;
76 struct OneState;
77
78 // Compiled form of regexp program.
79 class Prog {
80 public:
81 Prog();
82 ~Prog();
83
84 // Single instruction in regexp program.
85 class Inst {
86 public:
87 Inst() : out_opcode_(0), out1_(0) { }
88
89 // Constructors per opcode
90 void InitAlt(uint32 out, uint32 out1);
91 void InitByteRange(int lo, int hi, int foldcase, uint32 out);
92 void InitCapture(int cap, uint32 out);
93 void InitEmptyWidth(EmptyOp empty, uint32 out);
94 void InitMatch(int id);
95 void InitNop(uint32 out);
96 void InitFail();
97
98 // Getters
99 int id(Prog* p) { return static_cast<int>(this - p->inst_); }
100 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); }
101 int out() { return out_opcode_>>3; }
102 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); r eturn out1_; }
103 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; }
104 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; }
105 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; }
106 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; }
107 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; }
108 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; }
109 bool greedy(Prog *p) {
110 DCHECK_EQ(opcode(), kInstAltMatch);
111 return p->inst(out())->opcode() == kInstByteRange;
112 }
113
114 // Does this inst (an kInstByteRange) match c?
115 inline bool Matches(int c) {
116 DCHECK_EQ(opcode(), kInstByteRange);
117 if (foldcase_ && 'A' <= c && c <= 'Z')
118 c += 'a' - 'A';
119 return lo_ <= c && c <= hi_;
120 }
121
122 // Returns string representation for debugging.
123 string Dump();
124
125 // Maximum instruction id.
126 // (Must fit in out_opcode_, and PatchList steals another bit.)
127 static const int kMaxInst = (1<<28) - 1;
128
129 private:
130 void set_opcode(InstOp opcode) {
131 out_opcode_ = (out()<<3) | opcode;
132 }
133
134 void set_out(int out) {
135 out_opcode_ = (out<<3) | opcode();
136 }
137
138 void set_out_opcode(int out, InstOp opcode) {
139 out_opcode_ = (out<<3) | opcode;
140 }
141
142 uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode
143 union { // additional instruction arguments:
144 uint32 out1_; // opcode == kInstAlt
145 // alternate next instruction
146
147 int32 cap_; // opcode == kInstCapture
148 // Index of capture register (holds text
149 // position recorded by capturing parentheses).
150 // For \n (the submatch for the nth parentheses),
151 // the left parenthesis captures into register 2*n
152 // and the right one captures into register 2*n+1.
153
154 int32 match_id_; // opcode == kInstMatch
155 // Match ID to identify this match (for re2::Set).
156
157 struct { // opcode == kInstByteRange
158 uint8 lo_; // byte range is lo_-hi_ inclusive
159 uint8 hi_; //
160 uint8 foldcase_; // convert A-Z to a-z before checking range.
161 };
162
163 EmptyOp empty_; // opcode == kInstEmptyWidth
164 // empty_ is bitwise OR of kEmpty* flags above.
165 };
166
167 friend class Compiler;
168 friend struct PatchList;
169 friend class Prog;
170
171 DISALLOW_COPY_AND_ASSIGN(Inst);
172 };
173
174 // Whether to anchor the search.
175 enum Anchor {
176 kUnanchored, // match anywhere
177 kAnchored, // match only starting at beginning of text
178 };
179
180 // Kind of match to look for (for anchor != kFullMatch)
181 //
182 // kLongestMatch mode finds the overall longest
183 // match but still makes its submatch choices the way
184 // Perl would, not in the way prescribed by POSIX.
185 // The POSIX rules are much more expensive to implement,
186 // and no one has needed them.
187 //
188 // kFullMatch is not strictly necessary -- we could use
189 // kLongestMatch and then check the length of the match -- but
190 // the matching code can run faster if it knows to consider only
191 // full matches.
192 enum MatchKind {
193 kFirstMatch, // like Perl, PCRE
194 kLongestMatch, // like egrep or POSIX
195 kFullMatch, // match only entire text; implies anchor==kAnchored
196 kManyMatch // for SearchDFA, records set of matches
197 };
198
199 Inst *inst(int id) { return &inst_[id]; }
200 int start() { return start_; }
201 int start_unanchored() { return start_unanchored_; }
202 void set_start(int start) { start_ = start; }
203 void set_start_unanchored(int start) { start_unanchored_ = start; }
204 int size() { return size_; }
205 bool reversed() { return reversed_; }
206 void set_reversed(bool reversed) { reversed_ = reversed; }
207 int byte_inst_count() { return byte_inst_count_; }
208 const Bitmap<256>& byterange() { return byterange_; }
209 void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; }
210 int64 dfa_mem() { return dfa_mem_; }
211 int flags() { return flags_; }
212 void set_flags(int flags) { flags_ = flags; }
213 bool anchor_start() { return anchor_start_; }
214 void set_anchor_start(bool b) { anchor_start_ = b; }
215 bool anchor_end() { return anchor_end_; }
216 void set_anchor_end(bool b) { anchor_end_ = b; }
217 int bytemap_range() { return bytemap_range_; }
218 const uint8* bytemap() { return bytemap_; }
219
220 // Returns string representation of program for debugging.
221 string Dump();
222 string DumpUnanchored();
223
224 // Record that at some point in the prog, the bytes in the range
225 // lo-hi (inclusive) are treated as different from bytes outside the range.
226 // Tracking this lets the DFA collapse commonly-treated byte ranges
227 // when recording state pointers, greatly reducing its memory footprint.
228 void MarkByteRange(int lo, int hi);
229
230 // Returns the set of kEmpty flags that are in effect at
231 // position p within context.
232 static uint32 EmptyFlags(const StringPiece& context, const char* p);
233
234 // Returns whether byte c is a word character: ASCII only.
235 // Used by the implementation of \b and \B.
236 // This is not right for Unicode, but:
237 // - it's hard to get right in a byte-at-a-time matching world
238 // (the DFA has only one-byte lookahead).
239 // - even if the lookahead were possible, the Progs would be huge.
240 // This crude approximation is the same one PCRE uses.
241 static bool IsWordChar(uint8 c) {
242 return ('A' <= c && c <= 'Z') ||
243 ('a' <= c && c <= 'z') ||
244 ('0' <= c && c <= '9') ||
245 c == '_';
246 }
247
248 // Execution engines. They all search for the regexp (run the prog)
249 // in text, which is in the larger context (used for ^ $ \b etc).
250 // Anchor and kind control the kind of search.
251 // Returns true if match found, false if not.
252 // If match found, fills match[0..nmatch-1] with submatch info.
253 // match[0] is overall match, match[1] is first set of parens, etc.
254 // If a particular submatch is not matched during the regexp match,
255 // it is set to NULL.
256 //
257 // Matching text == StringPiece(NULL, 0) is treated as any other empty
258 // string, but note that on return, it will not be possible to distinguish
259 // submatches that matched that empty string from submatches that didn't
260 // match anything. Either way, match[i] == NULL.
261
262 // Search using NFA: can find submatches but kind of slow.
263 bool SearchNFA(const StringPiece& text, const StringPiece& context,
264 Anchor anchor, MatchKind kind,
265 StringPiece* match, int nmatch);
266
267 // Search using DFA: much faster than NFA but only finds
268 // end of match and can use a lot more memory.
269 // Returns whether a match was found.
270 // If the DFA runs out of memory, sets *failed to true and returns false.
271 // If matches != NULL and kind == kManyMatch and there is a match,
272 // SearchDFA fills matches with the match IDs of the final matching state.
273 bool SearchDFA(const StringPiece& text, const StringPiece& context,
274 Anchor anchor, MatchKind kind,
275 StringPiece* match0, bool* failed,
276 vector<int>* matches);
277
278 // Build the entire DFA for the given match kind. FOR TESTING ONLY.
279 // Usually the DFA is built out incrementally, as needed, which
280 // avoids lots of unnecessary work. This function is useful only
281 // for testing purposes. Returns number of states.
282 int BuildEntireDFA(MatchKind kind);
283
284 // Compute byte map.
285 void ComputeByteMap();
286
287 // Run peep-hole optimizer on program.
288 void Optimize();
289
290 // One-pass NFA: only correct if IsOnePass() is true,
291 // but much faster than NFA (competitive with PCRE)
292 // for those expressions.
293 bool IsOnePass();
294 bool SearchOnePass(const StringPiece& text, const StringPiece& context,
295 Anchor anchor, MatchKind kind,
296 StringPiece* match, int nmatch);
297
298 // Bit-state backtracking. Fast on small cases but uses memory
299 // proportional to the product of the program size and the text size.
300 bool SearchBitState(const StringPiece& text, const StringPiece& context,
301 Anchor anchor, MatchKind kind,
302 StringPiece* match, int nmatch);
303
304 static const int kMaxOnePassCapture = 5; // $0 through $4
305
306 // Backtracking search: the gold standard against which the other
307 // implementations are checked. FOR TESTING ONLY.
308 // It allocates a ton of memory to avoid running forever.
309 // It is also recursive, so can't use in production (will overflow stacks).
310 // The name "Unsafe" here is supposed to be a flag that
311 // you should not be using this function.
312 bool UnsafeSearchBacktrack(const StringPiece& text,
313 const StringPiece& context,
314 Anchor anchor, MatchKind kind,
315 StringPiece* match, int nmatch);
316
317 // Computes range for any strings matching regexp. The min and max can in
318 // some cases be arbitrarily precise, so the caller gets to specify the
319 // maximum desired length of string returned.
320 //
321 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any
322 // string s that is an anchored match for this regexp satisfies
323 // min <= s && s <= max.
324 //
325 // Note that PossibleMatchRange() will only consider the first copy of an
326 // infinitely repeated element (i.e., any regexp element followed by a '*' or
327 // '+' operator). Regexps with "{N}" constructions are not affected, as those
328 // do not compile down to infinite repetitions.
329 //
330 // Returns true on success, false on error.
331 bool PossibleMatchRange(string* min, string* max, int maxlen);
332
333 // EXPERIMENTAL! SUBJECT TO CHANGE!
334 // Outputs the program fanout into the given sparse array.
335 void Fanout(SparseArray<int>* fanout);
336
337 // Compiles a collection of regexps to Prog. Each regexp will have
338 // its own Match instruction recording the index in the vector.
339 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor,
340 Regexp* re);
341
342 private:
343 friend class Compiler;
344
345 DFA* GetDFA(MatchKind kind);
346
347 bool anchor_start_; // regexp has explicit start anchor
348 bool anchor_end_; // regexp has explicit end anchor
349 bool reversed_; // whether program runs backward over input
350 bool did_onepass_; // has IsOnePass been called?
351
352 int start_; // entry point for program
353 int start_unanchored_; // unanchored entry point for program
354 int size_; // number of instructions
355 int byte_inst_count_; // number of kInstByteRange instructions
356 int bytemap_range_; // bytemap_[x] < bytemap_range_
357 int flags_; // regexp parse flags
358 int onepass_statesize_; // byte size of each OneState* node
359
360 Inst* inst_; // pointer to instruction array
361
362 Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_
363 DFA* volatile dfa_first_; // DFA cached for kFirstMatch
364 DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch
365 int64 dfa_mem_; // Maximum memory for DFAs.
366 void (*delete_dfa_)(DFA* dfa);
367
368 Bitmap<256> byterange_; // byterange.Get(x) true if x ends a
369 // commonly-treated byte range.
370 uint8 bytemap_[256]; // map from input bytes to byte classes
371 uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x
372
373 uint8* onepass_nodes_; // data for OnePass nodes
374 OneState* onepass_start_; // start node for OnePass program
375
376 DISALLOW_COPY_AND_ASSIGN(Prog);
377 };
378
379 } // namespace re2
380
381 #endif // RE2_PROG_H__
OLDNEW
« no previous file with comments | « third_party/re2/re2/prefilter_tree.cc ('k') | third_party/re2/re2/prog.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698