OLD | NEW |
(Empty) | |
| 1 // Copyright 2007 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 // Compiled representation of regular expressions. |
| 6 // See regexp.h for the Regexp class, which represents a regular |
| 7 // expression symbolically. |
| 8 |
| 9 #ifndef RE2_PROG_H__ |
| 10 #define RE2_PROG_H__ |
| 11 |
| 12 #include "util/util.h" |
| 13 #include "re2/re2.h" |
| 14 |
| 15 namespace re2 { |
| 16 |
| 17 // Simple fixed-size bitmap. |
| 18 template<int Bits> |
| 19 class Bitmap { |
| 20 public: |
| 21 Bitmap() { Reset(); } |
| 22 int Size() { return Bits; } |
| 23 |
| 24 void Reset() { |
| 25 for (int i = 0; i < Words; i++) |
| 26 w_[i] = 0; |
| 27 } |
| 28 bool Get(int k) const { |
| 29 return w_[k >> WordLog] & (1<<(k & 31)); |
| 30 } |
| 31 void Set(int k) { |
| 32 w_[k >> WordLog] |= 1<<(k & 31); |
| 33 } |
| 34 void Clear(int k) { |
| 35 w_[k >> WordLog] &= ~(1<<(k & 31)); |
| 36 } |
| 37 uint32 Word(int i) const { |
| 38 return w_[i]; |
| 39 } |
| 40 |
| 41 private: |
| 42 static const int WordLog = 5; |
| 43 static const int Words = (Bits+31)/32; |
| 44 uint32 w_[Words]; |
| 45 DISALLOW_EVIL_CONSTRUCTORS(Bitmap); |
| 46 }; |
| 47 |
| 48 |
| 49 // Opcodes for Inst |
| 50 enum InstOp { |
| 51 kInstAlt = 0, // choose between out_ and out1_ |
| 52 kInstAltMatch, // Alt: out_ is [00-FF] and back, out1_ is match; or vice v
ersa. |
| 53 kInstByteRange, // next (possible case-folded) byte must be in [lo_, hi_] |
| 54 kInstCapture, // capturing parenthesis number cap_ |
| 55 kInstEmptyWidth, // empty-width special (^ $ ...); bit(s) set in empty_ |
| 56 kInstMatch, // found a match! |
| 57 kInstNop, // no-op; occasionally unavoidable |
| 58 kInstFail, // never match; occasionally unavoidable |
| 59 }; |
| 60 |
| 61 // Bit flags for empty-width specials |
| 62 enum EmptyOp { |
| 63 kEmptyBeginLine = 1<<0, // ^ - beginning of line |
| 64 kEmptyEndLine = 1<<1, // $ - end of line |
| 65 kEmptyBeginText = 1<<2, // \A - beginning of text |
| 66 kEmptyEndText = 1<<3, // \z - end of text |
| 67 kEmptyWordBoundary = 1<<4, // \b - word boundary |
| 68 kEmptyNonWordBoundary = 1<<5, // \B - not \b |
| 69 kEmptyAllFlags = (1<<6)-1, |
| 70 }; |
| 71 |
| 72 class Regexp; |
| 73 |
| 74 class DFA; |
| 75 struct OneState; |
| 76 |
| 77 // Compiled form of regexp program. |
| 78 class Prog { |
| 79 public: |
| 80 Prog(); |
| 81 ~Prog(); |
| 82 |
| 83 // Single instruction in regexp program. |
| 84 class Inst { |
| 85 public: |
| 86 Inst() : out_opcode_(0), out1_(0) { } |
| 87 |
| 88 // Constructors per opcode |
| 89 void InitAlt(uint32 out, uint32 out1); |
| 90 void InitByteRange(int lo, int hi, int foldcase, uint32 out); |
| 91 void InitCapture(int cap, uint32 out); |
| 92 void InitEmptyWidth(EmptyOp empty, uint32 out); |
| 93 void InitMatch(int id); |
| 94 void InitNop(uint32 out); |
| 95 void InitFail(); |
| 96 |
| 97 // Getters |
| 98 int id(Prog* p) { return this - p->inst_; } |
| 99 InstOp opcode() { return static_cast<InstOp>(out_opcode_&7); } |
| 100 int out() { return out_opcode_>>3; } |
| 101 int out1() { DCHECK(opcode() == kInstAlt || opcode() == kInstAltMatch); r
eturn out1_; } |
| 102 int cap() { DCHECK_EQ(opcode(), kInstCapture); return cap_; } |
| 103 int lo() { DCHECK_EQ(opcode(), kInstByteRange); return lo_; } |
| 104 int hi() { DCHECK_EQ(opcode(), kInstByteRange); return hi_; } |
| 105 int foldcase() { DCHECK_EQ(opcode(), kInstByteRange); return foldcase_; } |
| 106 int match_id() { DCHECK_EQ(opcode(), kInstMatch); return match_id_; } |
| 107 EmptyOp empty() { DCHECK_EQ(opcode(), kInstEmptyWidth); return empty_; } |
| 108 bool greedy(Prog *p) { |
| 109 DCHECK_EQ(opcode(), kInstAltMatch); |
| 110 return p->inst(out())->opcode() == kInstByteRange; |
| 111 } |
| 112 |
| 113 // Does this inst (an kInstByteRange) match c? |
| 114 inline bool Matches(int c) { |
| 115 DCHECK_EQ(opcode(), kInstByteRange); |
| 116 if (foldcase_ && 'A' <= c && c <= 'Z') |
| 117 c += 'a' - 'A'; |
| 118 return lo_ <= c && c <= hi_; |
| 119 } |
| 120 |
| 121 // Returns string representation for debugging. |
| 122 string Dump(); |
| 123 |
| 124 // Maximum instruction id. |
| 125 // (Must fit in out_opcode_, and PatchList steals another bit.) |
| 126 static const int kMaxInst = (1<<28) - 1; |
| 127 |
| 128 private: |
| 129 void set_opcode(InstOp opcode) { |
| 130 out_opcode_ = (out()<<3) | opcode; |
| 131 } |
| 132 |
| 133 void set_out(int out) { |
| 134 out_opcode_ = (out<<3) | opcode(); |
| 135 } |
| 136 |
| 137 void set_out_opcode(int out, InstOp opcode) { |
| 138 out_opcode_ = (out<<3) | opcode; |
| 139 } |
| 140 |
| 141 uint32 out_opcode_; // 29 bits of out, 3 (low) bits opcode |
| 142 union { // additional instruction arguments: |
| 143 uint32 out1_; // opcode == kInstAlt |
| 144 // alternate next instruction |
| 145 |
| 146 int32 cap_; // opcode == kInstCapture |
| 147 // Index of capture register (holds text |
| 148 // position recorded by capturing parentheses). |
| 149 // For \n (the submatch for the nth parentheses), |
| 150 // the left parenthesis captures into register 2*n |
| 151 // and the right one captures into register 2*n+1. |
| 152 |
| 153 int32 match_id_; // opcode == kInstMatch |
| 154 // Match ID to identify this match (for re2::Set). |
| 155 |
| 156 struct { // opcode == kInstByteRange |
| 157 uint8 lo_; // byte range is lo_-hi_ inclusive |
| 158 uint8 hi_; // |
| 159 uint8 foldcase_; // convert A-Z to a-z before checking range. |
| 160 }; |
| 161 |
| 162 EmptyOp empty_; // opcode == kInstEmptyWidth |
| 163 // empty_ is bitwise OR of kEmpty* flags above. |
| 164 }; |
| 165 |
| 166 friend class Compiler; |
| 167 friend struct PatchList; |
| 168 friend class Prog; |
| 169 |
| 170 DISALLOW_EVIL_CONSTRUCTORS(Inst); |
| 171 }; |
| 172 |
| 173 // Whether to anchor the search. |
| 174 enum Anchor { |
| 175 kUnanchored, // match anywhere |
| 176 kAnchored, // match only starting at beginning of text |
| 177 }; |
| 178 |
| 179 // Kind of match to look for (for anchor != kFullMatch) |
| 180 // |
| 181 // kLongestMatch mode finds the overall longest |
| 182 // match but still makes its submatch choices the way |
| 183 // Perl would, not in the way prescribed by POSIX. |
| 184 // The POSIX rules are much more expensive to implement, |
| 185 // and no one has needed them. |
| 186 // |
| 187 // kFullMatch is not strictly necessary -- we could use |
| 188 // kLongestMatch and then check the length of the match -- but |
| 189 // the matching code can run faster if it knows to consider only |
| 190 // full matches. |
| 191 enum MatchKind { |
| 192 kFirstMatch, // like Perl, PCRE |
| 193 kLongestMatch, // like egrep or POSIX |
| 194 kFullMatch, // match only entire text; implies anchor==kAnchored |
| 195 kManyMatch // for SearchDFA, records set of matches |
| 196 }; |
| 197 |
| 198 Inst *inst(int id) { return &inst_[id]; } |
| 199 int start() { return start_; } |
| 200 int start_unanchored() { return start_unanchored_; } |
| 201 void set_start(int start) { start_ = start; } |
| 202 void set_start_unanchored(int start) { start_unanchored_ = start; } |
| 203 int64 size() { return size_; } |
| 204 bool reversed() { return reversed_; } |
| 205 void set_reversed(bool reversed) { reversed_ = reversed; } |
| 206 int64 byte_inst_count() { return byte_inst_count_; } |
| 207 const Bitmap<256>& byterange() { return byterange_; } |
| 208 void set_dfa_mem(int64 dfa_mem) { dfa_mem_ = dfa_mem; } |
| 209 int64 dfa_mem() { return dfa_mem_; } |
| 210 int flags() { return flags_; } |
| 211 void set_flags(int flags) { flags_ = flags; } |
| 212 bool anchor_start() { return anchor_start_; } |
| 213 void set_anchor_start(bool b) { anchor_start_ = b; } |
| 214 bool anchor_end() { return anchor_end_; } |
| 215 void set_anchor_end(bool b) { anchor_end_ = b; } |
| 216 int bytemap_range() { return bytemap_range_; } |
| 217 const uint8* bytemap() { return bytemap_; } |
| 218 |
| 219 // Returns string representation of program for debugging. |
| 220 string Dump(); |
| 221 string DumpUnanchored(); |
| 222 |
| 223 // Record that at some point in the prog, the bytes in the range |
| 224 // lo-hi (inclusive) are treated as different from bytes outside the range. |
| 225 // Tracking this lets the DFA collapse commonly-treated byte ranges |
| 226 // when recording state pointers, greatly reducing its memory footprint. |
| 227 void MarkByteRange(int lo, int hi); |
| 228 |
| 229 // Returns the set of kEmpty flags that are in effect at |
| 230 // position p within context. |
| 231 static uint32 EmptyFlags(const StringPiece& context, const char* p); |
| 232 |
| 233 // Returns whether byte c is a word character: ASCII only. |
| 234 // Used by the implementation of \b and \B. |
| 235 // This is not right for Unicode, but: |
| 236 // - it's hard to get right in a byte-at-a-time matching world |
| 237 // (the DFA has only one-byte lookahead). |
| 238 // - even if the lookahead were possible, the Progs would be huge. |
| 239 // This crude approximation is the same one PCRE uses. |
| 240 static bool IsWordChar(uint8 c) { |
| 241 return ('A' <= c && c <= 'Z') || |
| 242 ('a' <= c && c <= 'z') || |
| 243 ('0' <= c && c <= '9') || |
| 244 c == '_'; |
| 245 } |
| 246 |
| 247 // Execution engines. They all search for the regexp (run the prog) |
| 248 // in text, which is in the larger context (used for ^ $ \b etc). |
| 249 // Anchor and kind control the kind of search. |
| 250 // Returns true if match found, false if not. |
| 251 // If match found, fills match[0..nmatch-1] with submatch info. |
| 252 // match[0] is overall match, match[1] is first set of parens, etc. |
| 253 // If a particular submatch is not matched during the regexp match, |
| 254 // it is set to NULL. |
| 255 // |
| 256 // Matching text == StringPiece(NULL, 0) is treated as any other empty |
| 257 // string, but note that on return, it will not be possible to distinguish |
| 258 // submatches that matched that empty string from submatches that didn't |
| 259 // match anything. Either way, match[i] == NULL. |
| 260 |
| 261 // Search using NFA: can find submatches but kind of slow. |
| 262 bool SearchNFA(const StringPiece& text, const StringPiece& context, |
| 263 Anchor anchor, MatchKind kind, |
| 264 StringPiece* match, int nmatch); |
| 265 |
| 266 // Search using DFA: much faster than NFA but only finds |
| 267 // end of match and can use a lot more memory. |
| 268 // Returns whether a match was found. |
| 269 // If the DFA runs out of memory, sets *failed to true and returns false. |
| 270 // If matches != NULL and kind == kManyMatch and there is a match, |
| 271 // SearchDFA fills matches with the match IDs of the final matching state. |
| 272 bool SearchDFA(const StringPiece& text, const StringPiece& context, |
| 273 Anchor anchor, MatchKind kind, |
| 274 StringPiece* match0, bool* failed, |
| 275 vector<int>* matches); |
| 276 |
| 277 // Build the entire DFA for the given match kind. FOR TESTING ONLY. |
| 278 // Usually the DFA is built out incrementally, as needed, which |
| 279 // avoids lots of unnecessary work. This function is useful only |
| 280 // for testing purposes. Returns number of states. |
| 281 int BuildEntireDFA(MatchKind kind); |
| 282 |
| 283 // Compute byte map. |
| 284 void ComputeByteMap(); |
| 285 |
| 286 // Run peep-hole optimizer on program. |
| 287 void Optimize(); |
| 288 |
| 289 // One-pass NFA: only correct if IsOnePass() is true, |
| 290 // but much faster than NFA (competitive with PCRE) |
| 291 // for those expressions. |
| 292 bool IsOnePass(); |
| 293 bool SearchOnePass(const StringPiece& text, const StringPiece& context, |
| 294 Anchor anchor, MatchKind kind, |
| 295 StringPiece* match, int nmatch); |
| 296 |
| 297 // Bit-state backtracking. Fast on small cases but uses memory |
| 298 // proportional to the product of the program size and the text size. |
| 299 bool SearchBitState(const StringPiece& text, const StringPiece& context, |
| 300 Anchor anchor, MatchKind kind, |
| 301 StringPiece* match, int nmatch); |
| 302 |
| 303 static const int kMaxOnePassCapture = 5; // $0 through $4 |
| 304 |
| 305 // Backtracking search: the gold standard against which the other |
| 306 // implementations are checked. FOR TESTING ONLY. |
| 307 // It allocates a ton of memory to avoid running forever. |
| 308 // It is also recursive, so can't use in production (will overflow stacks). |
| 309 // The name "Unsafe" here is supposed to be a flag that |
| 310 // you should not be using this function. |
| 311 bool UnsafeSearchBacktrack(const StringPiece& text, |
| 312 const StringPiece& context, |
| 313 Anchor anchor, MatchKind kind, |
| 314 StringPiece* match, int nmatch); |
| 315 |
| 316 // Computes range for any strings matching regexp. The min and max can in |
| 317 // some cases be arbitrarily precise, so the caller gets to specify the |
| 318 // maximum desired length of string returned. |
| 319 // |
| 320 // Assuming PossibleMatchRange(&min, &max, N) returns successfully, any |
| 321 // string s that is an anchored match for this regexp satisfies |
| 322 // min <= s && s <= max. |
| 323 // |
| 324 // Note that PossibleMatchRange() will only consider the first copy of an |
| 325 // infinitely repeated element (i.e., any regexp element followed by a '*' or |
| 326 // '+' operator). Regexps with "{N}" constructions are not affected, as those |
| 327 // do not compile down to infinite repetitions. |
| 328 // |
| 329 // Returns true on success, false on error. |
| 330 bool PossibleMatchRange(string* min, string* max, int maxlen); |
| 331 |
| 332 // Compiles a collection of regexps to Prog. Each regexp will have |
| 333 // its own Match instruction recording the index in the vector. |
| 334 static Prog* CompileSet(const RE2::Options& options, RE2::Anchor anchor, |
| 335 Regexp* re); |
| 336 |
| 337 private: |
| 338 friend class Compiler; |
| 339 |
| 340 DFA* GetDFA(MatchKind kind); |
| 341 |
| 342 bool anchor_start_; // regexp has explicit start anchor |
| 343 bool anchor_end_; // regexp has explicit end anchor |
| 344 bool reversed_; // whether program runs backward over input |
| 345 bool did_onepass_; // has IsOnePass been called? |
| 346 |
| 347 int start_; // entry point for program |
| 348 int start_unanchored_; // unanchored entry point for program |
| 349 int size_; // number of instructions |
| 350 int byte_inst_count_; // number of kInstByteRange instructions |
| 351 int bytemap_range_; // bytemap_[x] < bytemap_range_ |
| 352 int flags_; // regexp parse flags |
| 353 int onepass_statesize_; // byte size of each OneState* node |
| 354 |
| 355 Inst* inst_; // pointer to instruction array |
| 356 |
| 357 Mutex dfa_mutex_; // Protects dfa_first_, dfa_longest_ |
| 358 DFA* volatile dfa_first_; // DFA cached for kFirstMatch |
| 359 DFA* volatile dfa_longest_; // DFA cached for kLongestMatch and kFullMatch |
| 360 int64 dfa_mem_; // Maximum memory for DFAs. |
| 361 void (*delete_dfa_)(DFA* dfa); |
| 362 |
| 363 Bitmap<256> byterange_; // byterange.Get(x) true if x ends a |
| 364 // commonly-treated byte range. |
| 365 uint8 bytemap_[256]; // map from input bytes to byte classes |
| 366 uint8 *unbytemap_; // bytemap_[unbytemap_[x]] == x |
| 367 |
| 368 uint8* onepass_nodes_; // data for OnePass nodes |
| 369 OneState* onepass_start_; // start node for OnePass program |
| 370 |
| 371 DISALLOW_EVIL_CONSTRUCTORS(Prog); |
| 372 }; |
| 373 |
| 374 } // namespace re2 |
| 375 |
| 376 #endif // RE2_PROG_H__ |
OLD | NEW |