| OLD | NEW |
| (Empty) |
| 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc | |
| 6 // | |
| 7 // Prog::BadSearchBacktrack is a backtracking regular expression search, | |
| 8 // except that it remembers where it has been, trading a lot of | |
| 9 // memory for a lot of time. It exists only for testing purposes. | |
| 10 // | |
| 11 // Let me repeat that. | |
| 12 // | |
| 13 // THIS CODE SHOULD NEVER BE USED IN PRODUCTION: | |
| 14 // - It uses a ton of memory. | |
| 15 // - It uses a ton of stack. | |
| 16 // - It uses CHECK and LOG(FATAL). | |
| 17 // - It implements unanchored search by repeated anchored search. | |
| 18 // | |
| 19 // On the other hand, it is very simple and a good reference | |
| 20 // implementation for the more complicated regexp packages. | |
| 21 // | |
| 22 // In BUILD, this file is linked into the ":testing" library, | |
| 23 // not the main library, in order to make it harder to pick up | |
| 24 // accidentally. | |
| 25 | |
| 26 #include "util/util.h" | |
| 27 #include "re2/prog.h" | |
| 28 #include "re2/regexp.h" | |
| 29 | |
| 30 namespace re2 { | |
| 31 | |
| 32 // Backtracker holds the state for a backtracking search. | |
| 33 // | |
| 34 // Excluding the search parameters, the main search state | |
| 35 // is just the "capture registers", which record, for the | |
| 36 // current execution, the string position at which each | |
| 37 // parenthesis was passed. cap_[0] and cap_[1] are the | |
| 38 // left and right parenthesis in $0, cap_[2] and cap_[3] in $1, etc. | |
| 39 // | |
| 40 // To avoid infinite loops during backtracking on expressions | |
| 41 // like (a*)*, the visited_[] bitmap marks the (state, string-position) | |
| 42 // pairs that have already been explored and are thus not worth | |
| 43 // re-exploring if we get there via another path. Modern backtracking | |
| 44 // libraries engineer their program representation differently, to make | |
| 45 // such infinite loops possible to avoid without keeping a giant visited_ | |
| 46 // bitmap, but visited_ works fine for a reference implementation | |
| 47 // and it has the nice benefit of making the search run in linear time. | |
| 48 class Backtracker { | |
| 49 public: | |
| 50 explicit Backtracker(Prog* prog); | |
| 51 ~Backtracker(); | |
| 52 | |
| 53 bool Search(const StringPiece& text, const StringPiece& context, | |
| 54 bool anchored, bool longest, | |
| 55 StringPiece* submatch, int nsubmatch); | |
| 56 | |
| 57 private: | |
| 58 // Explores from instruction ip at string position p looking for a match. | |
| 59 // Returns true if found (so that caller can stop trying other possibilities). | |
| 60 bool Visit(int id, const char* p); | |
| 61 | |
| 62 // Search parameters | |
| 63 Prog* prog_; // program being run | |
| 64 StringPiece text_; // text being searched | |
| 65 StringPiece context_; // greater context of text being searched | |
| 66 bool anchored_; // whether search is anchored at text.begin() | |
| 67 bool longest_; // whether search wants leftmost-longest match | |
| 68 bool endmatch_; // whether search must end at text.end() | |
| 69 StringPiece *submatch_; // submatches to fill in | |
| 70 int nsubmatch_; // # of submatches to fill in | |
| 71 | |
| 72 // Search state | |
| 73 const char* cap_[64]; // capture registers | |
| 74 uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked | |
| 75 size_t nvisited_; // # of words in bitmap | |
| 76 }; | |
| 77 | |
| 78 Backtracker::Backtracker(Prog* prog) | |
| 79 : prog_(prog), | |
| 80 anchored_(false), | |
| 81 longest_(false), | |
| 82 endmatch_(false), | |
| 83 submatch_(NULL), | |
| 84 nsubmatch_(0), | |
| 85 visited_(NULL), | |
| 86 nvisited_(0) { | |
| 87 } | |
| 88 | |
| 89 Backtracker::~Backtracker() { | |
| 90 delete[] visited_; | |
| 91 } | |
| 92 | |
| 93 // Runs a backtracking search. | |
| 94 bool Backtracker::Search(const StringPiece& text, const StringPiece& context, | |
| 95 bool anchored, bool longest, | |
| 96 StringPiece* submatch, int nsubmatch) { | |
| 97 text_ = text; | |
| 98 context_ = context; | |
| 99 if (context_.begin() == NULL) | |
| 100 context_ = text; | |
| 101 if (prog_->anchor_start() && text.begin() > context_.begin()) | |
| 102 return false; | |
| 103 if (prog_->anchor_end() && text.end() < context_.end()) | |
| 104 return false; | |
| 105 anchored_ = anchored | prog_->anchor_start(); | |
| 106 longest_ = longest | prog_->anchor_end(); | |
| 107 endmatch_ = prog_->anchor_end(); | |
| 108 submatch_ = submatch; | |
| 109 nsubmatch_ = nsubmatch; | |
| 110 CHECK(2*nsubmatch_ < arraysize(cap_)); | |
| 111 memset(cap_, 0, sizeof cap_); | |
| 112 | |
| 113 // We use submatch_[0] for our own bookkeeping, | |
| 114 // so it had better exist. | |
| 115 StringPiece sp0; | |
| 116 if (nsubmatch < 1) { | |
| 117 submatch_ = &sp0; | |
| 118 nsubmatch_ = 1; | |
| 119 } | |
| 120 submatch_[0] = NULL; | |
| 121 | |
| 122 // Allocate new visited_ bitmap -- size is proportional | |
| 123 // to text, so have to reallocate on each call to Search. | |
| 124 delete[] visited_; | |
| 125 nvisited_ = (prog_->size()*(text.size()+1) + 31)/32; | |
| 126 visited_ = new uint32[nvisited_]; | |
| 127 memset(visited_, 0, nvisited_*sizeof visited_[0]); | |
| 128 | |
| 129 // Anchored search must start at text.begin(). | |
| 130 if (anchored_) { | |
| 131 cap_[0] = text.begin(); | |
| 132 return Visit(prog_->start(), text.begin()); | |
| 133 } | |
| 134 | |
| 135 // Unanchored search, starting from each possible text position. | |
| 136 // Notice that we have to try the empty string at the end of | |
| 137 // the text, so the loop condition is p <= text.end(), not p < text.end(). | |
| 138 for (const char* p = text.begin(); p <= text.end(); p++) { | |
| 139 cap_[0] = p; | |
| 140 if (Visit(prog_->start(), p)) // Match must be leftmost; done. | |
| 141 return true; | |
| 142 } | |
| 143 return false; | |
| 144 } | |
| 145 | |
| 146 // Explores from instruction ip at string position p looking for a match. | |
| 147 // Return true if found (so that caller can stop trying other possibilities). | |
| 148 bool Backtracker::Visit(int id, const char* p) { | |
| 149 // Check bitmap. If we've already explored from here, | |
| 150 // either it didn't match or it did but we're hoping for a better match. | |
| 151 // Either way, don't go down that road again. | |
| 152 CHECK(p <= text_.end()); | |
| 153 size_t n = id*(text_.size()+1) + (p - text_.begin()); | |
| 154 CHECK_LT(n/32, nvisited_); | |
| 155 if (visited_[n/32] & (1 << (n&31))) | |
| 156 return false; | |
| 157 visited_[n/32] |= 1 << (n&31); | |
| 158 | |
| 159 // Pick out byte at current position. If at end of string, | |
| 160 // have to explore in hope of finishing a match. Use impossible byte -1. | |
| 161 int c = -1; | |
| 162 if (p < text_.end()) | |
| 163 c = *p & 0xFF; | |
| 164 | |
| 165 Prog::Inst* ip = prog_->inst(id); | |
| 166 switch (ip->opcode()) { | |
| 167 default: | |
| 168 LOG(FATAL) << "Unexpected opcode: " << (int)ip->opcode(); | |
| 169 return false; // not reached | |
| 170 | |
| 171 case kInstAlt: | |
| 172 case kInstAltMatch: | |
| 173 // Try both possible next states: out is preferred to out1. | |
| 174 if (Visit(ip->out(), p)) { | |
| 175 if (longest_) | |
| 176 Visit(ip->out1(), p); | |
| 177 return true; | |
| 178 } | |
| 179 return Visit(ip->out1(), p); | |
| 180 | |
| 181 case kInstByteRange: | |
| 182 if (ip->Matches(c)) | |
| 183 return Visit(ip->out(), p+1); | |
| 184 return false; | |
| 185 | |
| 186 case kInstCapture: | |
| 187 if (0 <= ip->cap() && ip->cap() < arraysize(cap_)) { | |
| 188 // Capture p to register, but save old value. | |
| 189 const char* q = cap_[ip->cap()]; | |
| 190 cap_[ip->cap()] = p; | |
| 191 bool ret = Visit(ip->out(), p); | |
| 192 // Restore old value as we backtrack. | |
| 193 cap_[ip->cap()] = q; | |
| 194 return ret; | |
| 195 } | |
| 196 return Visit(ip->out(), p); | |
| 197 | |
| 198 case kInstEmptyWidth: | |
| 199 if (ip->empty() & ~Prog::EmptyFlags(context_, p)) | |
| 200 return false; | |
| 201 return Visit(ip->out(), p); | |
| 202 | |
| 203 case kInstNop: | |
| 204 return Visit(ip->out(), p); | |
| 205 | |
| 206 case kInstMatch: | |
| 207 // We found a match. If it's the best so far, record the | |
| 208 // parameters in the caller's submatch_ array. | |
| 209 if (endmatch_ && p != context_.end()) | |
| 210 return false; | |
| 211 cap_[1] = p; | |
| 212 if (submatch_[0].data() == NULL || // First match so far ... | |
| 213 (longest_ && p > submatch_[0].end())) { // ... or better match | |
| 214 for (int i = 0; i < nsubmatch_; i++) | |
| 215 submatch_[i].set(cap_[2*i], | |
| 216 static_cast<int>(cap_[2*i+1] - cap_[2*i])); | |
| 217 } | |
| 218 return true; | |
| 219 | |
| 220 case kInstFail: | |
| 221 return false; | |
| 222 } | |
| 223 } | |
| 224 | |
| 225 // Runs a backtracking search. | |
| 226 bool Prog::UnsafeSearchBacktrack(const StringPiece& text, | |
| 227 const StringPiece& context, | |
| 228 Anchor anchor, | |
| 229 MatchKind kind, | |
| 230 StringPiece* match, | |
| 231 int nmatch) { | |
| 232 // If full match, we ask for an anchored longest match | |
| 233 // and then check that match[0] == text. | |
| 234 // So make sure match[0] exists. | |
| 235 StringPiece sp0; | |
| 236 if (kind == kFullMatch) { | |
| 237 anchor = kAnchored; | |
| 238 if (nmatch < 1) { | |
| 239 match = &sp0; | |
| 240 nmatch = 1; | |
| 241 } | |
| 242 } | |
| 243 | |
| 244 // Run the search. | |
| 245 Backtracker b(this); | |
| 246 bool anchored = anchor == kAnchored; | |
| 247 bool longest = kind != kFirstMatch; | |
| 248 if (!b.Search(text, context, anchored, longest, match, nmatch)) | |
| 249 return false; | |
| 250 if (kind == kFullMatch && match[0].end() != text.end()) | |
| 251 return false; | |
| 252 return true; | |
| 253 } | |
| 254 | |
| 255 } // namespace re2 | |
| OLD | NEW |