OLD | NEW |
| (Empty) |
1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Tested by search_test.cc, exhaustive_test.cc, tester.cc | |
6 | |
7 // Prog::SearchBitState is a regular expression search with submatch | |
8 // tracking for small regular expressions and texts. Like | |
9 // testing/backtrack.cc, it allocates a bit vector with (length of | |
10 // text) * (length of prog) bits, to make sure it never explores the | |
11 // same (character position, instruction) state multiple times. This | |
12 // limits the search to run in time linear in the length of the text. | |
13 // | |
14 // Unlike testing/backtrack.cc, SearchBitState is not recursive | |
15 // on the text. | |
16 // | |
17 // SearchBitState is a fast replacement for the NFA code on small | |
18 // regexps and texts when SearchOnePass cannot be used. | |
19 | |
20 #include "re2/prog.h" | |
21 #include "re2/regexp.h" | |
22 | |
23 namespace re2 { | |
24 | |
25 struct Job { | |
26 int id; | |
27 int arg; | |
28 const char* p; | |
29 }; | |
30 | |
31 class BitState { | |
32 public: | |
33 explicit BitState(Prog* prog); | |
34 ~BitState(); | |
35 | |
36 // The usual Search prototype. | |
37 // Can only call Search once per BitState. | |
38 bool Search(const StringPiece& text, const StringPiece& context, | |
39 bool anchored, bool longest, | |
40 StringPiece* submatch, int nsubmatch); | |
41 | |
42 private: | |
43 inline bool ShouldVisit(int id, const char* p); | |
44 void Push(int id, const char* p, int arg); | |
45 bool GrowStack(); | |
46 bool TrySearch(int id, const char* p); | |
47 | |
48 // Search parameters | |
49 Prog* prog_; // program being run | |
50 StringPiece text_; // text being searched | |
51 StringPiece context_; // greater context of text being searched | |
52 bool anchored_; // whether search is anchored at text.begin() | |
53 bool longest_; // whether search wants leftmost-longest match | |
54 bool endmatch_; // whether match must end at text.end() | |
55 StringPiece *submatch_; // submatches to fill in | |
56 int nsubmatch_; // # of submatches to fill in | |
57 | |
58 // Search state | |
59 const char** cap_; // capture registers | |
60 int ncap_; | |
61 | |
62 static const int VisitedBits = 32; | |
63 uint32 *visited_; // bitmap: (Inst*, char*) pairs already backtracked | |
64 int nvisited_; // # of words in bitmap | |
65 | |
66 Job *job_; // stack of text positions to explore | |
67 int njob_; | |
68 int maxjob_; | |
69 }; | |
70 | |
71 BitState::BitState(Prog* prog) | |
72 : prog_(prog), | |
73 anchored_(false), | |
74 longest_(false), | |
75 endmatch_(false), | |
76 submatch_(NULL), | |
77 nsubmatch_(0), | |
78 cap_(NULL), | |
79 ncap_(0), | |
80 visited_(NULL), | |
81 nvisited_(0), | |
82 job_(NULL), | |
83 njob_(0), | |
84 maxjob_(0) { | |
85 } | |
86 | |
87 BitState::~BitState() { | |
88 delete[] visited_; | |
89 delete[] job_; | |
90 delete[] cap_; | |
91 } | |
92 | |
93 // Should the search visit the pair ip, p? | |
94 // If so, remember that it was visited so that the next time, | |
95 // we don't repeat the visit. | |
96 bool BitState::ShouldVisit(int id, const char* p) { | |
97 size_t n = id * (text_.size() + 1) + (p - text_.begin()); | |
98 if (visited_[n/VisitedBits] & (1 << (n & (VisitedBits-1)))) | |
99 return false; | |
100 visited_[n/VisitedBits] |= 1 << (n & (VisitedBits-1)); | |
101 return true; | |
102 } | |
103 | |
104 // Grow the stack. | |
105 bool BitState::GrowStack() { | |
106 // VLOG(0) << "Reallocate."; | |
107 maxjob_ *= 2; | |
108 Job* newjob = new Job[maxjob_]; | |
109 memmove(newjob, job_, njob_*sizeof job_[0]); | |
110 delete[] job_; | |
111 job_ = newjob; | |
112 if (njob_ >= maxjob_) { | |
113 LOG(DFATAL) << "Job stack overflow."; | |
114 return false; | |
115 } | |
116 return true; | |
117 } | |
118 | |
119 // Push the triple (id, p, arg) onto the stack, growing it if necessary. | |
120 void BitState::Push(int id, const char* p, int arg) { | |
121 if (njob_ >= maxjob_) { | |
122 if (!GrowStack()) | |
123 return; | |
124 } | |
125 int op = prog_->inst(id)->opcode(); | |
126 if (op == kInstFail) | |
127 return; | |
128 | |
129 // Only check ShouldVisit when arg == 0. | |
130 // When arg > 0, we are continuing a previous visit. | |
131 if (arg == 0 && !ShouldVisit(id, p)) | |
132 return; | |
133 | |
134 Job* j = &job_[njob_++]; | |
135 j->id = id; | |
136 j->p = p; | |
137 j->arg = arg; | |
138 } | |
139 | |
140 // Try a search from instruction id0 in state p0. | |
141 // Return whether it succeeded. | |
142 bool BitState::TrySearch(int id0, const char* p0) { | |
143 bool matched = false; | |
144 const char* end = text_.end(); | |
145 njob_ = 0; | |
146 Push(id0, p0, 0); | |
147 while (njob_ > 0) { | |
148 // Pop job off stack. | |
149 --njob_; | |
150 int id = job_[njob_].id; | |
151 const char* p = job_[njob_].p; | |
152 int arg = job_[njob_].arg; | |
153 | |
154 // Optimization: rather than push and pop, | |
155 // code that is going to Push and continue | |
156 // the loop simply updates ip, p, and arg | |
157 // and jumps to CheckAndLoop. We have to | |
158 // do the ShouldVisit check that Push | |
159 // would have, but we avoid the stack | |
160 // manipulation. | |
161 if (0) { | |
162 CheckAndLoop: | |
163 if (!ShouldVisit(id, p)) | |
164 continue; | |
165 } | |
166 | |
167 // Visit ip, p. | |
168 // VLOG(0) << "Job: " << ip->id() << " " | |
169 // << (p - text_.begin()) << " " << arg; | |
170 Prog::Inst* ip = prog_->inst(id); | |
171 switch (ip->opcode()) { | |
172 case kInstFail: | |
173 return false; | |
174 | |
175 default: | |
176 LOG(DFATAL) << "Unexpected opcode: " << ip->opcode() << " arg " << arg; | |
177 return false; | |
178 | |
179 case kInstAlt: | |
180 // Cannot just | |
181 // Push(ip->out1(), p, 0); | |
182 // Push(ip->out(), p, 0); | |
183 // If, during the processing of ip->out(), we encounter | |
184 // ip->out1() via another path, we want to process it then. | |
185 // Pushing it here will inhibit that. Instead, re-push | |
186 // ip with arg==1 as a reminder to push ip->out1() later. | |
187 switch (arg) { | |
188 case 0: | |
189 Push(id, p, 1); // come back when we're done | |
190 id = ip->out(); | |
191 goto CheckAndLoop; | |
192 | |
193 case 1: | |
194 // Finished ip->out(); try ip->out1(). | |
195 arg = 0; | |
196 id = ip->out1(); | |
197 goto CheckAndLoop; | |
198 } | |
199 LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; | |
200 continue; | |
201 | |
202 case kInstAltMatch: | |
203 // One opcode is byte range; the other leads to match. | |
204 if (ip->greedy(prog_)) { | |
205 // out1 is the match | |
206 Push(ip->out1(), p, 0); | |
207 id = ip->out1(); | |
208 p = end; | |
209 goto CheckAndLoop; | |
210 } | |
211 // out is the match - non-greedy | |
212 Push(ip->out(), end, 0); | |
213 id = ip->out(); | |
214 goto CheckAndLoop; | |
215 | |
216 case kInstByteRange: { | |
217 int c = -1; | |
218 if (p < end) | |
219 c = *p & 0xFF; | |
220 if (ip->Matches(c)) { | |
221 id = ip->out(); | |
222 p++; | |
223 goto CheckAndLoop; | |
224 } | |
225 continue; | |
226 } | |
227 | |
228 case kInstCapture: | |
229 switch (arg) { | |
230 case 0: | |
231 if (0 <= ip->cap() && ip->cap() < ncap_) { | |
232 // Capture p to register, but save old value. | |
233 Push(id, cap_[ip->cap()], 1); // come back when we're done | |
234 cap_[ip->cap()] = p; | |
235 } | |
236 // Continue on. | |
237 id = ip->out(); | |
238 goto CheckAndLoop; | |
239 case 1: | |
240 // Finished ip->out(); restore the old value. | |
241 cap_[ip->cap()] = p; | |
242 continue; | |
243 } | |
244 LOG(DFATAL) << "Bad arg in kInstCapture: " << arg; | |
245 continue; | |
246 | |
247 case kInstEmptyWidth: | |
248 if (ip->empty() & ~Prog::EmptyFlags(context_, p)) | |
249 continue; | |
250 id = ip->out(); | |
251 goto CheckAndLoop; | |
252 | |
253 case kInstNop: | |
254 id = ip->out(); | |
255 goto CheckAndLoop; | |
256 | |
257 case kInstMatch: { | |
258 if (endmatch_ && p != text_.end()) | |
259 continue; | |
260 | |
261 // VLOG(0) << "Found match."; | |
262 // We found a match. If the caller doesn't care | |
263 // where the match is, no point going further. | |
264 if (nsubmatch_ == 0) | |
265 return true; | |
266 | |
267 // Record best match so far. | |
268 // Only need to check end point, because this entire | |
269 // call is only considering one start position. | |
270 matched = true; | |
271 cap_[1] = p; | |
272 if (submatch_[0].data() == NULL || | |
273 (longest_ && p > submatch_[0].end())) { | |
274 for (int i = 0; i < nsubmatch_; i++) | |
275 submatch_[i].set(cap_[2*i], | |
276 static_cast<int>(cap_[2*i+1] - cap_[2*i])); | |
277 } | |
278 | |
279 // If going for first match, we're done. | |
280 if (!longest_) | |
281 return true; | |
282 | |
283 // If we used the entire text, no longer match is possible. | |
284 if (p == text_.end()) | |
285 return true; | |
286 | |
287 // Otherwise, continue on in hope of a longer match. | |
288 continue; | |
289 } | |
290 } | |
291 } | |
292 return matched; | |
293 } | |
294 | |
295 // Search text (within context) for prog_. | |
296 bool BitState::Search(const StringPiece& text, const StringPiece& context, | |
297 bool anchored, bool longest, | |
298 StringPiece* submatch, int nsubmatch) { | |
299 // Search parameters. | |
300 text_ = text; | |
301 context_ = context; | |
302 if (context_.begin() == NULL) | |
303 context_ = text; | |
304 if (prog_->anchor_start() && context_.begin() != text.begin()) | |
305 return false; | |
306 if (prog_->anchor_end() && context_.end() != text.end()) | |
307 return false; | |
308 anchored_ = anchored || prog_->anchor_start(); | |
309 longest_ = longest || prog_->anchor_end(); | |
310 endmatch_ = prog_->anchor_end(); | |
311 submatch_ = submatch; | |
312 nsubmatch_ = nsubmatch; | |
313 for (int i = 0; i < nsubmatch_; i++) | |
314 submatch_[i] = NULL; | |
315 | |
316 // Allocate scratch space. | |
317 nvisited_ = (prog_->size() * (text.size()+1) + VisitedBits-1) / VisitedBits; | |
318 visited_ = new uint32[nvisited_]; | |
319 memset(visited_, 0, nvisited_*sizeof visited_[0]); | |
320 // VLOG(0) << "nvisited_ = " << nvisited_; | |
321 | |
322 ncap_ = 2*nsubmatch; | |
323 if (ncap_ < 2) | |
324 ncap_ = 2; | |
325 cap_ = new const char*[ncap_]; | |
326 memset(cap_, 0, ncap_*sizeof cap_[0]); | |
327 | |
328 maxjob_ = 256; | |
329 job_ = new Job[maxjob_]; | |
330 | |
331 // Anchored search must start at text.begin(). | |
332 if (anchored_) { | |
333 cap_[0] = text.begin(); | |
334 return TrySearch(prog_->start(), text.begin()); | |
335 } | |
336 | |
337 // Unanchored search, starting from each possible text position. | |
338 // Notice that we have to try the empty string at the end of | |
339 // the text, so the loop condition is p <= text.end(), not p < text.end(). | |
340 // This looks like it's quadratic in the size of the text, | |
341 // but we are not clearing visited_ between calls to TrySearch, | |
342 // so no work is duplicated and it ends up still being linear. | |
343 for (const char* p = text.begin(); p <= text.end(); p++) { | |
344 cap_[0] = p; | |
345 if (TrySearch(prog_->start(), p)) // Match must be leftmost; done. | |
346 return true; | |
347 } | |
348 return false; | |
349 } | |
350 | |
351 // Bit-state search. | |
352 bool Prog::SearchBitState(const StringPiece& text, | |
353 const StringPiece& context, | |
354 Anchor anchor, | |
355 MatchKind kind, | |
356 StringPiece* match, | |
357 int nmatch) { | |
358 // If full match, we ask for an anchored longest match | |
359 // and then check that match[0] == text. | |
360 // So make sure match[0] exists. | |
361 StringPiece sp0; | |
362 if (kind == kFullMatch) { | |
363 anchor = kAnchored; | |
364 if (nmatch < 1) { | |
365 match = &sp0; | |
366 nmatch = 1; | |
367 } | |
368 } | |
369 | |
370 // Run the search. | |
371 BitState b(this); | |
372 bool anchored = anchor == kAnchored; | |
373 bool longest = kind != kFirstMatch; | |
374 if (!b.Search(text, context, anchored, longest, match, nmatch)) | |
375 return false; | |
376 if (kind == kFullMatch && match[0].end() != text.end()) | |
377 return false; | |
378 return true; | |
379 } | |
380 | |
381 } // namespace re2 | |
OLD | NEW |