OLD | NEW |
| (Empty) |
1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Tested by search_test.cc. | |
6 // | |
7 // Prog::SearchNFA, an NFA search. | |
8 // This is an actual NFA like the theorists talk about, | |
9 // not the pseudo-NFA found in backtracking regexp implementations. | |
10 // | |
11 // IMPLEMENTATION | |
12 // | |
13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor, | |
14 // which is a variant of the one described in Thompson's 1968 CACM paper. | |
15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature | |
16 // over the DFA implementation is that it tracks submatch boundaries. | |
17 // | |
18 // When the choice of submatch boundaries is ambiguous, this particular | |
19 // implementation makes the same choices that traditional backtracking | |
20 // implementations (in particular, Perl and PCRE) do. | |
21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential | |
22 // time in the length of the input. | |
23 // | |
24 // Like Thompson's original machine and like the DFA implementation, this | |
25 // implementation notices a match only once it is one byte past it. | |
26 | |
27 #include "re2/prog.h" | |
28 #include "re2/regexp.h" | |
29 #include "util/sparse_array.h" | |
30 #include "util/sparse_set.h" | |
31 | |
32 namespace re2 { | |
33 | |
34 class NFA { | |
35 public: | |
36 NFA(Prog* prog); | |
37 ~NFA(); | |
38 | |
39 // Searches for a matching string. | |
40 // * If anchored is true, only considers matches starting at offset. | |
41 // Otherwise finds lefmost match at or after offset. | |
42 // * If longest is true, returns the longest match starting | |
43 // at the chosen start point. Otherwise returns the so-called | |
44 // left-biased match, the one traditional backtracking engines | |
45 // (like Perl and PCRE) find. | |
46 // Records submatch boundaries in submatch[1..nsubmatch-1]. | |
47 // Submatch[0] is the entire match. When there is a choice in | |
48 // which text matches each subexpression, the submatch boundaries | |
49 // are chosen to match what a backtracking implementation would choose. | |
50 bool Search(const StringPiece& text, const StringPiece& context, | |
51 bool anchored, bool longest, | |
52 StringPiece* submatch, int nsubmatch); | |
53 | |
54 static const int Debug = 0; | |
55 | |
56 private: | |
57 struct Thread { | |
58 union { | |
59 int id; | |
60 Thread* next; // when on free list | |
61 }; | |
62 const char** capture; | |
63 }; | |
64 | |
65 // State for explicit stack in AddToThreadq. | |
66 struct AddState { | |
67 int id; // Inst to process | |
68 int j; | |
69 const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip | |
70 | |
71 AddState() | |
72 : id(0), j(-1), cap_j(NULL) {} | |
73 explicit AddState(int id) | |
74 : id(id), j(-1), cap_j(NULL) {} | |
75 AddState(int id, const char* cap_j, int j) | |
76 : id(id), j(j), cap_j(cap_j) {} | |
77 }; | |
78 | |
79 // Threadq is a list of threads. The list is sorted by the order | |
80 // in which Perl would explore that particular state -- the earlier | |
81 // choices appear earlier in the list. | |
82 typedef SparseArray<Thread*> Threadq; | |
83 | |
84 inline Thread* AllocThread(); | |
85 inline void FreeThread(Thread*); | |
86 | |
87 // Add id (or its children, following unlabeled arrows) | |
88 // to the workqueue q with associated capture info. | |
89 void AddToThreadq(Threadq* q, int id, int flag, | |
90 const char* p, const char** capture); | |
91 | |
92 // Run runq on byte c, appending new states to nextq. | |
93 // Updates matched_ and match_ as new, better matches are found. | |
94 // p is position of the next byte (the one after c) | |
95 // in the input string, used when processing capturing parens. | |
96 // flag is the bitwise or of Bol, Eol, etc., specifying whether | |
97 // ^, $ and \b match the current input point (after c). | |
98 inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p)
; | |
99 | |
100 // Returns text version of capture information, for debugging. | |
101 string FormatCapture(const char** capture); | |
102 | |
103 inline void CopyCapture(const char** dst, const char** src); | |
104 | |
105 // Computes whether all matches must begin with the same first | |
106 // byte, and if so, returns that byte. If not, returns -1. | |
107 int ComputeFirstByte(); | |
108 | |
109 Prog* prog_; // underlying program | |
110 int start_; // start instruction in program | |
111 int ncapture_; // number of submatches to track | |
112 bool longest_; // whether searching for longest match | |
113 bool endmatch_; // whether match must end at text.end() | |
114 const char* btext_; // beginning of text being matched (for FormatSubmatch) | |
115 const char* etext_; // end of text being matched (for endmatch_) | |
116 Threadq q0_, q1_; // pre-allocated for Search. | |
117 const char** match_; // best match so far | |
118 bool matched_; // any match so far? | |
119 AddState* astack_; // pre-allocated for AddToThreadq | |
120 int nastack_; | |
121 int first_byte_; // required first byte for match, or -1 if none | |
122 | |
123 Thread* free_threads_; // free list | |
124 | |
125 DISALLOW_COPY_AND_ASSIGN(NFA); | |
126 }; | |
127 | |
128 NFA::NFA(Prog* prog) { | |
129 prog_ = prog; | |
130 start_ = prog->start(); | |
131 ncapture_ = 0; | |
132 longest_ = false; | |
133 endmatch_ = false; | |
134 btext_ = NULL; | |
135 etext_ = NULL; | |
136 q0_.resize(prog_->size()); | |
137 q1_.resize(prog_->size()); | |
138 nastack_ = 2*prog_->size(); | |
139 astack_ = new AddState[nastack_]; | |
140 match_ = NULL; | |
141 matched_ = false; | |
142 free_threads_ = NULL; | |
143 first_byte_ = ComputeFirstByte(); | |
144 } | |
145 | |
146 NFA::~NFA() { | |
147 delete[] match_; | |
148 delete[] astack_; | |
149 Thread* next; | |
150 for (Thread* t = free_threads_; t; t = next) { | |
151 next = t->next; | |
152 delete[] t->capture; | |
153 delete t; | |
154 } | |
155 } | |
156 | |
157 void NFA::FreeThread(Thread *t) { | |
158 if (t == NULL) | |
159 return; | |
160 t->next = free_threads_; | |
161 free_threads_ = t; | |
162 } | |
163 | |
164 NFA::Thread* NFA::AllocThread() { | |
165 Thread* t = free_threads_; | |
166 if (t == NULL) { | |
167 t = new Thread; | |
168 t->capture = new const char*[ncapture_]; | |
169 return t; | |
170 } | |
171 free_threads_ = t->next; | |
172 return t; | |
173 } | |
174 | |
175 void NFA::CopyCapture(const char** dst, const char** src) { | |
176 for (int i = 0; i < ncapture_; i+=2) { | |
177 dst[i] = src[i]; | |
178 dst[i+1] = src[i+1]; | |
179 } | |
180 } | |
181 | |
182 // Follows all empty arrows from id0 and enqueues all the states reached. | |
183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. | |
184 // The pointer p is the current input position, and m is the | |
185 // current set of match boundaries. | |
186 void NFA::AddToThreadq(Threadq* q, int id0, int flag, | |
187 const char* p, const char** capture) { | |
188 if (id0 == 0) | |
189 return; | |
190 | |
191 // Astack_ is pre-allocated to avoid resize operations. | |
192 // It has room for 2*prog_->size() entries, which is enough: | |
193 // Each inst in prog can be processed at most once, | |
194 // pushing at most two entries on stk. | |
195 | |
196 int nstk = 0; | |
197 AddState* stk = astack_; | |
198 stk[nstk++] = AddState(id0); | |
199 | |
200 while (nstk > 0) { | |
201 DCHECK_LE(nstk, nastack_); | |
202 const AddState& a = stk[--nstk]; | |
203 if (a.j >= 0) | |
204 capture[a.j] = a.cap_j; | |
205 | |
206 int id = a.id; | |
207 if (id == 0) | |
208 continue; | |
209 if (q->has_index(id)) { | |
210 if (Debug) | |
211 fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); | |
212 continue; | |
213 } | |
214 | |
215 // Create entry in q no matter what. We might fill it in below, | |
216 // or we might not. Even if not, it is necessary to have it, | |
217 // so that we don't revisit id0 during the recursion. | |
218 q->set_new(id, NULL); | |
219 | |
220 Thread** tp = &q->find(id)->second; | |
221 int j; | |
222 Thread* t; | |
223 Prog::Inst* ip = prog_->inst(id); | |
224 switch (ip->opcode()) { | |
225 default: | |
226 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; | |
227 break; | |
228 | |
229 case kInstFail: | |
230 break; | |
231 | |
232 case kInstAltMatch: | |
233 // Save state; will pick up at next byte. | |
234 t = AllocThread(); | |
235 t->id = id; | |
236 CopyCapture(t->capture, capture); | |
237 *tp = t; | |
238 // fall through | |
239 | |
240 case kInstAlt: | |
241 // Explore alternatives. | |
242 stk[nstk++] = AddState(ip->out1()); | |
243 stk[nstk++] = AddState(ip->out()); | |
244 break; | |
245 | |
246 case kInstNop: | |
247 // Continue on. | |
248 stk[nstk++] = AddState(ip->out()); | |
249 break; | |
250 | |
251 case kInstCapture: | |
252 if ((j=ip->cap()) < ncapture_) { | |
253 // Push a dummy whose only job is to restore capture[j] | |
254 // once we finish exploring this possibility. | |
255 stk[nstk++] = AddState(0, capture[j], j); | |
256 | |
257 // Record capture. | |
258 capture[j] = p; | |
259 } | |
260 stk[nstk++] = AddState(ip->out()); | |
261 break; | |
262 | |
263 case kInstMatch: | |
264 case kInstByteRange: | |
265 // Save state; will pick up at next byte. | |
266 t = AllocThread(); | |
267 t->id = id; | |
268 CopyCapture(t->capture, capture); | |
269 *tp = t; | |
270 if (Debug) | |
271 fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(),
t); | |
272 break; | |
273 | |
274 case kInstEmptyWidth: | |
275 // Continue on if we have all the right flag bits. | |
276 if (ip->empty() & ~flag) | |
277 break; | |
278 stk[nstk++] = AddState(ip->out()); | |
279 break; | |
280 } | |
281 } | |
282 } | |
283 | |
284 // Run runq on byte c, appending new states to nextq. | |
285 // Updates match as new, better matches are found. | |
286 // p is position of the byte c in the input string, | |
287 // used when processing capturing parens. | |
288 // flag is the bitwise or of Bol, Eol, etc., specifying whether | |
289 // ^, $ and \b match the current input point (after c). | |
290 // Frees all the threads on runq. | |
291 // If there is a shortcut to the end, returns that shortcut. | |
292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { | |
293 nextq->clear(); | |
294 | |
295 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { | |
296 Thread* t = i->second; | |
297 if (t == NULL) | |
298 continue; | |
299 | |
300 if (longest_) { | |
301 // Can skip any threads started after our current best match. | |
302 if (matched_ && match_[0] < t->capture[0]) { | |
303 FreeThread(t); | |
304 continue; | |
305 } | |
306 } | |
307 | |
308 int id = t->id; | |
309 Prog::Inst* ip = prog_->inst(id); | |
310 | |
311 switch (ip->opcode()) { | |
312 default: | |
313 // Should only see the values handled below. | |
314 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; | |
315 break; | |
316 | |
317 case kInstByteRange: | |
318 if (ip->Matches(c)) | |
319 AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); | |
320 break; | |
321 | |
322 case kInstAltMatch: | |
323 if (i != runq->begin()) | |
324 break; | |
325 // The match is ours if we want it. | |
326 if (ip->greedy(prog_) || longest_) { | |
327 CopyCapture((const char**)match_, t->capture); | |
328 FreeThread(t); | |
329 for (++i; i != runq->end(); ++i) | |
330 FreeThread(i->second); | |
331 runq->clear(); | |
332 matched_ = true; | |
333 if (ip->greedy(prog_)) | |
334 return ip->out1(); | |
335 return ip->out(); | |
336 } | |
337 break; | |
338 | |
339 case kInstMatch: | |
340 if (endmatch_ && p != etext_) | |
341 break; | |
342 | |
343 const char* old = t->capture[1]; // previous end pointer | |
344 t->capture[1] = p; | |
345 if (longest_) { | |
346 // Leftmost-longest mode: save this match only if | |
347 // it is either farther to the left or at the same | |
348 // point but longer than an existing match. | |
349 if (!matched_ || t->capture[0] < match_[0] || | |
350 (t->capture[0] == match_[0] && t->capture[1] > match_[1])) | |
351 CopyCapture((const char**)match_, t->capture); | |
352 } else { | |
353 // Leftmost-biased mode: this match is by definition | |
354 // better than what we've already found (see next line). | |
355 CopyCapture((const char**)match_, t->capture); | |
356 | |
357 // Cut off the threads that can only find matches | |
358 // worse than the one we just found: don't run the | |
359 // rest of the current Threadq. | |
360 t->capture[0] = old; | |
361 FreeThread(t); | |
362 for (++i; i != runq->end(); ++i) | |
363 FreeThread(i->second); | |
364 runq->clear(); | |
365 matched_ = true; | |
366 return 0; | |
367 } | |
368 t->capture[0] = old; | |
369 matched_ = true; | |
370 break; | |
371 } | |
372 FreeThread(t); | |
373 } | |
374 runq->clear(); | |
375 return 0; | |
376 } | |
377 | |
378 string NFA::FormatCapture(const char** capture) { | |
379 string s; | |
380 | |
381 for (int i = 0; i < ncapture_; i+=2) { | |
382 if (capture[i] == NULL) | |
383 StringAppendF(&s, "(?,?)"); | |
384 else if (capture[i+1] == NULL) | |
385 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_)); | |
386 else | |
387 StringAppendF(&s, "(%d,%d)", | |
388 (int)(capture[i] - btext_), | |
389 (int)(capture[i+1] - btext_)); | |
390 } | |
391 return s; | |
392 } | |
393 | |
394 // Returns whether haystack contains needle's memory. | |
395 static bool StringPieceContains(const StringPiece haystack, const StringPiece ne
edle) { | |
396 return haystack.begin() <= needle.begin() && | |
397 haystack.end() >= needle.end(); | |
398 } | |
399 | |
400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context, | |
401 bool anchored, bool longest, | |
402 StringPiece* submatch, int nsubmatch) { | |
403 if (start_ == 0) | |
404 return false; | |
405 | |
406 StringPiece context = const_context; | |
407 if (context.begin() == NULL) | |
408 context = text; | |
409 | |
410 if (!StringPieceContains(context, text)) { | |
411 LOG(FATAL) << "Bad args: context does not contain text " | |
412 << reinterpret_cast<const void*>(context.begin()) | |
413 << "+" << context.size() << " " | |
414 << reinterpret_cast<const void*>(text.begin()) | |
415 << "+" << text.size(); | |
416 return false; | |
417 } | |
418 | |
419 if (prog_->anchor_start() && context.begin() != text.begin()) | |
420 return false; | |
421 if (prog_->anchor_end() && context.end() != text.end()) | |
422 return false; | |
423 anchored |= prog_->anchor_start(); | |
424 if (prog_->anchor_end()) { | |
425 longest = true; | |
426 endmatch_ = true; | |
427 etext_ = text.end(); | |
428 } | |
429 | |
430 if (nsubmatch < 0) { | |
431 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; | |
432 return false; | |
433 } | |
434 | |
435 // Save search parameters. | |
436 ncapture_ = 2*nsubmatch; | |
437 longest_ = longest; | |
438 | |
439 if (nsubmatch == 0) { | |
440 // We need to maintain match[0], both to distinguish the | |
441 // longest match (if longest is true) and also to tell | |
442 // whether we've seen any matches at all. | |
443 ncapture_ = 2; | |
444 } | |
445 | |
446 match_ = new const char*[ncapture_]; | |
447 matched_ = false; | |
448 memset(match_, 0, ncapture_*sizeof match_[0]); | |
449 | |
450 // For debugging prints. | |
451 btext_ = context.begin(); | |
452 | |
453 if (Debug) { | |
454 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", | |
455 text.as_string().c_str(), context.as_string().c_str(), anchored, | |
456 longest); | |
457 } | |
458 | |
459 // Set up search. | |
460 Threadq* runq = &q0_; | |
461 Threadq* nextq = &q1_; | |
462 runq->clear(); | |
463 nextq->clear(); | |
464 memset(&match_[0], 0, ncapture_*sizeof match_[0]); | |
465 const char* bp = context.begin(); | |
466 int c = -1; | |
467 int wasword = 0; | |
468 | |
469 if (text.begin() > context.begin()) { | |
470 c = text.begin()[-1] & 0xFF; | |
471 wasword = Prog::IsWordChar(static_cast<uint8>(c)); | |
472 } | |
473 | |
474 // Loop over the text, stepping the machine. | |
475 for (const char* p = text.begin();; p++) { | |
476 // Check for empty-width specials. | |
477 int flag = 0; | |
478 | |
479 // ^ and \A | |
480 if (p == context.begin()) | |
481 flag |= kEmptyBeginText | kEmptyBeginLine; | |
482 else if (p <= context.end() && p[-1] == '\n') | |
483 flag |= kEmptyBeginLine; | |
484 | |
485 // $ and \z | |
486 if (p == context.end()) | |
487 flag |= kEmptyEndText | kEmptyEndLine; | |
488 else if (p < context.end() && p[0] == '\n') | |
489 flag |= kEmptyEndLine; | |
490 | |
491 // \b and \B | |
492 int isword = 0; | |
493 if (p < context.end()) | |
494 isword = Prog::IsWordChar(p[0] & 0xFF); | |
495 | |
496 if (isword != wasword) | |
497 flag |= kEmptyWordBoundary; | |
498 else | |
499 flag |= kEmptyNonWordBoundary; | |
500 | |
501 if (Debug) { | |
502 fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c
, flag, isword, wasword); | |
503 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { | |
504 Thread* t = i->second; | |
505 if (t == NULL) | |
506 continue; | |
507 fprintf(stderr, " %d%s", t->id, | |
508 FormatCapture((const char**)t->capture).c_str()); | |
509 } | |
510 fprintf(stderr, "\n"); | |
511 } | |
512 | |
513 // Process previous character (waited until now to avoid | |
514 // repeating the flag computation above). | |
515 // This is a no-op the first time around the loop, because | |
516 // runq is empty. | |
517 int id = Step(runq, nextq, c, flag, p-1); | |
518 DCHECK_EQ(runq->size(), 0); | |
519 swap(nextq, runq); | |
520 nextq->clear(); | |
521 if (id != 0) { | |
522 // We're done: full match ahead. | |
523 p = text.end(); | |
524 for (;;) { | |
525 Prog::Inst* ip = prog_->inst(id); | |
526 switch (ip->opcode()) { | |
527 default: | |
528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(
); | |
529 break; | |
530 | |
531 case kInstCapture: | |
532 if (ip->cap() < ncapture_) | |
533 match_[ip->cap()] = p; | |
534 id = ip->out(); | |
535 continue; | |
536 | |
537 case kInstNop: | |
538 id = ip->out(); | |
539 continue; | |
540 | |
541 case kInstMatch: | |
542 match_[1] = p; | |
543 matched_ = true; | |
544 break; | |
545 | |
546 case kInstEmptyWidth: | |
547 if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { | |
548 LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->
empty(); | |
549 break; | |
550 } | |
551 id = ip->out(); | |
552 continue; | |
553 } | |
554 break; | |
555 } | |
556 break; | |
557 } | |
558 | |
559 if (p > text.end()) | |
560 break; | |
561 | |
562 // Start a new thread if there have not been any matches. | |
563 // (No point in starting a new thread if there have been | |
564 // matches, since it would be to the right of the match | |
565 // we already found.) | |
566 if (!matched_ && (!anchored || p == text.begin())) { | |
567 // If there's a required first byte for an unanchored search | |
568 // and we're not in the middle of any possible matches, | |
569 // use memchr to search for the byte quickly. | |
570 if (!anchored && first_byte_ >= 0 && runq->size() == 0 && | |
571 p < text.end() && (p[0] & 0xFF) != first_byte_) { | |
572 p = reinterpret_cast<const char*>(memchr(p, first_byte_, | |
573 text.end() - p)); | |
574 if (p == NULL) { | |
575 p = text.end(); | |
576 isword = 0; | |
577 } else { | |
578 isword = Prog::IsWordChar(p[0] & 0xFF); | |
579 } | |
580 flag = Prog::EmptyFlags(context, p); | |
581 } | |
582 | |
583 // Steal match storage (cleared but unused as of yet) | |
584 // temporarily to hold match boundaries for new thread. | |
585 match_[0] = p; | |
586 AddToThreadq(runq, start_, flag, p, match_); | |
587 match_[0] = NULL; | |
588 } | |
589 | |
590 // If all the threads have died, stop early. | |
591 if (runq->size() == 0) { | |
592 if (Debug) | |
593 fprintf(stderr, "dead\n"); | |
594 break; | |
595 } | |
596 | |
597 if (p == text.end()) | |
598 c = 0; | |
599 else | |
600 c = *p & 0xFF; | |
601 wasword = isword; | |
602 | |
603 // Will run step(runq, nextq, c, ...) on next iteration. See above. | |
604 } | |
605 | |
606 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) | |
607 FreeThread(i->second); | |
608 | |
609 if (matched_) { | |
610 for (int i = 0; i < nsubmatch; i++) | |
611 submatch[i].set(match_[2*i], | |
612 static_cast<int>(match_[2*i+1] - match_[2*i])); | |
613 if (Debug) | |
614 fprintf(stderr, "match (%d,%d)\n", | |
615 static_cast<int>(match_[0] - btext_), | |
616 static_cast<int>(match_[1] - btext_)); | |
617 return true; | |
618 } | |
619 VLOG(1) << "No matches found"; | |
620 return false; | |
621 } | |
622 | |
623 // Computes whether all successful matches have a common first byte, | |
624 // and if so, returns that byte. If not, returns -1. | |
625 int NFA::ComputeFirstByte() { | |
626 if (start_ == 0) | |
627 return -1; | |
628 | |
629 int b = -1; // first byte, not yet computed | |
630 | |
631 typedef SparseSet Workq; | |
632 Workq q(prog_->size()); | |
633 q.insert(start_); | |
634 for (Workq::iterator it = q.begin(); it != q.end(); ++it) { | |
635 int id = *it; | |
636 Prog::Inst* ip = prog_->inst(id); | |
637 switch (ip->opcode()) { | |
638 default: | |
639 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; | |
640 break; | |
641 | |
642 case kInstMatch: | |
643 // The empty string matches: no first byte. | |
644 return -1; | |
645 | |
646 case kInstByteRange: | |
647 // Must match only a single byte | |
648 if (ip->lo() != ip->hi()) | |
649 return -1; | |
650 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') | |
651 return -1; | |
652 // If we haven't seen any bytes yet, record it; | |
653 // otherwise must match the one we saw before. | |
654 if (b == -1) | |
655 b = ip->lo(); | |
656 else if (b != ip->lo()) | |
657 return -1; | |
658 break; | |
659 | |
660 case kInstNop: | |
661 case kInstCapture: | |
662 case kInstEmptyWidth: | |
663 // Continue on. | |
664 // Ignore ip->empty() flags for kInstEmptyWidth | |
665 // in order to be as conservative as possible | |
666 // (assume all possible empty-width flags are true). | |
667 if (ip->out()) | |
668 q.insert(ip->out()); | |
669 break; | |
670 | |
671 case kInstAlt: | |
672 case kInstAltMatch: | |
673 // Explore alternatives. | |
674 if (ip->out()) | |
675 q.insert(ip->out()); | |
676 if (ip->out1()) | |
677 q.insert(ip->out1()); | |
678 break; | |
679 | |
680 case kInstFail: | |
681 break; | |
682 } | |
683 } | |
684 return b; | |
685 } | |
686 | |
687 bool | |
688 Prog::SearchNFA(const StringPiece& text, const StringPiece& context, | |
689 Anchor anchor, MatchKind kind, | |
690 StringPiece* match, int nmatch) { | |
691 if (NFA::Debug) | |
692 Dump(); | |
693 | |
694 NFA nfa(this); | |
695 StringPiece sp; | |
696 if (kind == kFullMatch) { | |
697 anchor = kAnchored; | |
698 if (nmatch == 0) { | |
699 match = &sp; | |
700 nmatch = 1; | |
701 } | |
702 } | |
703 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match
, nmatch)) | |
704 return false; | |
705 if (kind == kFullMatch && match[0].end() != text.end()) | |
706 return false; | |
707 return true; | |
708 } | |
709 | |
710 // For each instruction i in the program reachable from the start, compute the | |
711 // number of instructions reachable from i by following only empty transitions | |
712 // and record that count as fanout[i]. | |
713 // | |
714 // fanout holds the results and is also the work queue for the outer iteration. | |
715 // reachable holds the reached nodes for the inner iteration. | |
716 void Prog::Fanout(SparseArray<int>* fanout) { | |
717 DCHECK_EQ(fanout->max_size(), size()); | |
718 SparseSet reachable(size()); | |
719 fanout->clear(); | |
720 fanout->set_new(start(), 0); | |
721 for (SparseArray<int>::iterator i = fanout->begin(); i != fanout->end(); ++i)
{ | |
722 int* count = &i->second; | |
723 reachable.clear(); | |
724 reachable.insert(i->index()); | |
725 for (SparseSet::iterator j = reachable.begin(); j != reachable.end(); ++j) { | |
726 Prog::Inst* ip = inst(*j); | |
727 switch (ip->opcode()) { | |
728 default: | |
729 LOG(DFATAL) << "unhandled " << ip->opcode() << " in Prog::Fanout()"; | |
730 break; | |
731 | |
732 case kInstByteRange: | |
733 (*count)++; | |
734 if (!fanout->has_index(ip->out())) { | |
735 fanout->set_new(ip->out(), 0); | |
736 } | |
737 break; | |
738 | |
739 case kInstAlt: | |
740 case kInstAltMatch: | |
741 reachable.insert(ip->out1()); | |
742 // fall through | |
743 | |
744 case kInstCapture: | |
745 case kInstEmptyWidth: | |
746 case kInstNop: | |
747 reachable.insert(ip->out()); | |
748 break; | |
749 | |
750 case kInstMatch: | |
751 case kInstFail: | |
752 break; | |
753 } | |
754 } | |
755 } | |
756 } | |
757 | |
758 } // namespace re2 | |
OLD | NEW |