OLD | NEW |
(Empty) | |
| 1 // Copyright 2006-2007 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. |
| 4 |
| 5 // Tested by search_test.cc. |
| 6 // |
| 7 // Prog::SearchNFA, an NFA search. |
| 8 // This is an actual NFA like the theorists talk about, |
| 9 // not the pseudo-NFA found in backtracking regexp implementations. |
| 10 // |
| 11 // IMPLEMENTATION |
| 12 // |
| 13 // This algorithm is a variant of one that appeared in Rob Pike's sam editor, |
| 14 // which is a variant of the one described in Thompson's 1968 CACM paper. |
| 15 // See http://swtch.com/~rsc/regexp/ for various history. The main feature |
| 16 // over the DFA implementation is that it tracks submatch boundaries. |
| 17 // |
| 18 // When the choice of submatch boundaries is ambiguous, this particular |
| 19 // implementation makes the same choices that traditional backtracking |
| 20 // implementations (in particular, Perl and PCRE) do. |
| 21 // Note that unlike in Perl and PCRE, this algorithm *cannot* take exponential |
| 22 // time in the length of the input. |
| 23 // |
| 24 // Like Thompson's original machine and like the DFA implementation, this |
| 25 // implementation notices a match only once it is one byte past it. |
| 26 |
| 27 #include "re2/prog.h" |
| 28 #include "re2/regexp.h" |
| 29 #include "util/sparse_array.h" |
| 30 #include "util/sparse_set.h" |
| 31 |
| 32 namespace re2 { |
| 33 |
| 34 class NFA { |
| 35 public: |
| 36 NFA(Prog* prog); |
| 37 ~NFA(); |
| 38 |
| 39 // Searches for a matching string. |
| 40 // * If anchored is true, only considers matches starting at offset. |
| 41 // Otherwise finds lefmost match at or after offset. |
| 42 // * If longest is true, returns the longest match starting |
| 43 // at the chosen start point. Otherwise returns the so-called |
| 44 // left-biased match, the one traditional backtracking engines |
| 45 // (like Perl and PCRE) find. |
| 46 // Records submatch boundaries in submatch[1..nsubmatch-1]. |
| 47 // Submatch[0] is the entire match. When there is a choice in |
| 48 // which text matches each subexpression, the submatch boundaries |
| 49 // are chosen to match what a backtracking implementation would choose. |
| 50 bool Search(const StringPiece& text, const StringPiece& context, |
| 51 bool anchored, bool longest, |
| 52 StringPiece* submatch, int nsubmatch); |
| 53 |
| 54 static const int Debug = 0; |
| 55 |
| 56 private: |
| 57 struct Thread { |
| 58 union { |
| 59 int id; |
| 60 Thread* next; // when on free list |
| 61 }; |
| 62 const char** capture; |
| 63 }; |
| 64 |
| 65 // State for explicit stack in AddToThreadq. |
| 66 struct AddState { |
| 67 int id; // Inst to process |
| 68 int j; |
| 69 const char* cap_j; // if j>=0, set capture[j] = cap_j before processing ip |
| 70 |
| 71 AddState() |
| 72 : id(0), j(-1), cap_j(NULL) {} |
| 73 explicit AddState(int id) |
| 74 : id(id), j(-1), cap_j(NULL) {} |
| 75 AddState(int id, const char* cap_j, int j) |
| 76 : id(id), j(j), cap_j(cap_j) {} |
| 77 }; |
| 78 |
| 79 // Threadq is a list of threads. The list is sorted by the order |
| 80 // in which Perl would explore that particular state -- the earlier |
| 81 // choices appear earlier in the list. |
| 82 typedef SparseArray<Thread*> Threadq; |
| 83 |
| 84 inline Thread* AllocThread(); |
| 85 inline void FreeThread(Thread*); |
| 86 |
| 87 // Add id (or its children, following unlabeled arrows) |
| 88 // to the workqueue q with associated capture info. |
| 89 void AddToThreadq(Threadq* q, int id, int flag, |
| 90 const char* p, const char** capture); |
| 91 |
| 92 // Run runq on byte c, appending new states to nextq. |
| 93 // Updates matched_ and match_ as new, better matches are found. |
| 94 // p is position of the next byte (the one after c) |
| 95 // in the input string, used when processing capturing parens. |
| 96 // flag is the bitwise or of Bol, Eol, etc., specifying whether |
| 97 // ^, $ and \b match the current input point (after c). |
| 98 inline int Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p)
; |
| 99 |
| 100 // Returns text version of capture information, for debugging. |
| 101 string FormatCapture(const char** capture); |
| 102 |
| 103 inline void CopyCapture(const char** dst, const char** src); |
| 104 |
| 105 // Computes whether all matches must begin with the same first |
| 106 // byte, and if so, returns that byte. If not, returns -1. |
| 107 int ComputeFirstByte(); |
| 108 |
| 109 Prog* prog_; // underlying program |
| 110 int start_; // start instruction in program |
| 111 int ncapture_; // number of submatches to track |
| 112 bool longest_; // whether searching for longest match |
| 113 bool endmatch_; // whether match must end at text.end() |
| 114 const char* btext_; // beginning of text being matched (for FormatSubmatch) |
| 115 const char* etext_; // end of text being matched (for endmatch_) |
| 116 Threadq q0_, q1_; // pre-allocated for Search. |
| 117 const char** match_; // best match so far |
| 118 bool matched_; // any match so far? |
| 119 AddState* astack_; // pre-allocated for AddToThreadq |
| 120 int nastack_; |
| 121 int first_byte_; // required first byte for match, or -1 if none |
| 122 |
| 123 Thread* free_threads_; // free list |
| 124 |
| 125 DISALLOW_EVIL_CONSTRUCTORS(NFA); |
| 126 }; |
| 127 |
| 128 NFA::NFA(Prog* prog) { |
| 129 prog_ = prog; |
| 130 start_ = prog->start(); |
| 131 ncapture_ = 0; |
| 132 longest_ = false; |
| 133 endmatch_ = false; |
| 134 btext_ = NULL; |
| 135 etext_ = NULL; |
| 136 q0_.resize(prog_->size()); |
| 137 q1_.resize(prog_->size()); |
| 138 nastack_ = 2*prog_->size(); |
| 139 astack_ = new AddState[nastack_]; |
| 140 match_ = NULL; |
| 141 matched_ = false; |
| 142 free_threads_ = NULL; |
| 143 first_byte_ = ComputeFirstByte(); |
| 144 } |
| 145 |
| 146 NFA::~NFA() { |
| 147 delete[] match_; |
| 148 delete[] astack_; |
| 149 Thread* next; |
| 150 for (Thread* t = free_threads_; t; t = next) { |
| 151 next = t->next; |
| 152 delete[] t->capture; |
| 153 delete t; |
| 154 } |
| 155 } |
| 156 |
| 157 void NFA::FreeThread(Thread *t) { |
| 158 if (t == NULL) |
| 159 return; |
| 160 t->next = free_threads_; |
| 161 free_threads_ = t; |
| 162 } |
| 163 |
| 164 NFA::Thread* NFA::AllocThread() { |
| 165 Thread* t = free_threads_; |
| 166 if (t == NULL) { |
| 167 t = new Thread; |
| 168 t->capture = new const char*[ncapture_]; |
| 169 return t; |
| 170 } |
| 171 free_threads_ = t->next; |
| 172 return t; |
| 173 } |
| 174 |
| 175 void NFA::CopyCapture(const char** dst, const char** src) { |
| 176 for (int i = 0; i < ncapture_; i+=2) { |
| 177 dst[i] = src[i]; |
| 178 dst[i+1] = src[i+1]; |
| 179 } |
| 180 } |
| 181 |
| 182 // Follows all empty arrows from id0 and enqueues all the states reached. |
| 183 // The bits in flag (Bol, Eol, etc.) specify whether ^, $ and \b match. |
| 184 // The pointer p is the current input position, and m is the |
| 185 // current set of match boundaries. |
| 186 void NFA::AddToThreadq(Threadq* q, int id0, int flag, |
| 187 const char* p, const char** capture) { |
| 188 if (id0 == 0) |
| 189 return; |
| 190 |
| 191 // Astack_ is pre-allocated to avoid resize operations. |
| 192 // It has room for 2*prog_->size() entries, which is enough: |
| 193 // Each inst in prog can be processed at most once, |
| 194 // pushing at most two entries on stk. |
| 195 |
| 196 int nstk = 0; |
| 197 AddState* stk = astack_; |
| 198 stk[nstk++] = AddState(id0); |
| 199 |
| 200 while (nstk > 0) { |
| 201 DCHECK_LE(nstk, nastack_); |
| 202 const AddState& a = stk[--nstk]; |
| 203 if (a.j >= 0) |
| 204 capture[a.j] = a.cap_j; |
| 205 |
| 206 int id = a.id; |
| 207 if (id == 0) |
| 208 continue; |
| 209 if (q->has_index(id)) { |
| 210 if (Debug) |
| 211 fprintf(stderr, " [%d%s]\n", id, FormatCapture(capture).c_str()); |
| 212 continue; |
| 213 } |
| 214 |
| 215 // Create entry in q no matter what. We might fill it in below, |
| 216 // or we might not. Even if not, it is necessary to have it, |
| 217 // so that we don't revisit id0 during the recursion. |
| 218 q->set_new(id, NULL); |
| 219 |
| 220 Thread** tp = &q->find(id)->second; |
| 221 int j; |
| 222 Thread* t; |
| 223 Prog::Inst* ip = prog_->inst(id); |
| 224 switch (ip->opcode()) { |
| 225 default: |
| 226 LOG(DFATAL) << "unhandled " << ip->opcode() << " in AddToThreadq"; |
| 227 break; |
| 228 |
| 229 case kInstFail: |
| 230 break; |
| 231 |
| 232 case kInstAltMatch: |
| 233 // Save state; will pick up at next byte. |
| 234 t = AllocThread(); |
| 235 t->id = id; |
| 236 CopyCapture(t->capture, capture); |
| 237 *tp = t; |
| 238 // fall through |
| 239 |
| 240 case kInstAlt: |
| 241 // Explore alternatives. |
| 242 stk[nstk++] = AddState(ip->out1()); |
| 243 stk[nstk++] = AddState(ip->out()); |
| 244 break; |
| 245 |
| 246 case kInstNop: |
| 247 // Continue on. |
| 248 stk[nstk++] = AddState(ip->out()); |
| 249 break; |
| 250 |
| 251 case kInstCapture: |
| 252 if ((j=ip->cap()) < ncapture_) { |
| 253 // Push a dummy whose only job is to restore capture[j] |
| 254 // once we finish exploring this possibility. |
| 255 stk[nstk++] = AddState(0, capture[j], j); |
| 256 |
| 257 // Record capture. |
| 258 capture[j] = p; |
| 259 } |
| 260 stk[nstk++] = AddState(ip->out()); |
| 261 break; |
| 262 |
| 263 case kInstMatch: |
| 264 case kInstByteRange: |
| 265 // Save state; will pick up at next byte. |
| 266 t = AllocThread(); |
| 267 t->id = id; |
| 268 CopyCapture(t->capture, capture); |
| 269 *tp = t; |
| 270 if (Debug) |
| 271 fprintf(stderr, " + %d%s [%p]\n", id, FormatCapture(t->capture).c_str(),
t); |
| 272 break; |
| 273 |
| 274 case kInstEmptyWidth: |
| 275 // Continue on if we have all the right flag bits. |
| 276 if (ip->empty() & ~flag) |
| 277 break; |
| 278 stk[nstk++] = AddState(ip->out()); |
| 279 break; |
| 280 } |
| 281 } |
| 282 } |
| 283 |
| 284 // Run runq on byte c, appending new states to nextq. |
| 285 // Updates match as new, better matches are found. |
| 286 // p is position of the byte c in the input string, |
| 287 // used when processing capturing parens. |
| 288 // flag is the bitwise or of Bol, Eol, etc., specifying whether |
| 289 // ^, $ and \b match the current input point (after c). |
| 290 // Frees all the threads on runq. |
| 291 // If there is a shortcut to the end, returns that shortcut. |
| 292 int NFA::Step(Threadq* runq, Threadq* nextq, int c, int flag, const char* p) { |
| 293 nextq->clear(); |
| 294 |
| 295 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
| 296 Thread* t = i->second; |
| 297 if (t == NULL) |
| 298 continue; |
| 299 |
| 300 if (longest_) { |
| 301 // Can skip any threads started after our current best match. |
| 302 if (matched_ && match_[0] < t->capture[0]) { |
| 303 FreeThread(t); |
| 304 continue; |
| 305 } |
| 306 } |
| 307 |
| 308 int id = t->id; |
| 309 Prog::Inst* ip = prog_->inst(id); |
| 310 |
| 311 switch (ip->opcode()) { |
| 312 default: |
| 313 // Should only see the values handled below. |
| 314 LOG(DFATAL) << "Unhandled " << ip->opcode() << " in step"; |
| 315 break; |
| 316 |
| 317 case kInstByteRange: |
| 318 if (ip->Matches(c)) |
| 319 AddToThreadq(nextq, ip->out(), flag, p+1, t->capture); |
| 320 break; |
| 321 |
| 322 case kInstAltMatch: |
| 323 if (i != runq->begin()) |
| 324 break; |
| 325 // The match is ours if we want it. |
| 326 if (ip->greedy(prog_) || longest_) { |
| 327 CopyCapture((const char**)match_, t->capture); |
| 328 FreeThread(t); |
| 329 for (++i; i != runq->end(); ++i) |
| 330 FreeThread(i->second); |
| 331 runq->clear(); |
| 332 matched_ = true; |
| 333 if (ip->greedy(prog_)) |
| 334 return ip->out1(); |
| 335 return ip->out(); |
| 336 } |
| 337 break; |
| 338 |
| 339 case kInstMatch: |
| 340 if (endmatch_ && p != etext_) |
| 341 break; |
| 342 |
| 343 const char* old = t->capture[1]; // previous end pointer |
| 344 t->capture[1] = p; |
| 345 if (longest_) { |
| 346 // Leftmost-longest mode: save this match only if |
| 347 // it is either farther to the left or at the same |
| 348 // point but longer than an existing match. |
| 349 if (!matched_ || t->capture[0] < match_[0] || |
| 350 (t->capture[0] == match_[0] && t->capture[1] > match_[1])) |
| 351 CopyCapture((const char**)match_, t->capture); |
| 352 } else { |
| 353 // Leftmost-biased mode: this match is by definition |
| 354 // better than what we've already found (see next line). |
| 355 CopyCapture((const char**)match_, t->capture); |
| 356 |
| 357 // Cut off the threads that can only find matches |
| 358 // worse than the one we just found: don't run the |
| 359 // rest of the current Threadq. |
| 360 t->capture[0] = old; |
| 361 FreeThread(t); |
| 362 for (++i; i != runq->end(); ++i) |
| 363 FreeThread(i->second); |
| 364 runq->clear(); |
| 365 matched_ = true; |
| 366 return 0; |
| 367 } |
| 368 t->capture[0] = old; |
| 369 matched_ = true; |
| 370 break; |
| 371 } |
| 372 FreeThread(t); |
| 373 } |
| 374 runq->clear(); |
| 375 return 0; |
| 376 } |
| 377 |
| 378 string NFA::FormatCapture(const char** capture) { |
| 379 string s; |
| 380 |
| 381 for (int i = 0; i < ncapture_; i+=2) { |
| 382 if (capture[i] == NULL) |
| 383 StringAppendF(&s, "(?,?)"); |
| 384 else if (capture[i+1] == NULL) |
| 385 StringAppendF(&s, "(%d,?)", (int)(capture[i] - btext_)); |
| 386 else |
| 387 StringAppendF(&s, "(%d,%d)", |
| 388 (int)(capture[i] - btext_), |
| 389 (int)(capture[i+1] - btext_)); |
| 390 } |
| 391 return s; |
| 392 } |
| 393 |
| 394 // Returns whether haystack contains needle's memory. |
| 395 static bool StringPieceContains(const StringPiece haystack, const StringPiece ne
edle) { |
| 396 return haystack.begin() <= needle.begin() && |
| 397 haystack.end() >= needle.end(); |
| 398 } |
| 399 |
| 400 bool NFA::Search(const StringPiece& text, const StringPiece& const_context, |
| 401 bool anchored, bool longest, |
| 402 StringPiece* submatch, int nsubmatch) { |
| 403 if (start_ == 0) |
| 404 return false; |
| 405 |
| 406 StringPiece context = const_context; |
| 407 if (context.begin() == NULL) |
| 408 context = text; |
| 409 |
| 410 if (!StringPieceContains(context, text)) { |
| 411 LOG(FATAL) << "Bad args: context does not contain text " |
| 412 << reinterpret_cast<const void*>(context.begin()) |
| 413 << "+" << context.size() << " " |
| 414 << reinterpret_cast<const void*>(text.begin()) |
| 415 << "+" << text.size(); |
| 416 return false; |
| 417 } |
| 418 |
| 419 if (prog_->anchor_start() && context.begin() != text.begin()) |
| 420 return false; |
| 421 if (prog_->anchor_end() && context.end() != text.end()) |
| 422 return false; |
| 423 anchored |= prog_->anchor_start(); |
| 424 if (prog_->anchor_end()) { |
| 425 longest = true; |
| 426 endmatch_ = true; |
| 427 etext_ = text.end(); |
| 428 } |
| 429 |
| 430 if (nsubmatch < 0) { |
| 431 LOG(DFATAL) << "Bad args: nsubmatch=" << nsubmatch; |
| 432 return false; |
| 433 } |
| 434 |
| 435 // Save search parameters. |
| 436 ncapture_ = 2*nsubmatch; |
| 437 longest_ = longest; |
| 438 |
| 439 if (nsubmatch == 0) { |
| 440 // We need to maintain match[0], both to distinguish the |
| 441 // longest match (if longest is true) and also to tell |
| 442 // whether we've seen any matches at all. |
| 443 ncapture_ = 2; |
| 444 } |
| 445 |
| 446 match_ = new const char*[ncapture_]; |
| 447 matched_ = false; |
| 448 memset(match_, 0, ncapture_*sizeof match_[0]); |
| 449 |
| 450 // For debugging prints. |
| 451 btext_ = context.begin(); |
| 452 |
| 453 if (Debug) { |
| 454 fprintf(stderr, "NFA::Search %s (context: %s) anchored=%d longest=%d\n", |
| 455 text.as_string().c_str(), context.as_string().c_str(), anchored, |
| 456 longest); |
| 457 } |
| 458 |
| 459 // Set up search. |
| 460 Threadq* runq = &q0_; |
| 461 Threadq* nextq = &q1_; |
| 462 runq->clear(); |
| 463 nextq->clear(); |
| 464 memset(&match_[0], 0, ncapture_*sizeof match_[0]); |
| 465 const char* bp = context.begin(); |
| 466 int c = -1; |
| 467 int wasword = 0; |
| 468 |
| 469 if (text.begin() > context.begin()) { |
| 470 c = text.begin()[-1] & 0xFF; |
| 471 wasword = Prog::IsWordChar(c); |
| 472 } |
| 473 |
| 474 // Loop over the text, stepping the machine. |
| 475 for (const char* p = text.begin();; p++) { |
| 476 // Check for empty-width specials. |
| 477 int flag = 0; |
| 478 |
| 479 // ^ and \A |
| 480 if (p == context.begin()) |
| 481 flag |= kEmptyBeginText | kEmptyBeginLine; |
| 482 else if (p <= context.end() && p[-1] == '\n') |
| 483 flag |= kEmptyBeginLine; |
| 484 |
| 485 // $ and \z |
| 486 if (p == context.end()) |
| 487 flag |= kEmptyEndText | kEmptyEndLine; |
| 488 else if (p < context.end() && p[0] == '\n') |
| 489 flag |= kEmptyEndLine; |
| 490 |
| 491 // \b and \B |
| 492 int isword = 0; |
| 493 if (p < context.end()) |
| 494 isword = Prog::IsWordChar(p[0] & 0xFF); |
| 495 |
| 496 if (isword != wasword) |
| 497 flag |= kEmptyWordBoundary; |
| 498 else |
| 499 flag |= kEmptyNonWordBoundary; |
| 500 |
| 501 if (Debug) { |
| 502 fprintf(stderr, "%c[%#x/%d/%d]:", p > text.end() ? '$' : p == bp ? '^' : c
, flag, isword, wasword); |
| 503 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) { |
| 504 Thread* t = i->second; |
| 505 if (t == NULL) |
| 506 continue; |
| 507 fprintf(stderr, " %d%s", t->id, |
| 508 FormatCapture((const char**)t->capture).c_str()); |
| 509 } |
| 510 fprintf(stderr, "\n"); |
| 511 } |
| 512 |
| 513 // Process previous character (waited until now to avoid |
| 514 // repeating the flag computation above). |
| 515 // This is a no-op the first time around the loop, because |
| 516 // runq is empty. |
| 517 int id = Step(runq, nextq, c, flag, p-1); |
| 518 DCHECK_EQ(runq->size(), 0); |
| 519 swap(nextq, runq); |
| 520 nextq->clear(); |
| 521 if (id != 0) { |
| 522 // We're done: full match ahead. |
| 523 p = text.end(); |
| 524 for (;;) { |
| 525 Prog::Inst* ip = prog_->inst(id); |
| 526 switch (ip->opcode()) { |
| 527 default: |
| 528 LOG(DFATAL) << "Unexpected opcode in short circuit: " << ip->opcode(
); |
| 529 break; |
| 530 |
| 531 case kInstCapture: |
| 532 match_[ip->cap()] = p; |
| 533 id = ip->out(); |
| 534 continue; |
| 535 |
| 536 case kInstNop: |
| 537 id = ip->out(); |
| 538 continue; |
| 539 |
| 540 case kInstMatch: |
| 541 match_[1] = p; |
| 542 matched_ = true; |
| 543 break; |
| 544 |
| 545 case kInstEmptyWidth: |
| 546 if (ip->empty() & ~(kEmptyEndLine|kEmptyEndText)) { |
| 547 LOG(DFATAL) << "Unexpected empty-width in short circuit: " << ip->
empty(); |
| 548 break; |
| 549 } |
| 550 id = ip->out(); |
| 551 continue; |
| 552 } |
| 553 break; |
| 554 } |
| 555 break; |
| 556 } |
| 557 |
| 558 if (p > text.end()) |
| 559 break; |
| 560 |
| 561 // Start a new thread if there have not been any matches. |
| 562 // (No point in starting a new thread if there have been |
| 563 // matches, since it would be to the right of the match |
| 564 // we already found.) |
| 565 if (!matched_ && (!anchored || p == text.begin())) { |
| 566 // If there's a required first byte for an unanchored search |
| 567 // and we're not in the middle of any possible matches, |
| 568 // use memchr to search for the byte quickly. |
| 569 if (!anchored && first_byte_ >= 0 && runq->size() == 0 && |
| 570 p < text.end() && (p[0] & 0xFF) != first_byte_) { |
| 571 p = reinterpret_cast<const char*>(memchr(p, first_byte_, |
| 572 text.end() - p)); |
| 573 if (p == NULL) { |
| 574 p = text.end(); |
| 575 isword = 0; |
| 576 } else { |
| 577 isword = Prog::IsWordChar(p[0] & 0xFF); |
| 578 } |
| 579 flag = Prog::EmptyFlags(context, p); |
| 580 } |
| 581 |
| 582 // Steal match storage (cleared but unused as of yet) |
| 583 // temporarily to hold match boundaries for new thread. |
| 584 match_[0] = p; |
| 585 AddToThreadq(runq, start_, flag, p, match_); |
| 586 match_[0] = NULL; |
| 587 } |
| 588 |
| 589 // If all the threads have died, stop early. |
| 590 if (runq->size() == 0) { |
| 591 if (Debug) |
| 592 fprintf(stderr, "dead\n"); |
| 593 break; |
| 594 } |
| 595 |
| 596 if (p == text.end()) |
| 597 c = 0; |
| 598 else |
| 599 c = *p & 0xFF; |
| 600 wasword = isword; |
| 601 |
| 602 // Will run step(runq, nextq, c, ...) on next iteration. See above. |
| 603 } |
| 604 |
| 605 for (Threadq::iterator i = runq->begin(); i != runq->end(); ++i) |
| 606 FreeThread(i->second); |
| 607 |
| 608 if (matched_) { |
| 609 for (int i = 0; i < nsubmatch; i++) |
| 610 submatch[i].set(match_[2*i], match_[2*i+1] - match_[2*i]); |
| 611 if (Debug) |
| 612 fprintf(stderr, "match (%d,%d)\n", |
| 613 static_cast<int>(match_[0] - btext_), |
| 614 static_cast<int>(match_[1] - btext_)); |
| 615 return true; |
| 616 } |
| 617 VLOG(1) << "No matches found"; |
| 618 return false; |
| 619 } |
| 620 |
| 621 // Computes whether all successful matches have a common first byte, |
| 622 // and if so, returns that byte. If not, returns -1. |
| 623 int NFA::ComputeFirstByte() { |
| 624 if (start_ == 0) |
| 625 return -1; |
| 626 |
| 627 int b = -1; // first byte, not yet computed |
| 628 |
| 629 typedef SparseSet Workq; |
| 630 Workq q(prog_->size()); |
| 631 q.insert(start_); |
| 632 for (Workq::iterator it = q.begin(); it != q.end(); ++it) { |
| 633 int id = *it; |
| 634 Prog::Inst* ip = prog_->inst(id); |
| 635 switch (ip->opcode()) { |
| 636 default: |
| 637 LOG(DFATAL) << "unhandled " << ip->opcode() << " in ComputeFirstByte"; |
| 638 break; |
| 639 |
| 640 case kInstMatch: |
| 641 // The empty string matches: no first byte. |
| 642 return -1; |
| 643 |
| 644 case kInstByteRange: |
| 645 // Must match only a single byte |
| 646 if (ip->lo() != ip->hi()) |
| 647 return -1; |
| 648 if (ip->foldcase() && 'a' <= ip->lo() && ip->lo() <= 'z') |
| 649 return -1; |
| 650 // If we haven't seen any bytes yet, record it; |
| 651 // otherwise must match the one we saw before. |
| 652 if (b == -1) |
| 653 b = ip->lo(); |
| 654 else if (b != ip->lo()) |
| 655 return -1; |
| 656 break; |
| 657 |
| 658 case kInstNop: |
| 659 case kInstCapture: |
| 660 case kInstEmptyWidth: |
| 661 // Continue on. |
| 662 // Ignore ip->empty() flags for kInstEmptyWidth |
| 663 // in order to be as conservative as possible |
| 664 // (assume all possible empty-width flags are true). |
| 665 if (ip->out()) |
| 666 q.insert(ip->out()); |
| 667 break; |
| 668 |
| 669 case kInstAlt: |
| 670 case kInstAltMatch: |
| 671 // Explore alternatives. |
| 672 if (ip->out()) |
| 673 q.insert(ip->out()); |
| 674 if (ip->out1()) |
| 675 q.insert(ip->out1()); |
| 676 break; |
| 677 |
| 678 case kInstFail: |
| 679 break; |
| 680 } |
| 681 } |
| 682 return b; |
| 683 } |
| 684 |
| 685 bool |
| 686 Prog::SearchNFA(const StringPiece& text, const StringPiece& context, |
| 687 Anchor anchor, MatchKind kind, |
| 688 StringPiece* match, int nmatch) { |
| 689 if (NFA::Debug) |
| 690 Dump(); |
| 691 |
| 692 NFA nfa(this); |
| 693 StringPiece sp; |
| 694 if (kind == kFullMatch) { |
| 695 anchor = kAnchored; |
| 696 if (nmatch == 0) { |
| 697 match = &sp; |
| 698 nmatch = 1; |
| 699 } |
| 700 } |
| 701 if (!nfa.Search(text, context, anchor == kAnchored, kind != kFirstMatch, match
, nmatch)) |
| 702 return false; |
| 703 if (kind == kFullMatch && match[0].end() != text.end()) |
| 704 return false; |
| 705 return true; |
| 706 } |
| 707 |
| 708 } // namespace re2 |
| 709 |
OLD | NEW |