| OLD | NEW |
| 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. |
| 2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
| 3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
| 4 | 4 |
| 5 // Regular expression engine tester -- test all the implementations against each
other. | 5 // Regular expression engine tester -- test all the implementations against each
other. |
| 6 | 6 |
| 7 #include "util/util.h" | 7 #include "util/util.h" |
| 8 #include "util/flags.h" | 8 #include "util/flags.h" |
| 9 #include "re2/testing/tester.h" | 9 #include "re2/testing/tester.h" |
| 10 #include "re2/prog.h" | 10 #include "re2/prog.h" |
| (...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 239 } | 239 } |
| 240 | 240 |
| 241 // Compile regexp to RE. | 241 // Compile regexp to RE. |
| 242 // PCRE as exposed by the RE interface isn't always usable. | 242 // PCRE as exposed by the RE interface isn't always usable. |
| 243 // 1. It disagrees about handling of empty-string reptitions | 243 // 1. It disagrees about handling of empty-string reptitions |
| 244 // like matching (a*)* against "b". PCRE treats the (a*) as | 244 // like matching (a*)* against "b". PCRE treats the (a*) as |
| 245 // occurring once, while we treat it as occurring not at all. | 245 // occurring once, while we treat it as occurring not at all. |
| 246 // 2. It treats $ as this weird thing meaning end of string | 246 // 2. It treats $ as this weird thing meaning end of string |
| 247 // or before the \n at the end of the string. | 247 // or before the \n at the end of the string. |
| 248 // 3. It doesn't implement POSIX leftmost-longest matching. | 248 // 3. It doesn't implement POSIX leftmost-longest matching. |
| 249 // 4. It lets \s match vertical tab. | |
| 250 // MimicsPCRE() detects 1 and 2. | 249 // MimicsPCRE() detects 1 and 2. |
| 251 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && | 250 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && |
| 252 kind_ != Prog::kLongestMatch) { | 251 kind_ != Prog::kLongestMatch) { |
| 253 PCRE_Options o; | 252 PCRE_Options o; |
| 254 o.set_option(PCRE::UTF8); | 253 o.set_option(PCRE::UTF8); |
| 255 if (flags & Regexp::Latin1) | 254 if (flags & Regexp::Latin1) |
| 256 o.set_option(PCRE::None); | 255 o.set_option(PCRE::None); |
| 257 // PCRE has interface bug keeping us from finding $0, so | 256 // PCRE has interface bug keeping us from finding $0, so |
| 258 // add one more layer of parens. | 257 // add one more layer of parens. |
| 259 re_ = new PCRE("("+re+")", o); | 258 re_ = new PCRE("("+re+")", o); |
| (...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 337 result->matched = | 336 result->matched = |
| 338 prog_->SearchDFA(text, context, anchor, kind_, result->submatch, | 337 prog_->SearchDFA(text, context, anchor, kind_, result->submatch, |
| 339 &result->skipped, NULL); | 338 &result->skipped, NULL); |
| 340 // If anchored, no need for second run, | 339 // If anchored, no need for second run, |
| 341 // but do it anyway to find more bugs. | 340 // but do it anyway to find more bugs. |
| 342 if (result->matched) { | 341 if (result->matched) { |
| 343 if (!rprog_->SearchDFA(result->submatch[0], context, | 342 if (!rprog_->SearchDFA(result->submatch[0], context, |
| 344 Prog::kAnchored, Prog::kLongestMatch, | 343 Prog::kAnchored, Prog::kLongestMatch, |
| 345 result->submatch, | 344 result->submatch, |
| 346 &result->skipped, NULL)) { | 345 &result->skipped, NULL)) { |
| 347 LOG(ERROR) << "Reverse DFA inconsistency: " | 346 LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) |
| 348 << CEscape(regexp_str_) | |
| 349 << " on " << CEscape(text); | 347 << " on " << CEscape(text); |
| 350 result->matched = false; | 348 result->matched = false; |
| 351 } | 349 } |
| 352 } | 350 } |
| 353 result->have_submatch0 = true; | 351 result->have_submatch0 = true; |
| 354 break; | 352 break; |
| 355 | 353 |
| 356 case kEngineOnePass: | 354 case kEngineOnePass: |
| 357 if (prog_ == NULL || | 355 if (prog_ == NULL || |
| 358 anchor == Prog::kUnanchored || | 356 anchor == Prog::kUnanchored || |
| (...skipping 26 matching lines...) Expand all Loading... |
| 385 } | 383 } |
| 386 | 384 |
| 387 RE2::Anchor re_anchor; | 385 RE2::Anchor re_anchor; |
| 388 if (anchor == Prog::kAnchored) | 386 if (anchor == Prog::kAnchored) |
| 389 re_anchor = RE2::ANCHOR_START; | 387 re_anchor = RE2::ANCHOR_START; |
| 390 else | 388 else |
| 391 re_anchor = RE2::UNANCHORED; | 389 re_anchor = RE2::UNANCHORED; |
| 392 if (kind_ == Prog::kFullMatch) | 390 if (kind_ == Prog::kFullMatch) |
| 393 re_anchor = RE2::ANCHOR_BOTH; | 391 re_anchor = RE2::ANCHOR_BOTH; |
| 394 | 392 |
| 395 result->matched = re2_->Match( | 393 result->matched = re2_->Match(context, |
| 396 context, | 394 text.begin() - context.begin(), |
| 397 static_cast<int>(text.begin() - context.begin()), | 395 text.end() - context.begin(), |
| 398 static_cast<int>(text.end() - context.begin()), | 396 re_anchor, result->submatch, nsubmatch); |
| 399 re_anchor, | |
| 400 result->submatch, | |
| 401 nsubmatch); | |
| 402 result->have_submatch = nsubmatch > 0; | 397 result->have_submatch = nsubmatch > 0; |
| 403 break; | 398 break; |
| 404 } | 399 } |
| 405 | 400 |
| 406 case kEnginePCRE: { | 401 case kEnginePCRE: { |
| 407 if (!re_ || text.begin() != context.begin() || | 402 if (!re_ || text.begin() != context.begin() || |
| 408 text.end() != context.end()) { | 403 text.end() != context.end()) { |
| 409 result->skipped = true; | 404 result->skipped = true; |
| 410 break; | 405 break; |
| 411 } | 406 } |
| 412 | |
| 413 // PCRE 8.34 or so started allowing vertical tab to match \s, | |
| 414 // following a change made in Perl 5.18. RE2 does not. | |
| 415 if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) && | |
| 416 text.contains("\v")) { | |
| 417 result->skipped = true; | |
| 418 break; | |
| 419 } | |
| 420 | 407 |
| 421 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; | 408 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; |
| 422 PCRE::Arg *a = new PCRE::Arg[nsubmatch]; | 409 PCRE::Arg *a = new PCRE::Arg[nsubmatch]; |
| 423 for (int i = 0; i < nsubmatch; i++) { | 410 for (int i = 0; i < nsubmatch; i++) { |
| 424 a[i] = PCRE::Arg(&result->submatch[i]); | 411 a[i] = PCRE::Arg(&result->submatch[i]); |
| 425 argptr[i] = &a[i]; | 412 argptr[i] = &a[i]; |
| 426 } | 413 } |
| 427 int consumed; | 414 int consumed; |
| 428 PCRE::Anchor pcre_anchor; | 415 PCRE::Anchor pcre_anchor; |
| 429 if (anchor == Prog::kAnchored) | 416 if (anchor == Prog::kAnchored) |
| (...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 511 | 498 |
| 512 Result r; | 499 Result r; |
| 513 RunSearch(i, text, context, anchor, &r); | 500 RunSearch(i, text, context, anchor, &r); |
| 514 if (ResultOkay(r, correct)) { | 501 if (ResultOkay(r, correct)) { |
| 515 if (FLAGS_log_okay) | 502 if (FLAGS_log_okay) |
| 516 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); | 503 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); |
| 517 continue; | 504 continue; |
| 518 } | 505 } |
| 519 | 506 |
| 520 // We disagree with PCRE on the meaning of some Unicode matches. | 507 // We disagree with PCRE on the meaning of some Unicode matches. |
| 521 // In particular, we treat non-ASCII UTF-8 as non-word characters. | 508 // In particular, we treat all non-ASCII UTF-8 as word characters. |
| 522 // We also treat "empty" character sets like [^\w\W] as being | 509 // We also treat "empty" character sets like [^\w\W] as being |
| 523 // impossible to match, while PCRE apparently excludes some code | 510 // impossible to match, while PCRE apparently excludes some code |
| 524 // points (e.g., 0x0080) from both \w and \W. | 511 // points (e.g., 0x0080) from both \w and \W. |
| 525 if (i == kEnginePCRE && NonASCII(text)) | 512 if (i == kEnginePCRE && NonASCII(text)) |
| 526 continue; | 513 continue; |
| 527 | 514 |
| 528 if (!r.untrusted) | 515 if (!r.untrusted) |
| 529 all_okay = false; | 516 all_okay = false; |
| 530 | 517 |
| 531 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, | 518 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, |
| (...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 598 for (int j = 0; j < arraysize(parse_modes); j++) { | 585 for (int j = 0; j < arraysize(parse_modes); j++) { |
| 599 TestInstance* t = new TestInstance(regexp, kinds[i], | 586 TestInstance* t = new TestInstance(regexp, kinds[i], |
| 600 parse_modes[j].parse_flags); | 587 parse_modes[j].parse_flags); |
| 601 error_ |= t->error(); | 588 error_ |= t->error(); |
| 602 v_.push_back(t); | 589 v_.push_back(t); |
| 603 } | 590 } |
| 604 } | 591 } |
| 605 } | 592 } |
| 606 | 593 |
| 607 Tester::~Tester() { | 594 Tester::~Tester() { |
| 608 for (size_t i = 0; i < v_.size(); i++) | 595 for (int i = 0; i < v_.size(); i++) |
| 609 delete v_[i]; | 596 delete v_[i]; |
| 610 } | 597 } |
| 611 | 598 |
| 612 bool Tester::TestCase(const StringPiece& text, const StringPiece& context, | 599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context, |
| 613 Prog::Anchor anchor) { | 600 Prog::Anchor anchor) { |
| 614 bool okay = true; | 601 bool okay = true; |
| 615 for (size_t i = 0; i < v_.size(); i++) | 602 for (int i = 0; i < v_.size(); i++) |
| 616 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); | 603 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); |
| 617 return okay; | 604 return okay; |
| 618 } | 605 } |
| 619 | 606 |
| 620 static Prog::Anchor anchors[] = { | 607 static Prog::Anchor anchors[] = { |
| 621 Prog::kAnchored, | 608 Prog::kAnchored, |
| 622 Prog::kUnanchored | 609 Prog::kUnanchored |
| 623 }; | 610 }; |
| 624 | 611 |
| 625 bool Tester::TestInput(const StringPiece& text) { | 612 bool Tester::TestInput(const StringPiece& text) { |
| (...skipping 18 matching lines...) Expand all Loading... |
| 644 return okay; | 631 return okay; |
| 645 } | 632 } |
| 646 | 633 |
| 647 bool TestRegexpOnText(const StringPiece& regexp, | 634 bool TestRegexpOnText(const StringPiece& regexp, |
| 648 const StringPiece& text) { | 635 const StringPiece& text) { |
| 649 Tester t(regexp); | 636 Tester t(regexp); |
| 650 return t.TestInput(text); | 637 return t.TestInput(text); |
| 651 } | 638 } |
| 652 | 639 |
| 653 } // namespace re2 | 640 } // namespace re2 |
| OLD | NEW |