OLD | NEW |
1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. |
2 // Use of this source code is governed by a BSD-style | 2 // Use of this source code is governed by a BSD-style |
3 // license that can be found in the LICENSE file. | 3 // license that can be found in the LICENSE file. |
4 | 4 |
5 // Regular expression engine tester -- test all the implementations against each other. | 5 // Regular expression engine tester -- test all the implementations against each other. |
6 | 6 |
7 #include "util/util.h" | 7 #include "util/util.h" |
8 #include "util/flags.h" | 8 #include "util/flags.h" |
9 #include "re2/testing/tester.h" | 9 #include "re2/testing/tester.h" |
10 #include "re2/prog.h" | 10 #include "re2/prog.h" |
(...skipping 228 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
239 } | 239 } |
240 | 240 |
241 // Compile regexp to RE. | 241 // Compile regexp to RE. |
242 // PCRE as exposed by the RE interface isn't always usable. | 242 // PCRE as exposed by the RE interface isn't always usable. |
243 // 1. It disagrees about handling of empty-string repetitions | 243 // 1. It disagrees about handling of empty-string repetitions |
244 // like matching (a*)* against "b". PCRE treats the (a*) as | 244 // like matching (a*)* against "b". PCRE treats the (a*) as |
245 // occurring once, while we treat it as occurring not at all. | 245 // occurring once, while we treat it as occurring not at all. |
246 // 2. It treats $ as this weird thing meaning end of string | 246 // 2. It treats $ as this weird thing meaning end of string |
247 // or before the \n at the end of the string. | 247 // or before the \n at the end of the string. |
248 // 3. It doesn't implement POSIX leftmost-longest matching. | 248 // 3. It doesn't implement POSIX leftmost-longest matching. |
249 // 4. It lets \s match vertical tab. | |
250 // MimicsPCRE() detects 1 and 2. | 249 // MimicsPCRE() detects 1 and 2. |
251 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && | 250 if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() && |
252 kind_ != Prog::kLongestMatch) { | 251 kind_ != Prog::kLongestMatch) { |
253 PCRE_Options o; | 252 PCRE_Options o; |
254 o.set_option(PCRE::UTF8); | 253 o.set_option(PCRE::UTF8); |
255 if (flags & Regexp::Latin1) | 254 if (flags & Regexp::Latin1) |
256 o.set_option(PCRE::None); | 255 o.set_option(PCRE::None); |
257 // PCRE has interface bug keeping us from finding $0, so | 256 // PCRE has interface bug keeping us from finding $0, so |
258 // add one more layer of parens. | 257 // add one more layer of parens. |
259 re_ = new PCRE("("+re+")", o); | 258 re_ = new PCRE("("+re+")", o); |
(...skipping 77 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
337 result->matched = | 336 result->matched = |
338 prog_->SearchDFA(text, context, anchor, kind_, result->submatch, | 337 prog_->SearchDFA(text, context, anchor, kind_, result->submatch, |
339 &result->skipped, NULL); | 338 &result->skipped, NULL); |
340 // If anchored, no need for second run, | 339 // If anchored, no need for second run, |
341 // but do it anyway to find more bugs. | 340 // but do it anyway to find more bugs. |
342 if (result->matched) { | 341 if (result->matched) { |
343 if (!rprog_->SearchDFA(result->submatch[0], context, | 342 if (!rprog_->SearchDFA(result->submatch[0], context, |
344 Prog::kAnchored, Prog::kLongestMatch, | 343 Prog::kAnchored, Prog::kLongestMatch, |
345 result->submatch, | 344 result->submatch, |
346 &result->skipped, NULL)) { | 345 &result->skipped, NULL)) { |
347 LOG(ERROR) << "Reverse DFA inconsistency: " | 346 LOG(ERROR) << "Reverse DFA inconsistency: " << CEscape(regexp_str_) |
348 << CEscape(regexp_str_) | |
349 << " on " << CEscape(text); | 347 << " on " << CEscape(text); |
350 result->matched = false; | 348 result->matched = false; |
351 } | 349 } |
352 } | 350 } |
353 result->have_submatch0 = true; | 351 result->have_submatch0 = true; |
354 break; | 352 break; |
355 | 353 |
356 case kEngineOnePass: | 354 case kEngineOnePass: |
357 if (prog_ == NULL || | 355 if (prog_ == NULL || |
358 anchor == Prog::kUnanchored || | 356 anchor == Prog::kUnanchored || |
(...skipping 26 matching lines...) Expand all Loading... |
385 } | 383 } |
386 | 384 |
387 RE2::Anchor re_anchor; | 385 RE2::Anchor re_anchor; |
388 if (anchor == Prog::kAnchored) | 386 if (anchor == Prog::kAnchored) |
389 re_anchor = RE2::ANCHOR_START; | 387 re_anchor = RE2::ANCHOR_START; |
390 else | 388 else |
391 re_anchor = RE2::UNANCHORED; | 389 re_anchor = RE2::UNANCHORED; |
392 if (kind_ == Prog::kFullMatch) | 390 if (kind_ == Prog::kFullMatch) |
393 re_anchor = RE2::ANCHOR_BOTH; | 391 re_anchor = RE2::ANCHOR_BOTH; |
394 | 392 |
395 result->matched = re2_->Match( | 393 result->matched = re2_->Match(context, |
396 context, | 394 text.begin() - context.begin(), |
397 static_cast<int>(text.begin() - context.begin()), | 395 text.end() - context.begin(), |
398 static_cast<int>(text.end() - context.begin()), | 396 re_anchor, result->submatch, nsubmatch); |
399 re_anchor, | |
400 result->submatch, | |
401 nsubmatch); | |
402 result->have_submatch = nsubmatch > 0; | 397 result->have_submatch = nsubmatch > 0; |
403 break; | 398 break; |
404 } | 399 } |
405 | 400 |
406 case kEnginePCRE: { | 401 case kEnginePCRE: { |
407 if (!re_ || text.begin() != context.begin() || | 402 if (!re_ || text.begin() != context.begin() || |
408 text.end() != context.end()) { | 403 text.end() != context.end()) { |
409 result->skipped = true; | 404 result->skipped = true; |
410 break; | 405 break; |
411 } | 406 } |
412 | |
413 // PCRE 8.34 or so started allowing vertical tab to match \s, | |
414 // following a change made in Perl 5.18. RE2 does not. | |
415 if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) && | |
416 text.contains("\v")) { | |
417 result->skipped = true; | |
418 break; | |
419 } | |
420 | 407 |
421 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; | 408 const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch]; |
422 PCRE::Arg *a = new PCRE::Arg[nsubmatch]; | 409 PCRE::Arg *a = new PCRE::Arg[nsubmatch]; |
423 for (int i = 0; i < nsubmatch; i++) { | 410 for (int i = 0; i < nsubmatch; i++) { |
424 a[i] = PCRE::Arg(&result->submatch[i]); | 411 a[i] = PCRE::Arg(&result->submatch[i]); |
425 argptr[i] = &a[i]; | 412 argptr[i] = &a[i]; |
426 } | 413 } |
427 int consumed; | 414 int consumed; |
428 PCRE::Anchor pcre_anchor; | 415 PCRE::Anchor pcre_anchor; |
429 if (anchor == Prog::kAnchored) | 416 if (anchor == Prog::kAnchored) |
(...skipping 81 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
511 | 498 |
512 Result r; | 499 Result r; |
513 RunSearch(i, text, context, anchor, &r); | 500 RunSearch(i, text, context, anchor, &r); |
514 if (ResultOkay(r, correct)) { | 501 if (ResultOkay(r, correct)) { |
515 if (FLAGS_log_okay) | 502 if (FLAGS_log_okay) |
516 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); | 503 LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor); |
517 continue; | 504 continue; |
518 } | 505 } |
519 | 506 |
520 // We disagree with PCRE on the meaning of some Unicode matches. | 507 // We disagree with PCRE on the meaning of some Unicode matches. |
521 // In particular, we treat non-ASCII UTF-8 as non-word characters. | 508 // In particular, we treat all non-ASCII UTF-8 as word characters. |
522 // We also treat "empty" character sets like [^\w\W] as being | 509 // We also treat "empty" character sets like [^\w\W] as being |
523 // impossible to match, while PCRE apparently excludes some code | 510 // impossible to match, while PCRE apparently excludes some code |
524 // points (e.g., 0x0080) from both \w and \W. | 511 // points (e.g., 0x0080) from both \w and \W. |
525 if (i == kEnginePCRE && NonASCII(text)) | 512 if (i == kEnginePCRE && NonASCII(text)) |
526 continue; | 513 continue; |
527 | 514 |
528 if (!r.untrusted) | 515 if (!r.untrusted) |
529 all_okay = false; | 516 all_okay = false; |
530 | 517 |
531 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, | 518 LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text, |
(...skipping 66 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
598 for (int j = 0; j < arraysize(parse_modes); j++) { | 585 for (int j = 0; j < arraysize(parse_modes); j++) { |
599 TestInstance* t = new TestInstance(regexp, kinds[i], | 586 TestInstance* t = new TestInstance(regexp, kinds[i], |
600 parse_modes[j].parse_flags); | 587 parse_modes[j].parse_flags); |
601 error_ |= t->error(); | 588 error_ |= t->error(); |
602 v_.push_back(t); | 589 v_.push_back(t); |
603 } | 590 } |
604 } | 591 } |
605 } | 592 } |
606 | 593 |
607 Tester::~Tester() { | 594 Tester::~Tester() { |
608 for (size_t i = 0; i < v_.size(); i++) | 595 for (int i = 0; i < v_.size(); i++) |
609 delete v_[i]; | 596 delete v_[i]; |
610 } | 597 } |
611 | 598 |
612 bool Tester::TestCase(const StringPiece& text, const StringPiece& context, | 599 bool Tester::TestCase(const StringPiece& text, const StringPiece& context, |
613 Prog::Anchor anchor) { | 600 Prog::Anchor anchor) { |
614 bool okay = true; | 601 bool okay = true; |
615 for (size_t i = 0; i < v_.size(); i++) | 602 for (int i = 0; i < v_.size(); i++) |
616 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); | 603 okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor)); |
617 return okay; | 604 return okay; |
618 } | 605 } |
619 | 606 |
620 static Prog::Anchor anchors[] = { | 607 static Prog::Anchor anchors[] = { |
621 Prog::kAnchored, | 608 Prog::kAnchored, |
622 Prog::kUnanchored | 609 Prog::kUnanchored |
623 }; | 610 }; |
624 | 611 |
625 bool Tester::TestInput(const StringPiece& text) { | 612 bool Tester::TestInput(const StringPiece& text) { |
(...skipping 18 matching lines...) Expand all Loading... |
644 return okay; | 631 return okay; |
645 } | 632 } |
646 | 633 |
647 bool TestRegexpOnText(const StringPiece& regexp, | 634 bool TestRegexpOnText(const StringPiece& regexp, |
648 const StringPiece& text) { | 635 const StringPiece& text) { |
649 Tester t(regexp); | 636 Tester t(regexp); |
650 return t.TestInput(text); | 637 return t.TestInput(text); |
651 } | 638 } |
652 | 639 |
653 } // namespace re2 | 640 } // namespace re2 |
OLD | NEW |