Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Unified Diff: third_party/re2/re2/testing/tester.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/tostring.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/testing/tester.cc
diff --git a/third_party/re2/re2/testing/tester.cc b/third_party/re2/re2/testing/tester.cc
deleted file mode 100644
index cb12badebfa897888924954c488440d97e555cf9..0000000000000000000000000000000000000000
--- a/third_party/re2/re2/testing/tester.cc
+++ /dev/null
@@ -1,653 +0,0 @@
-// Copyright 2008 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Regular expression engine tester -- test all the implementations against each other.
-
-#include "util/util.h"
-#include "util/flags.h"
-#include "re2/testing/tester.h"
-#include "re2/prog.h"
-#include "re2/re2.h"
-#include "re2/regexp.h"
-
-DEFINE_bool(dump_prog, false, "dump regexp program");
-DEFINE_bool(log_okay, false, "log successful runs");
-DEFINE_bool(dump_rprog, false, "dump reversed regexp program");
-
-DEFINE_int32(max_regexp_failures, 100,
- "maximum number of regexp test failures (-1 = unlimited)");
-
-DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");
-
-namespace re2 {
-
-enum {
- kMaxSubmatch = 1+16, // $0...$16
-};
-
-const char* engine_types[kEngineMax] = {
- "Backtrack",
- "NFA",
- "DFA",
- "DFA1",
- "OnePass",
- "BitState",
- "RE2",
- "RE2a",
- "RE2b",
- "PCRE",
-};
-
-// Returns the name string for the type t.
-static string EngineString(Engine t) {
- if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {
- return StringPrintf("type%d", static_cast<int>(t));
- }
- return engine_types[t];
-}
-
-// Returns bit mask of engines to use.
-static uint32 Engines() {
- static uint32 cached_engines;
- static bool did_parse;
-
- if (did_parse)
- return cached_engines;
-
- if (FLAGS_regexp_engines.empty()) {
- cached_engines = ~0;
- } else {
- for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)
- if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))
- cached_engines |= 1<<i;
- }
-
- if (cached_engines == 0)
- LOG(INFO) << "Warning: no engines enabled.";
- if (!UsingPCRE)
- cached_engines &= ~(1<<kEnginePCRE);
- for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {
- if (cached_engines & (1<<i))
- LOG(INFO) << EngineString(i) << " enabled";
- }
- did_parse = true;
- return cached_engines;
-}
-
-// The result of running a match.
-struct TestInstance::Result {
- bool skipped; // test skipped: wasn't applicable
- bool matched; // found a match
- bool untrusted; // don't really trust the answer
- bool have_submatch; // computed all submatch info
- bool have_submatch0; // computed just submatch[0]
- StringPiece submatch[kMaxSubmatch];
-};
-
-typedef TestInstance::Result Result;
-
-// Formats a single capture range s in text in the form (a,b)
-// where a and b are the starting and ending offsets of s in text.
-static string FormatCapture(const StringPiece& text, const StringPiece& s) {
- if (s.begin() == NULL)
- return "(?,?)";
- return StringPrintf("(%d,%d)",
- static_cast<int>(s.begin() - text.begin()),
- static_cast<int>(s.end() - text.begin()));
-}
-
-// Returns whether text contains non-ASCII (>= 0x80) bytes.
-static bool NonASCII(const StringPiece& text) {
- for (int i = 0; i < text.size(); i++)
- if ((uint8)text[i] >= 0x80)
- return true;
- return false;
-}
-
-// Returns string representation of match kind.
-static string FormatKind(Prog::MatchKind kind) {
- switch (kind) {
- case Prog::kFullMatch:
- return "full match";
- case Prog::kLongestMatch:
- return "longest match";
- case Prog::kFirstMatch:
- return "first match";
- case Prog::kManyMatch:
- return "many match";
- }
- return "???";
-}
-
-// Returns string representation of anchor kind.
-static string FormatAnchor(Prog::Anchor anchor) {
- switch (anchor) {
- case Prog::kAnchored:
- return "anchored";
- case Prog::kUnanchored:
- return "unanchored";
- }
- return "???";
-}
-
-struct ParseMode {
- Regexp::ParseFlags parse_flags;
- string desc;
-};
-
-static const Regexp::ParseFlags single_line =
- Regexp::LikePerl;
-static const Regexp::ParseFlags multi_line =
- static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);
-
-static ParseMode parse_modes[] = {
- { single_line, "single-line" },
- { single_line|Regexp::Latin1, "single-line, latin1" },
- { multi_line, "multiline" },
- { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },
- { multi_line|Regexp::Latin1, "multiline, latin1" },
-};
-
-static string FormatMode(Regexp::ParseFlags flags) {
- for (int i = 0; i < arraysize(parse_modes); i++)
- if (parse_modes[i].parse_flags == flags)
- return parse_modes[i].desc;
- return StringPrintf("%#x", static_cast<uint>(flags));
-}
-
-// Constructs and saves all the matching engines that
-// will be required for the given tests.
-TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,
- Regexp::ParseFlags flags)
- : regexp_str_(regexp_str),
- kind_(kind),
- flags_(flags),
- error_(false),
- regexp_(NULL),
- num_captures_(0),
- prog_(NULL),
- rprog_(NULL),
- re_(NULL),
- re2_(NULL) {
-
- VLOG(1) << CEscape(regexp_str);
-
- // Compile regexp to prog.
- // Always required - needed for backtracking (reference implementation).
- RegexpStatus status;
- regexp_ = Regexp::Parse(regexp_str, flags, &status);
- if (regexp_ == NULL) {
- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)
- << " mode: " << FormatMode(flags);
- error_ = true;
- return;
- }
- num_captures_ = regexp_->NumCaptures();
- prog_ = regexp_->CompileToProg(0);
- if (prog_ == NULL) {
- LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);
- error_ = true;
- return;
- }
- if (FLAGS_dump_prog) {
- LOG(INFO) << "Prog for "
- << " regexp "
- << CEscape(regexp_str_)
- << " (" << FormatKind(kind_)
- << ", " << FormatMode(flags_)
- << ")\n"
- << prog_->Dump();
- }
-
- // Compile regexp to reversed prog. Only needed for DFA engines.
- if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {
- rprog_ = regexp_->CompileToReverseProg(0);
- if (rprog_ == NULL) {
- LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);
- error_ = true;
- return;
- }
- if (FLAGS_dump_rprog)
- LOG(INFO) << rprog_->Dump();
- }
-
- // Create re string that will be used for RE and RE2.
- string re = regexp_str.as_string();
- // Accomodate flags.
- // Regexp::Latin1 will be accomodated below.
- if (!(flags & Regexp::OneLine))
- re = "(?m)" + re;
- if (flags & Regexp::NonGreedy)
- re = "(?U)" + re;
- if (flags & Regexp::DotNL)
- re = "(?s)" + re;
-
- // Compile regexp to RE2.
- if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {
- RE2::Options options;
- if (flags & Regexp::Latin1)
- options.set_encoding(RE2::Options::EncodingLatin1);
- if (kind_ == Prog::kLongestMatch)
- options.set_longest_match(true);
- re2_ = new RE2(re, options);
- if (!re2_->error().empty()) {
- LOG(INFO) << "Cannot RE2: " << CEscape(re);
- error_ = true;
- return;
- }
- }
-
- // Compile regexp to RE.
- // PCRE as exposed by the RE interface isn't always usable.
- // 1. It disagrees about handling of empty-string reptitions
- // like matching (a*)* against "b". PCRE treats the (a*) as
- // occurring once, while we treat it as occurring not at all.
- // 2. It treats $ as this weird thing meaning end of string
- // or before the \n at the end of the string.
- // 3. It doesn't implement POSIX leftmost-longest matching.
- // 4. It lets \s match vertical tab.
- // MimicsPCRE() detects 1 and 2.
- if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&
- kind_ != Prog::kLongestMatch) {
- PCRE_Options o;
- o.set_option(PCRE::UTF8);
- if (flags & Regexp::Latin1)
- o.set_option(PCRE::None);
- // PCRE has interface bug keeping us from finding $0, so
- // add one more layer of parens.
- re_ = new PCRE("("+re+")", o);
- if (!re_->error().empty()) {
- LOG(INFO) << "Cannot PCRE: " << CEscape(re);
- error_ = true;
- return;
- }
- }
-}
-
-TestInstance::~TestInstance() {
- if (regexp_)
- regexp_->Decref();
- delete prog_;
- delete rprog_;
- delete re_;
- delete re2_;
-}
-
-// Runs a single search using the named engine type.
-// This interface hides all the irregularities of the various
-// engine interfaces from the rest of this file.
-void TestInstance::RunSearch(Engine type,
- const StringPiece& orig_text,
- const StringPiece& orig_context,
- Prog::Anchor anchor,
- Result *result) {
- memset(result, 0, sizeof *result);
- if (regexp_ == NULL) {
- result->skipped = true;
- return;
- }
- int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0
- if (nsubmatch > kMaxSubmatch)
- nsubmatch = kMaxSubmatch;
-
- StringPiece text = orig_text;
- StringPiece context = orig_context;
-
- switch (type) {
- default:
- LOG(FATAL) << "Bad RunSearch type: " << (int)type;
-
- case kEngineBacktrack:
- if (prog_ == NULL) {
- result->skipped = true;
- break;
- }
- result->matched =
- prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,
- result->submatch, nsubmatch);
- result->have_submatch = true;
- break;
-
- case kEngineNFA:
- if (prog_ == NULL) {
- result->skipped = true;
- break;
- }
- result->matched =
- prog_->SearchNFA(text, context, anchor, kind_,
- result->submatch, nsubmatch);
- result->have_submatch = true;
- break;
-
- case kEngineDFA:
- if (prog_ == NULL) {
- result->skipped = true;
- break;
- }
- result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,
- &result->skipped, NULL);
- break;
-
- case kEngineDFA1:
- if (prog_ == NULL || rprog_ == NULL) {
- result->skipped = true;
- break;
- }
- result->matched =
- prog_->SearchDFA(text, context, anchor, kind_, result->submatch,
- &result->skipped, NULL);
- // If anchored, no need for second run,
- // but do it anyway to find more bugs.
- if (result->matched) {
- if (!rprog_->SearchDFA(result->submatch[0], context,
- Prog::kAnchored, Prog::kLongestMatch,
- result->submatch,
- &result->skipped, NULL)) {
- LOG(ERROR) << "Reverse DFA inconsistency: "
- << CEscape(regexp_str_)
- << " on " << CEscape(text);
- result->matched = false;
- }
- }
- result->have_submatch0 = true;
- break;
-
- case kEngineOnePass:
- if (prog_ == NULL ||
- anchor == Prog::kUnanchored ||
- !prog_->IsOnePass() ||
- nsubmatch > Prog::kMaxOnePassCapture) {
- result->skipped = true;
- break;
- }
- result->matched = prog_->SearchOnePass(text, context, anchor, kind_,
- result->submatch, nsubmatch);
- result->have_submatch = true;
- break;
-
- case kEngineBitState:
- if (prog_ == NULL) {
- result->skipped = true;
- break;
- }
- result->matched = prog_->SearchBitState(text, context, anchor, kind_,
- result->submatch, nsubmatch);
- result->have_submatch = true;
- break;
-
- case kEngineRE2:
- case kEngineRE2a:
- case kEngineRE2b: {
- if (!re2_ || text.end() != context.end()) {
- result->skipped = true;
- break;
- }
-
- RE2::Anchor re_anchor;
- if (anchor == Prog::kAnchored)
- re_anchor = RE2::ANCHOR_START;
- else
- re_anchor = RE2::UNANCHORED;
- if (kind_ == Prog::kFullMatch)
- re_anchor = RE2::ANCHOR_BOTH;
-
- result->matched = re2_->Match(
- context,
- static_cast<int>(text.begin() - context.begin()),
- static_cast<int>(text.end() - context.begin()),
- re_anchor,
- result->submatch,
- nsubmatch);
- result->have_submatch = nsubmatch > 0;
- break;
- }
-
- case kEnginePCRE: {
- if (!re_ || text.begin() != context.begin() ||
- text.end() != context.end()) {
- result->skipped = true;
- break;
- }
-
- // PCRE 8.34 or so started allowing vertical tab to match \s,
- // following a change made in Perl 5.18. RE2 does not.
- if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) &&
- text.contains("\v")) {
- result->skipped = true;
- break;
- }
-
- const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];
- PCRE::Arg *a = new PCRE::Arg[nsubmatch];
- for (int i = 0; i < nsubmatch; i++) {
- a[i] = PCRE::Arg(&result->submatch[i]);
- argptr[i] = &a[i];
- }
- int consumed;
- PCRE::Anchor pcre_anchor;
- if (anchor == Prog::kAnchored)
- pcre_anchor = PCRE::ANCHOR_START;
- else
- pcre_anchor = PCRE::UNANCHORED;
- if (kind_ == Prog::kFullMatch)
- pcre_anchor = PCRE::ANCHOR_BOTH;
- re_->ClearHitLimit();
- result->matched =
- re_->DoMatch(text,
- pcre_anchor,
- &consumed,
- argptr, nsubmatch);
- if (re_->HitLimit()) {
- result->untrusted = true;
- delete[] argptr;
- delete[] a;
- break;
- }
- result->have_submatch = true;
-
- // Work around RE interface bug: PCRE returns -1 as the
- // offsets for an unmatched subexpression, and RE should
- // turn that into StringPiece(NULL) but in fact it uses
- // StringPiece(text.begin() - 1, 0). Oops.
- for (int i = 0; i < nsubmatch; i++)
- if (result->submatch[i].begin() == text.begin() - 1)
- result->submatch[i] = NULL;
- delete[] argptr;
- delete[] a;
- break;
- }
- }
-
- if (!result->matched)
- memset(result->submatch, 0, sizeof result->submatch);
-}
-
-// Checks whether r is okay given that correct is the right answer.
-// Specifically, r's answers have to match (but it doesn't have to
-// claim to have all the answers).
-static bool ResultOkay(const Result& r, const Result& correct) {
- if (r.skipped)
- return true;
- if (r.matched != correct.matched)
- return false;
- if (r.have_submatch || r.have_submatch0) {
- for (int i = 0; i < kMaxSubmatch; i++) {
- if (correct.submatch[i].begin() != r.submatch[i].begin() ||
- correct.submatch[i].size() != r.submatch[i].size())
- return false;
- if (!r.have_submatch)
- break;
- }
- }
- return true;
-}
-
-// Runs a single test.
-bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor) {
- // Backtracking is the gold standard.
- Result correct;
- RunSearch(kEngineBacktrack, text, context, anchor, &correct);
- if (correct.skipped) {
- if (regexp_ == NULL)
- return true;
- LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)
- << " " << FormatMode(flags_);
- return false;
- }
- VLOG(1) << "Try: regexp " << CEscape(regexp_str_)
- << " text " << CEscape(text)
- << " (" << FormatKind(kind_)
- << ", " << FormatAnchor(anchor)
- << ", " << FormatMode(flags_)
- << ")";
-
- // Compare the others.
- bool all_okay = true;
- for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {
- if (!(Engines() & (1<<i)))
- continue;
-
- Result r;
- RunSearch(i, text, context, anchor, &r);
- if (ResultOkay(r, correct)) {
- if (FLAGS_log_okay)
- LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);
- continue;
- }
-
- // We disagree with PCRE on the meaning of some Unicode matches.
- // In particular, we treat non-ASCII UTF-8 as non-word characters.
- // We also treat "empty" character sets like [^\w\W] as being
- // impossible to match, while PCRE apparently excludes some code
- // points (e.g., 0x0080) from both \w and \W.
- if (i == kEnginePCRE && NonASCII(text))
- continue;
-
- if (!r.untrusted)
- all_okay = false;
-
- LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,
- context, anchor);
- if (r.matched != correct.matched) {
- if (r.matched) {
- LOG(INFO) << " Should not match (but does).";
- } else {
- LOG(INFO) << " Should match (but does not).";
- continue;
- }
- }
- for (int i = 0; i < 1+num_captures_; i++) {
- if (r.submatch[i].begin() != correct.submatch[i].begin() ||
- r.submatch[i].end() != correct.submatch[i].end()) {
- LOG(INFO) <<
- StringPrintf(" $%d: should be %s is %s",
- i,
- FormatCapture(text, correct.submatch[i]).c_str(),
- FormatCapture(text, r.submatch[i]).c_str());
- } else {
- LOG(INFO) <<
- StringPrintf(" $%d: %s ok", i,
- FormatCapture(text, r.submatch[i]).c_str());
- }
- }
- }
-
- if (!all_okay) {
- if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)
- LOG(QFATAL) << "Too many regexp failures.";
- }
-
- return all_okay;
-}
-
-void TestInstance::LogMatch(const char* prefix, Engine e,
- const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor) {
- LOG(INFO) << prefix
- << EngineString(e)
- << " regexp "
- << CEscape(regexp_str_)
- << " "
- << CEscape(regexp_->ToString())
- << " text "
- << CEscape(text)
- << " ("
- << text.begin() - context.begin()
- << ","
- << text.end() - context.begin()
- << ") of context "
- << CEscape(context)
- << " (" << FormatKind(kind_)
- << ", " << FormatAnchor(anchor)
- << ", " << FormatMode(flags_)
- << ")";
-}
-
-static Prog::MatchKind kinds[] = {
- Prog::kFirstMatch,
- Prog::kLongestMatch,
- Prog::kFullMatch,
-};
-
-// Test all possible match kinds and parse modes.
-Tester::Tester(const StringPiece& regexp) {
- error_ = false;
- for (int i = 0; i < arraysize(kinds); i++) {
- for (int j = 0; j < arraysize(parse_modes); j++) {
- TestInstance* t = new TestInstance(regexp, kinds[i],
- parse_modes[j].parse_flags);
- error_ |= t->error();
- v_.push_back(t);
- }
- }
-}
-
-Tester::~Tester() {
- for (size_t i = 0; i < v_.size(); i++)
- delete v_[i];
-}
-
-bool Tester::TestCase(const StringPiece& text, const StringPiece& context,
- Prog::Anchor anchor) {
- bool okay = true;
- for (size_t i = 0; i < v_.size(); i++)
- okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));
- return okay;
-}
-
-static Prog::Anchor anchors[] = {
- Prog::kAnchored,
- Prog::kUnanchored
-};
-
-bool Tester::TestInput(const StringPiece& text) {
- bool okay = TestInputInContext(text, text);
- if (text.size() > 0) {
- StringPiece sp;
- sp = text;
- sp.remove_prefix(1);
- okay &= TestInputInContext(sp, text);
- sp = text;
- sp.remove_suffix(1);
- okay &= TestInputInContext(sp, text);
- }
- return okay;
-}
-
-bool Tester::TestInputInContext(const StringPiece& text,
- const StringPiece& context) {
- bool okay = true;
- for (int i = 0; i < arraysize(anchors); i++)
- okay &= TestCase(text, context, anchors[i]);
- return okay;
-}
-
-bool TestRegexpOnText(const StringPiece& regexp,
- const StringPiece& text) {
- Tester t(regexp);
- return t.TestInput(text);
-}
-
-} // namespace re2
« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/tostring.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698