third_party/re2/re2/testing/tester.cc - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/testing/tester.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/testing/tester.cc

diff --git a/third_party/re2/re2/testing/tester.cc b/third_party/re2/re2/testing/tester.cc

deleted file mode 100644

index cb12badebfa897888924954c488440d97e555cf9..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/testing/tester.cc

+++ /dev/null

@@ -1,653 +0,0 @@

-// Use of this source code is governed by a BSD-style

-// license that can be found in the LICENSE file.

-// Regular expression engine tester -- test all the implementations against each other.

-#include "util/util.h"

-#include "util/flags.h"

-#include "re2/testing/tester.h"

-#include "re2/prog.h"

-#include "re2/re2.h"

-#include "re2/regexp.h"

-DEFINE_bool(dump_prog, false, "dump regexp program");

-DEFINE_bool(log_okay, false, "log successful runs");

-DEFINE_bool(dump_rprog, false, "dump reversed regexp program");

-DEFINE_int32(max_regexp_failures, 100,

- "maximum number of regexp test failures (-1 = unlimited)");

-DEFINE_string(regexp_engines, "", "pattern to select regexp engines to test");

-namespace re2 {

-enum {

- kMaxSubmatch = 1+16, // $0...$16

-};

-const char* engine_types[kEngineMax] = {

- "Backtrack",

- "NFA",

- "DFA",

- "DFA1",

- "OnePass",

- "BitState",

- "RE2",

- "RE2a",

- "RE2b",

- "PCRE",

-};

-// Returns the name string for the type t.

-static string EngineString(Engine t) {

- if (t < 0 || t >= arraysize(engine_types) || engine_types[t] == NULL) {

- return StringPrintf("type%d", static_cast<int>(t));

- }

- return engine_types[t];

-// Returns bit mask of engines to use.

-static uint32 Engines() {

- static uint32 cached_engines;

- static bool did_parse;

- if (did_parse)

- return cached_engines;

- if (FLAGS_regexp_engines.empty()) {

- cached_engines = ~0;

- } else {

- for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++)

- if (strstr(EngineString(i).c_str(), FLAGS_regexp_engines.c_str()))

- cached_engines |= 1<<i;

- }

- if (cached_engines == 0)

- LOG(INFO) << "Warning: no engines enabled.";

- if (!UsingPCRE)

- cached_engines &= ~(1<<kEnginePCRE);

- for (Engine i = static_cast<Engine>(0); i < kEngineMax; i++) {

- if (cached_engines & (1<<i))

- LOG(INFO) << EngineString(i) << " enabled";

- }

- did_parse = true;

- return cached_engines;

-// The result of running a match.

-struct TestInstance::Result {

- bool skipped; // test skipped: wasn't applicable

- bool matched; // found a match

- bool untrusted; // don't really trust the answer

- bool have_submatch; // computed all submatch info

- bool have_submatch0; // computed just submatch[0]

- StringPiece submatch[kMaxSubmatch];

-};

-typedef TestInstance::Result Result;

-// Formats a single capture range s in text in the form (a,b)

-// where a and b are the starting and ending offsets of s in text.

-static string FormatCapture(const StringPiece& text, const StringPiece& s) {

- if (s.begin() == NULL)

- return "(?,?)";

- return StringPrintf("(%d,%d)",

- static_cast<int>(s.begin() - text.begin()),

- static_cast<int>(s.end() - text.begin()));

-// Returns whether text contains non-ASCII (>= 0x80) bytes.

-static bool NonASCII(const StringPiece& text) {

- for (int i = 0; i < text.size(); i++)

- if ((uint8)text[i] >= 0x80)

- return true;

- return false;

-// Returns string representation of match kind.

-static string FormatKind(Prog::MatchKind kind) {

- switch (kind) {

- case Prog::kFullMatch:

- return "full match";

- case Prog::kLongestMatch:

- return "longest match";

- case Prog::kFirstMatch:

- return "first match";

- case Prog::kManyMatch:

- return "many match";

- }

- return "???";

-// Returns string representation of anchor kind.

-static string FormatAnchor(Prog::Anchor anchor) {

- switch (anchor) {

- case Prog::kAnchored:

- return "anchored";

- case Prog::kUnanchored:

- return "unanchored";

- }

- return "???";

-struct ParseMode {

- Regexp::ParseFlags parse_flags;

- string desc;

-};

-static const Regexp::ParseFlags single_line =

- Regexp::LikePerl;

-static const Regexp::ParseFlags multi_line =

- static_cast<Regexp::ParseFlags>(Regexp::LikePerl & ~Regexp::OneLine);

-static ParseMode parse_modes[] = {

- { single_line, "single-line" },

- { single_line|Regexp::Latin1, "single-line, latin1" },

- { multi_line, "multiline" },

- { multi_line|Regexp::NonGreedy, "multiline, nongreedy" },

- { multi_line|Regexp::Latin1, "multiline, latin1" },

-};

-static string FormatMode(Regexp::ParseFlags flags) {

- for (int i = 0; i < arraysize(parse_modes); i++)

- if (parse_modes[i].parse_flags == flags)

- return parse_modes[i].desc;

- return StringPrintf("%#x", static_cast<uint>(flags));

-// Constructs and saves all the matching engines that

-// will be required for the given tests.

-TestInstance::TestInstance(const StringPiece& regexp_str, Prog::MatchKind kind,

- Regexp::ParseFlags flags)

- : regexp_str_(regexp_str),

- kind_(kind),

- flags_(flags),

- error_(false),

- regexp_(NULL),

- num_captures_(0),

- prog_(NULL),

- rprog_(NULL),

- re_(NULL),

- re2_(NULL) {

- VLOG(1) << CEscape(regexp_str);

- // Compile regexp to prog.

- // Always required - needed for backtracking (reference implementation).

- RegexpStatus status;

- regexp_ = Regexp::Parse(regexp_str, flags, &status);

- if (regexp_ == NULL) {

- LOG(INFO) << "Cannot parse: " << CEscape(regexp_str_)

- << " mode: " << FormatMode(flags);

- error_ = true;

- return;

- }

- num_captures_ = regexp_->NumCaptures();

- prog_ = regexp_->CompileToProg(0);

- if (prog_ == NULL) {

- LOG(INFO) << "Cannot compile: " << CEscape(regexp_str_);

- error_ = true;

- return;

- }

- if (FLAGS_dump_prog) {

- LOG(INFO) << "Prog for "

- << " regexp "

- << CEscape(regexp_str_)

- << " (" << FormatKind(kind_)

- << ", " << FormatMode(flags_)

- << ")\n"

- << prog_->Dump();

- }

- // Compile regexp to reversed prog. Only needed for DFA engines.

- if (Engines() & ((1<<kEngineDFA)|(1<<kEngineDFA1))) {

- rprog_ = regexp_->CompileToReverseProg(0);

- if (rprog_ == NULL) {

- LOG(INFO) << "Cannot reverse compile: " << CEscape(regexp_str_);

- error_ = true;

- return;

- }

- if (FLAGS_dump_rprog)

- LOG(INFO) << rprog_->Dump();

- }

- // Create re string that will be used for RE and RE2.

- string re = regexp_str.as_string();

- // Accommodate flags.

- // Regexp::Latin1 will be accommodated below.

- if (!(flags & Regexp::OneLine))

- re = "(?m)" + re;

- if (flags & Regexp::NonGreedy)

- re = "(?U)" + re;

- if (flags & Regexp::DotNL)

- re = "(?s)" + re;

- // Compile regexp to RE2.

- if (Engines() & ((1<<kEngineRE2)|(1<<kEngineRE2a)|(1<<kEngineRE2b))) {

- RE2::Options options;

- if (flags & Regexp::Latin1)

- options.set_encoding(RE2::Options::EncodingLatin1);

- if (kind_ == Prog::kLongestMatch)

- options.set_longest_match(true);

- re2_ = new RE2(re, options);

- if (!re2_->error().empty()) {

- LOG(INFO) << "Cannot RE2: " << CEscape(re);

- error_ = true;

- return;

- }

- // Compile regexp to RE.

- // PCRE as exposed by the RE interface isn't always usable.

- // 1. It disagrees about handling of empty-string repetitions

- // like matching (a*)* against "b". PCRE treats the (a*) as

- // occurring once, while we treat it as occurring not at all.

- // 2. It treats $ as this weird thing meaning end of string

- // or before the \n at the end of the string.

- // 3. It doesn't implement POSIX leftmost-longest matching.

- // 4. It lets \s match vertical tab.

- // MimicsPCRE() detects 1 and 2.

- if ((Engines() & (1<<kEnginePCRE)) && regexp_->MimicsPCRE() &&

- kind_ != Prog::kLongestMatch) {

- PCRE_Options o;

- o.set_option(PCRE::UTF8);

- if (flags & Regexp::Latin1)

- o.set_option(PCRE::None);

- // PCRE has interface bug keeping us from finding $0, so

- // add one more layer of parens.

- re_ = new PCRE("("+re+")", o);

- if (!re_->error().empty()) {

- LOG(INFO) << "Cannot PCRE: " << CEscape(re);

- error_ = true;

- return;

- }

-TestInstance::~TestInstance() {

- if (regexp_)

- regexp_->Decref();

- delete prog_;

- delete rprog_;

- delete re_;

- delete re2_;

-// Runs a single search using the named engine type.

-// This interface hides all the irregularities of the various

-// engine interfaces from the rest of this file.

-void TestInstance::RunSearch(Engine type,

- const StringPiece& orig_text,

- const StringPiece& orig_context,

- Prog::Anchor anchor,

- Result *result) {

- memset(result, 0, sizeof *result);

- if (regexp_ == NULL) {

- result->skipped = true;

- return;

- }

- int nsubmatch = 1 + num_captures_; // NumCaptures doesn't count $0

- if (nsubmatch > kMaxSubmatch)

- nsubmatch = kMaxSubmatch;

- StringPiece text = orig_text;

- StringPiece context = orig_context;

- switch (type) {

- default:

- LOG(FATAL) << "Bad RunSearch type: " << (int)type;

- case kEngineBacktrack:

- if (prog_ == NULL) {

- result->skipped = true;

- break;

- }

- result->matched =

- prog_->UnsafeSearchBacktrack(text, context, anchor, kind_,

- result->submatch, nsubmatch);

- result->have_submatch = true;

- break;

- case kEngineNFA:

- if (prog_ == NULL) {

- result->skipped = true;

- break;

- }

- result->matched =

- prog_->SearchNFA(text, context, anchor, kind_,

- result->submatch, nsubmatch);

- result->have_submatch = true;

- break;

- case kEngineDFA:

- if (prog_ == NULL) {

- result->skipped = true;

- break;

- }

- result->matched = prog_->SearchDFA(text, context, anchor, kind_, NULL,

- &result->skipped, NULL);

- break;

- case kEngineDFA1:

- if (prog_ == NULL || rprog_ == NULL) {

- result->skipped = true;

- break;

- }

- result->matched =

- prog_->SearchDFA(text, context, anchor, kind_, result->submatch,

- &result->skipped, NULL);

- // If anchored, no need for second run,

- // but do it anyway to find more bugs.

- if (result->matched) {

- if (!rprog_->SearchDFA(result->submatch[0], context,

- Prog::kAnchored, Prog::kLongestMatch,

- result->submatch,

- &result->skipped, NULL)) {

- LOG(ERROR) << "Reverse DFA inconsistency: "

- << CEscape(regexp_str_)

- << " on " << CEscape(text);

- result->matched = false;

- }

- result->have_submatch0 = true;

- break;

- case kEngineOnePass:

- if (prog_ == NULL ||

- anchor == Prog::kUnanchored ||

- !prog_->IsOnePass() ||

- nsubmatch > Prog::kMaxOnePassCapture) {

- result->skipped = true;

- break;

- }

- result->matched = prog_->SearchOnePass(text, context, anchor, kind_,

- result->submatch, nsubmatch);

- result->have_submatch = true;

- break;

- case kEngineBitState:

- if (prog_ == NULL) {

- result->skipped = true;

- break;

- }

- result->matched = prog_->SearchBitState(text, context, anchor, kind_,

- result->submatch, nsubmatch);

- result->have_submatch = true;

- break;

- case kEngineRE2:

- case kEngineRE2a:

- case kEngineRE2b: {

- if (!re2_ || text.end() != context.end()) {

- result->skipped = true;

- break;

- }

- RE2::Anchor re_anchor;

- if (anchor == Prog::kAnchored)

- re_anchor = RE2::ANCHOR_START;

- else

- re_anchor = RE2::UNANCHORED;

- if (kind_ == Prog::kFullMatch)

- re_anchor = RE2::ANCHOR_BOTH;

- result->matched = re2_->Match(

- context,

- static_cast<int>(text.begin() - context.begin()),

- static_cast<int>(text.end() - context.begin()),

- re_anchor,

- result->submatch,

- nsubmatch);

- result->have_submatch = nsubmatch > 0;

- break;

- }

- case kEnginePCRE: {

- if (!re_ || text.begin() != context.begin() ||

- text.end() != context.end()) {

- result->skipped = true;

- break;

- }

- // PCRE 8.34 or so started allowing vertical tab to match \s,

- // following a change made in Perl 5.18. RE2 does not.

- if ((regexp_str_.contains("\\s") || regexp_str_.contains("\\S")) &&

- text.contains("\v")) {

- result->skipped = true;

- break;

- }

- const PCRE::Arg **argptr = new const PCRE::Arg*[nsubmatch];

- PCRE::Arg *a = new PCRE::Arg[nsubmatch];

- for (int i = 0; i < nsubmatch; i++) {

- a[i] = PCRE::Arg(&result->submatch[i]);

- argptr[i] = &a[i];

- }

- int consumed;

- PCRE::Anchor pcre_anchor;

- if (anchor == Prog::kAnchored)

- pcre_anchor = PCRE::ANCHOR_START;

- else

- pcre_anchor = PCRE::UNANCHORED;

- if (kind_ == Prog::kFullMatch)

- pcre_anchor = PCRE::ANCHOR_BOTH;

- re_->ClearHitLimit();

- result->matched =

- re_->DoMatch(text,

- pcre_anchor,

- &consumed,

- argptr, nsubmatch);

- if (re_->HitLimit()) {

- result->untrusted = true;

- delete[] argptr;

- delete[] a;

- break;

- }

- result->have_submatch = true;

- // Work around RE interface bug: PCRE returns -1 as the

- // offsets for an unmatched subexpression, and RE should

- // turn that into StringPiece(NULL) but in fact it uses

- // StringPiece(text.begin() - 1, 0). Oops.

- for (int i = 0; i < nsubmatch; i++)

- if (result->submatch[i].begin() == text.begin() - 1)

- result->submatch[i] = NULL;

- delete[] argptr;

- delete[] a;

- break;

- }

- if (!result->matched)

- memset(result->submatch, 0, sizeof result->submatch);

-// Checks whether r is okay given that correct is the right answer.

-// Specifically, r's answers have to match (but it doesn't have to

-// claim to have all the answers).

-static bool ResultOkay(const Result& r, const Result& correct) {

- if (r.skipped)

- return true;

- if (r.matched != correct.matched)

- return false;

- if (r.have_submatch || r.have_submatch0) {

- for (int i = 0; i < kMaxSubmatch; i++) {

- if (correct.submatch[i].begin() != r.submatch[i].begin() ||

- correct.submatch[i].size() != r.submatch[i].size())

- return false;

- if (!r.have_submatch)

- break;

- }

- return true;

-// Runs a single test.

-bool TestInstance::RunCase(const StringPiece& text, const StringPiece& context,

- Prog::Anchor anchor) {

- // Backtracking is the gold standard.

- Result correct;

- RunSearch(kEngineBacktrack, text, context, anchor, &correct);

- if (correct.skipped) {

- if (regexp_ == NULL)

- return true;

- LOG(ERROR) << "Skipped backtracking! " << CEscape(regexp_str_)

- << " " << FormatMode(flags_);

- return false;

- }

- VLOG(1) << "Try: regexp " << CEscape(regexp_str_)

- << " text " << CEscape(text)

- << " (" << FormatKind(kind_)

- << ", " << FormatAnchor(anchor)

- << ", " << FormatMode(flags_)

- << ")";

- // Compare the others.

- bool all_okay = true;

- for (Engine i = kEngineBacktrack+1; i < kEngineMax; i++) {

- if (!(Engines() & (1<<i)))

- continue;

- Result r;

- RunSearch(i, text, context, anchor, &r);

- if (ResultOkay(r, correct)) {

- if (FLAGS_log_okay)

- LogMatch(r.skipped ? "Skipped: " : "Okay: ", i, text, context, anchor);

- continue;

- }

- // We disagree with PCRE on the meaning of some Unicode matches.

- // In particular, we treat non-ASCII UTF-8 as non-word characters.

- // We also treat "empty" character sets like [^\w\W] as being

- // impossible to match, while PCRE apparently excludes some code

- // points (e.g., 0x0080) from both \w and \W.

- if (i == kEnginePCRE && NonASCII(text))

- continue;

- if (!r.untrusted)

- all_okay = false;

- LogMatch(r.untrusted ? "(Untrusted) Mismatch: " : "Mismatch: ", i, text,

- context, anchor);

- if (r.matched != correct.matched) {

- if (r.matched) {

- LOG(INFO) << " Should not match (but does).";

- } else {

- LOG(INFO) << " Should match (but does not).";

- continue;

- }

- for (int i = 0; i < 1+num_captures_; i++) {

- if (r.submatch[i].begin() != correct.submatch[i].begin() ||

- r.submatch[i].end() != correct.submatch[i].end()) {

- LOG(INFO) <<

- StringPrintf(" $%d: should be %s is %s",

- i,

- FormatCapture(text, correct.submatch[i]).c_str(),

- FormatCapture(text, r.submatch[i]).c_str());

- } else {

- LOG(INFO) <<

- StringPrintf(" $%d: %s ok", i,

- FormatCapture(text, r.submatch[i]).c_str());

- }

- if (!all_okay) {

- if (FLAGS_max_regexp_failures > 0 && --FLAGS_max_regexp_failures == 0)

- LOG(QFATAL) << "Too many regexp failures.";

- }

- return all_okay;

-void TestInstance::LogMatch(const char* prefix, Engine e,

- const StringPiece& text, const StringPiece& context,

- Prog::Anchor anchor) {

- LOG(INFO) << prefix

- << EngineString(e)

- << " regexp "

- << CEscape(regexp_str_)

- << " "

- << CEscape(regexp_->ToString())

- << " text "

- << CEscape(text)

- << " ("

- << text.begin() - context.begin()

- << ","

- << text.end() - context.begin()

- << ") of context "

- << CEscape(context)

- << " (" << FormatKind(kind_)

- << ", " << FormatAnchor(anchor)

- << ", " << FormatMode(flags_)

- << ")";

-static Prog::MatchKind kinds[] = {

- Prog::kFirstMatch,

- Prog::kLongestMatch,

- Prog::kFullMatch,

-};

-// Test all possible match kinds and parse modes.

-Tester::Tester(const StringPiece& regexp) {

- error_ = false;

- for (int i = 0; i < arraysize(kinds); i++) {

- for (int j = 0; j < arraysize(parse_modes); j++) {

- TestInstance* t = new TestInstance(regexp, kinds[i],

- parse_modes[j].parse_flags);

- error_ |= t->error();

- v_.push_back(t);

- }

-Tester::~Tester() {

- for (size_t i = 0; i < v_.size(); i++)

- delete v_[i];

-bool Tester::TestCase(const StringPiece& text, const StringPiece& context,

- Prog::Anchor anchor) {

- bool okay = true;

- for (size_t i = 0; i < v_.size(); i++)

- okay &= (!v_[i]->error() && v_[i]->RunCase(text, context, anchor));

- return okay;

-static Prog::Anchor anchors[] = {

- Prog::kAnchored,

- Prog::kUnanchored

-};

-bool Tester::TestInput(const StringPiece& text) {

- bool okay = TestInputInContext(text, text);

- if (text.size() > 0) {

- StringPiece sp;

- sp = text;

- sp.remove_prefix(1);

- okay &= TestInputInContext(sp, text);

- sp = text;

- sp.remove_suffix(1);

- okay &= TestInputInContext(sp, text);

- }

- return okay;

-bool Tester::TestInputInContext(const StringPiece& text,

- const StringPiece& context) {

- bool okay = true;

- for (int i = 0; i < arraysize(anchors); i++)

- okay &= TestCase(text, context, anchors[i]);

- return okay;

-bool TestRegexpOnText(const StringPiece& regexp,

- const StringPiece& text) {

- Tester t(regexp);

- return t.TestInput(text);

-} // namespace re2

« no previous file with comments | « third_party/re2/re2/testing/tester.h ('k') | third_party/re2/re2/tostring.cc » ('j') | no next file with comments »