third_party/re2/re2/simplify.cc - Issue 1544433002: Replace RE2 import with a dependency

Unified Diff: third_party/re2/re2/simplify.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: third_party/re2/re2/simplify.cc

diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc

deleted file mode 100644

index ecc60e7d8b127960534182ba03171c6bc1f0e2f0..0000000000000000000000000000000000000000

--- a/third_party/re2/re2/simplify.cc

+++ /dev/null

@@ -1,652 +0,0 @@

-// Use of this source code is governed by a BSD-style

-// license that can be found in the LICENSE file.

-// Rewrite POSIX and other features in re

-// to use simple extended regular expression features.

-// Also sort and simplify character classes.

-#include "util/util.h"

-#include "re2/regexp.h"

-#include "re2/walker-inl.h"

-namespace re2 {

-// Parses the regexp src and then simplifies it and sets *dst to the

-// string representation of the simplified form. Returns true on success.

-// Returns false and sets *error (if error != NULL) on error.

-bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,

- string* dst,

- RegexpStatus* status) {

- Regexp* re = Parse(src, flags, status);

- if (re == NULL)

- return false;

- Regexp* sre = re->Simplify();

- re->Decref();

- if (sre == NULL) {

- // Should not happen, since Simplify never fails.

- LOG(ERROR) << "Simplify failed on " << src;

- if (status) {

- status->set_code(kRegexpInternalError);

- status->set_error_arg(src);

- }

- return false;

- }

- *dst = sre->ToString();

- sre->Decref();

- return true;

-// Assuming the simple_ flags on the children are accurate,

-// is this Regexp* simple?

-bool Regexp::ComputeSimple() {

- Regexp** subs;

- switch (op_) {

- case kRegexpNoMatch:

- case kRegexpEmptyMatch:

- case kRegexpLiteral:

- case kRegexpLiteralString:

- case kRegexpBeginLine:

- case kRegexpEndLine:

- case kRegexpBeginText:

- case kRegexpWordBoundary:

- case kRegexpNoWordBoundary:

- case kRegexpEndText:

- case kRegexpAnyChar:

- case kRegexpAnyByte:

- case kRegexpHaveMatch:

- return true;

- case kRegexpConcat:

- case kRegexpAlternate:

- // These are simple as long as the subpieces are simple.

- subs = sub();

- for (int i = 0; i < nsub_; i++)

- if (!subs[i]->simple())

- return false;

- return true;

- case kRegexpCharClass:

- // Simple as long as the char class is not empty, not full.

- if (ccb_ != NULL)

- return !ccb_->empty() && !ccb_->full();

- return !cc_->empty() && !cc_->full();

- case kRegexpCapture:

- subs = sub();

- return subs[0]->simple();

- case kRegexpStar:

- case kRegexpPlus:

- case kRegexpQuest:

- subs = sub();

- if (!subs[0]->simple())

- return false;

- switch (subs[0]->op_) {

- case kRegexpStar:

- case kRegexpPlus:

- case kRegexpQuest:

- case kRegexpEmptyMatch:

- case kRegexpNoMatch:

- return false;

- default:

- break;

- }

- return true;

- case kRegexpRepeat:

- return false;

- }

- LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;

- return false;

-// Walker subclass used by Simplify.

-// Coalesces runs of star/plus/quest/repeat of the same literal along with any

-// occurrences of that literal into repeats of that literal. It also works for

-// char classes, any char and any byte.

-// PostVisit creates the coalesced result, which should then be simplified.

-class CoalesceWalker : public Regexp::Walker<Regexp*> {

- public:

- CoalesceWalker() {}

- virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,

- Regexp** child_args, int nchild_args);

- virtual Regexp* Copy(Regexp* re);

- virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

- private:

- // These functions are declared inside CoalesceWalker so that

- // they can edit the private fields of the Regexps they construct.

- // Returns true if r1 and r2 can be coalesced. In particular, ensures that

- // the parse flags are consistent. (They will not be checked again later.)

- static bool CanCoalesce(Regexp* r1, Regexp* r2);

- // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards

- // will be empty match and the coalesced op. In other cases, where part of a

- // literal string was removed to be coalesced, the array elements afterwards

- // will be the coalesced op and the remainder of the literal string.

- static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);

- DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);

-};

-// Walker subclass used by Simplify.

-// The simplify walk is purely post-recursive: given the simplified children,

-// PostVisit creates the simplified result.

-// The child_args are simplified Regexp*s.

-class SimplifyWalker : public Regexp::Walker<Regexp*> {

- public:

- SimplifyWalker() {}

- virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);

- virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,

- Regexp** child_args, int nchild_args);

- virtual Regexp* Copy(Regexp* re);

- virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);

- private:

- // These functions are declared inside SimplifyWalker so that

- // they can edit the private fields of the Regexps they construct.

- // Creates a concatenation of two Regexp, consuming refs to re1 and re2.

- // Caller must Decref return value when done with it.

- static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);

- // Simplifies the expression re{min,max} in terms of *, +, and ?.

- // Returns a new regexp. Does not edit re. Does not consume reference to re.

- // Caller must Decref return value when done with it.

- static Regexp* SimplifyRepeat(Regexp* re, int min, int max,

- Regexp::ParseFlags parse_flags);

- // Simplifies a character class by expanding any named classes

- // into rune ranges. Does not edit re. Does not consume ref to re.

- // Caller must Decref return value when done with it.

- static Regexp* SimplifyCharClass(Regexp* re);

- DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);

-};

-// Simplifies a regular expression, returning a new regexp.

-// The new regexp uses traditional Unix egrep features only,

-// plus the Perl (?:) non-capturing parentheses.

-// Otherwise, no POSIX or Perl additions. The new regexp

-// captures exactly the same subexpressions (with the same indices)

-// as the original.

-// Does not edit current object.

-// Caller must Decref() return value when done with it.

-Regexp* Regexp::Simplify() {

- CoalesceWalker cw;

- Regexp* cre = cw.Walk(this, NULL);

- if (cre == NULL)

- return cre;

- SimplifyWalker sw;

- Regexp* sre = sw.Walk(cre, NULL);

- cre->Decref();

- return sre;

-#define Simplify DontCallSimplify // Avoid accidental recursion

-// Utility function for PostVisit implementations that compares re->sub() with

-// child_args to determine whether any child_args changed. In the common case,

-// where nothing changed, calls Decref() for all child_args and returns false,

-// so PostVisit must return re->Incref(). Otherwise, returns true.

-static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {

- for (int i = 0; i < re->nsub(); i++) {

- Regexp* sub = re->sub()[i];

- Regexp* newsub = child_args[i];

- if (newsub != sub)

- return true;

- }

- for (int i = 0; i < re->nsub(); i++) {

- Regexp* newsub = child_args[i];

- newsub->Decref();

- }

- return false;

-Regexp* CoalesceWalker::Copy(Regexp* re) {

- return re->Incref();

-Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {

- // This should never be called, since we use Walk and not

- // WalkExponential.

- LOG(DFATAL) << "CoalesceWalker::ShortVisit called";

- return re->Incref();

-Regexp* CoalesceWalker::PostVisit(Regexp* re,

- Regexp* parent_arg,

- Regexp* pre_arg,

- Regexp** child_args,

- int nchild_args) {

- if (re->nsub() == 0)

- return re->Incref();

- if (re->op() != kRegexpConcat) {

- if (!ChildArgsChanged(re, child_args))

- return re->Incref();

- // Something changed. Build a new op.

- Regexp* nre = new Regexp(re->op(), re->parse_flags());

- nre->AllocSub(re->nsub());

- Regexp** nre_subs = nre->sub();

- for (int i = 0; i < re->nsub(); i++)

- nre_subs[i] = child_args[i];

- // Repeats and Captures have additional data that must be copied.

- if (re->op() == kRegexpRepeat) {

- nre->min_ = re->min();

- nre->max_ = re->max();

- } else if (re->op() == kRegexpCapture) {

- nre->cap_ = re->cap();

- }

- return nre;

- }

- bool can_coalesce = false;

- for (int i = 0; i < re->nsub(); i++) {

- if (i+1 < re->nsub() &&

- CanCoalesce(child_args[i], child_args[i+1])) {

- can_coalesce = true;

- break;

- }

- if (!can_coalesce) {

- if (!ChildArgsChanged(re, child_args))

- return re->Incref();

- // Something changed. Build a new op.

- Regexp* nre = new Regexp(re->op(), re->parse_flags());

- nre->AllocSub(re->nsub());

- Regexp** nre_subs = nre->sub();

- for (int i = 0; i < re->nsub(); i++)

- nre_subs[i] = child_args[i];

- return nre;

- }

- for (int i = 0; i < re->nsub(); i++) {

- if (i+1 < re->nsub() &&

- CanCoalesce(child_args[i], child_args[i+1]))

- DoCoalesce(&child_args[i], &child_args[i+1]);

- }

- // Determine how many empty matches were left by DoCoalesce.

- int n = 0;

- for (int i = n; i < re->nsub(); i++) {

- if (child_args[i]->op() == kRegexpEmptyMatch)

- n++;

- }

- // Build a new op.

- Regexp* nre = new Regexp(re->op(), re->parse_flags());

- nre->AllocSub(re->nsub() - n);

- Regexp** nre_subs = nre->sub();

- for (int i = 0, j = 0; i < re->nsub(); i++) {

- if (child_args[i]->op() == kRegexpEmptyMatch) {

- child_args[i]->Decref();

- continue;

- }

- nre_subs[j] = child_args[i];

- j++;

- }

- return nre;

-bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {

- // r1 must be a star/plus/quest/repeat of a literal, char class, any char or

- // any byte.

- if ((r1->op() == kRegexpStar ||

- r1->op() == kRegexpPlus ||

- r1->op() == kRegexpQuest ||

- r1->op() == kRegexpRepeat) &&

- (r1->sub()[0]->op() == kRegexpLiteral ||

- r1->sub()[0]->op() == kRegexpCharClass ||

- r1->sub()[0]->op() == kRegexpAnyChar ||

- r1->sub()[0]->op() == kRegexpAnyByte)) {

- // r2 must be a star/plus/quest/repeat of the same literal, char class,

- // any char or any byte.

- if ((r2->op() == kRegexpStar ||

- r2->op() == kRegexpPlus ||

- r2->op() == kRegexpQuest ||

- r2->op() == kRegexpRepeat) &&

- Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&

- // The parse flags must be consistent.

- ((r1->parse_flags() & Regexp::NonGreedy) ==

- (r2->parse_flags() & Regexp::NonGreedy))) {

- return true;

- }

- // ... OR an occurrence of that literal, char class, any char or any byte

- if (Regexp::Equal(r1->sub()[0], r2)) {

- return true;

- }

- // ... OR a literal string that begins with that literal.

- if (r1->sub()[0]->op() == kRegexpLiteral &&

- r2->op() == kRegexpLiteralString &&

- r2->runes()[0] == r1->sub()[0]->rune() &&

- // The parse flags must be consistent.

- ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==

- (r2->parse_flags() & Regexp::FoldCase))) {

- return true;

- }

- return false;

-void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {

- Regexp* r1 = *r1ptr;

- Regexp* r2 = *r2ptr;

- Regexp* nre = Regexp::Repeat(

- r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);

- switch (r1->op()) {

- case kRegexpStar:

- nre->min_ = 0;

- nre->max_ = -1;

- break;

- case kRegexpPlus:

- nre->min_ = 1;

- nre->max_ = -1;

- break;

- case kRegexpQuest:

- nre->min_ = 0;

- nre->max_ = 1;

- break;

- case kRegexpRepeat:

- nre->min_ = r1->min();

- nre->max_ = r1->max();

- break;

- default:

- LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();

- nre->Decref();

- return;

- }

- switch (r2->op()) {

- case kRegexpStar:

- nre->max_ = -1;

- goto LeaveEmpty;

- case kRegexpPlus:

- nre->min_++;

- nre->max_ = -1;

- goto LeaveEmpty;

- case kRegexpQuest:

- if (nre->max() != -1)

- nre->max_++;

- goto LeaveEmpty;

- case kRegexpRepeat:

- nre->min_ += r2->min();

- if (r2->max() == -1)

- nre->max_ = -1;

- else if (nre->max() != -1)

- nre->max_ += r2->max();

- goto LeaveEmpty;

- case kRegexpLiteral:

- case kRegexpCharClass:

- case kRegexpAnyChar:

- case kRegexpAnyByte:

- nre->min_++;

- if (nre->max() != -1)

- nre->max_++;

- goto LeaveEmpty;

- LeaveEmpty:

- *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);

- *r2ptr = nre;

- break;

- case kRegexpLiteralString: {

- Rune r = r1->sub()[0]->rune();

- // Determine how much of the literal string is removed.

- // We know that we have at least one rune. :)

- int n = 1;

- while (n < r2->nrunes() && r2->runes()[n] == r)

- n++;

- nre->min_ += n;

- if (nre->max() != -1)

- nre->max_ += n;

- if (n == r2->nrunes())

- goto LeaveEmpty;

- *r1ptr = nre;

- *r2ptr = Regexp::LiteralString(

- &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());

- break;

- }

- default:

- LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();

- nre->Decref();

- return;

- }

- r1->Decref();

- r2->Decref();

-Regexp* SimplifyWalker::Copy(Regexp* re) {

- return re->Incref();

-Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {

- // This should never be called, since we use Walk and not

- // WalkExponential.

- LOG(DFATAL) << "SimplifyWalker::ShortVisit called";

- return re->Incref();

-Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {

- if (re->simple()) {

- *stop = true;

- return re->Incref();

- }

- return NULL;

-Regexp* SimplifyWalker::PostVisit(Regexp* re,

- Regexp* parent_arg,

- Regexp* pre_arg,

- Regexp** child_args,

- int nchild_args) {

- switch (re->op()) {

- case kRegexpNoMatch:

- case kRegexpEmptyMatch:

- case kRegexpLiteral:

- case kRegexpLiteralString:

- case kRegexpBeginLine:

- case kRegexpEndLine:

- case kRegexpBeginText:

- case kRegexpWordBoundary:

- case kRegexpNoWordBoundary:

- case kRegexpEndText:

- case kRegexpAnyChar:

- case kRegexpAnyByte:

- case kRegexpHaveMatch:

- // All these are always simple.

- re->simple_ = true;

- return re->Incref();

- case kRegexpConcat:

- case kRegexpAlternate: {

- // These are simple as long as the subpieces are simple.

- if (!ChildArgsChanged(re, child_args)) {

- re->simple_ = true;

- return re->Incref();

- }

- Regexp* nre = new Regexp(re->op(), re->parse_flags());

- nre->AllocSub(re->nsub());

- Regexp** nre_subs = nre->sub();

- for (int i = 0; i < re->nsub(); i++)

- nre_subs[i] = child_args[i];

- nre->simple_ = true;

- return nre;

- }

- case kRegexpCapture: {

- Regexp* newsub = child_args[0];

- if (newsub == re->sub()[0]) {

- newsub->Decref();

- re->simple_ = true;

- return re->Incref();

- }

- Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());

- nre->AllocSub(1);

- nre->sub()[0] = newsub;

- nre->cap_ = re->cap();

- nre->simple_ = true;

- return nre;

- }

- case kRegexpStar:

- case kRegexpPlus:

- case kRegexpQuest: {

- Regexp* newsub = child_args[0];

- // Special case: repeat the empty string as much as

- // you want, but it's still the empty string.

- if (newsub->op() == kRegexpEmptyMatch)

- return newsub;

- // These are simple as long as the subpiece is simple.

- if (newsub == re->sub()[0]) {

- newsub->Decref();

- re->simple_ = true;

- return re->Incref();

- }

- // These are also idempotent if flags are constant.

- if (re->op() == newsub->op() &&

- re->parse_flags() == newsub->parse_flags())

- return newsub;

- Regexp* nre = new Regexp(re->op(), re->parse_flags());

- nre->AllocSub(1);

- nre->sub()[0] = newsub;

- nre->simple_ = true;

- return nre;

- }

- case kRegexpRepeat: {

- Regexp* newsub = child_args[0];

- // Special case: repeat the empty string as much as

- // you want, but it's still the empty string.

- if (newsub->op() == kRegexpEmptyMatch)

- return newsub;

- Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,

- re->parse_flags());

- newsub->Decref();

- nre->simple_ = true;

- return nre;

- }

- case kRegexpCharClass: {

- Regexp* nre = SimplifyCharClass(re);

- nre->simple_ = true;

- return nre;

- }

- LOG(ERROR) << "Simplify case not handled: " << re->op();

- return re->Incref();

-// Creates a concatenation of two Regexp, consuming refs to re1 and re2.

-// Returns a new Regexp, handing the ref to the caller.

-Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,

- Regexp::ParseFlags parse_flags) {

- Regexp* re = new Regexp(kRegexpConcat, parse_flags);

- re->AllocSub(2);

- Regexp** subs = re->sub();

- subs[0] = re1;

- subs[1] = re2;

- return re;

-// Simplifies the expression re{min,max} in terms of *, +, and ?.

-// Returns a new regexp. Does not edit re. Does not consume reference to re.

-// Caller must Decref return value when done with it.

-// The result will *not* necessarily have the right capturing parens

-// if you call ToString() and re-parse it: (x){2} becomes (x)(x),

-// but in the Regexp* representation, both (x) are marked as $1.

-Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,

- Regexp::ParseFlags f) {

- // x{n,} means at least n matches of x.

- if (max == -1) {

- // Special case: x{0,} is x*

- if (min == 0)

- return Regexp::Star(re->Incref(), f);

- // Special case: x{1,} is x+

- if (min == 1)

- return Regexp::Plus(re->Incref(), f);

- // General case: x{4,} is xxxx+

- Regexp* nre = new Regexp(kRegexpConcat, f);

- nre->AllocSub(min);

- Regexp** nre_subs = nre->sub();

- for (int i = 0; i < min-1; i++)

- nre_subs[i] = re->Incref();

- nre_subs[min-1] = Regexp::Plus(re->Incref(), f);

- return nre;

- }

- // Special case: (x){0} matches only empty string.

- if (min == 0 && max == 0)

- return new Regexp(kRegexpEmptyMatch, f);

- // Special case: x{1} is just x.

- if (min == 1 && max == 1)

- return re->Incref();

- // General case: x{n,m} means n copies of x and m copies of x?.

- // The machine will do less work if we nest the final m copies,

- // so that x{2,5} = xx(x(x(x)?)?)?

- // Build leading prefix: xx. Capturing only on the last one.

- Regexp* nre = NULL;

- if (min > 0) {

- nre = new Regexp(kRegexpConcat, f);

- nre->AllocSub(min);

- Regexp** nre_subs = nre->sub();

- for (int i = 0; i < min; i++)

- nre_subs[i] = re->Incref();

- }

- // Build and attach suffix: (x(x(x)?)?)?

- if (max > min) {

- Regexp* suf = Regexp::Quest(re->Incref(), f);

- for (int i = min+1; i < max; i++)

- suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);

- if (nre == NULL)

- nre = suf;

- else

- nre = Concat2(nre, suf, f);

- }

- if (nre == NULL) {

- // Some degenerate case, like min > max, or min < max < 0.

- // This shouldn't happen, because the parser rejects such regexps.

- LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;

- return new Regexp(kRegexpNoMatch, f);

- }

- return nre;

-// Simplifies a character class.

-// Caller must Decref return value when done with it.

-Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {

- CharClass* cc = re->cc();

- // Special cases

- if (cc->empty())

- return new Regexp(kRegexpNoMatch, re->parse_flags());

- if (cc->full())

- return new Regexp(kRegexpAnyChar, re->parse_flags());

- return re->Incref();

-} // namespace re2

« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »