| Index: third_party/re2/re2/simplify.cc
|
| diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc
|
| index faf32084e05f9f171b33fdc076d465e0ec693600..ecc60e7d8b127960534182ba03171c6bc1f0e2f0 100644
|
| --- a/third_party/re2/re2/simplify.cc
|
| +++ b/third_party/re2/re2/simplify.cc
|
| @@ -61,7 +61,7 @@ bool Regexp::ComputeSimple() {
|
| // These are simple as long as the subpieces are simple.
|
| subs = sub();
|
| for (int i = 0; i < nsub_; i++)
|
| - if (!subs[i]->simple_)
|
| + if (!subs[i]->simple())
|
| return false;
|
| return true;
|
| case kRegexpCharClass:
|
| @@ -71,12 +71,12 @@ bool Regexp::ComputeSimple() {
|
| return !cc_->empty() && !cc_->full();
|
| case kRegexpCapture:
|
| subs = sub();
|
| - return subs[0]->simple_;
|
| + return subs[0]->simple();
|
| case kRegexpStar:
|
| case kRegexpPlus:
|
| case kRegexpQuest:
|
| subs = sub();
|
| - if (!subs[0]->simple_)
|
| + if (!subs[0]->simple())
|
| return false;
|
| switch (subs[0]->op_) {
|
| case kRegexpStar:
|
| @@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() {
|
| }
|
|
|
| // Walker subclass used by Simplify.
|
| +// Coalesces runs of star/plus/quest/repeat of the same literal along with any
|
| +// occurrences of that literal into repeats of that literal. It also works for
|
| +// char classes, any char and any byte.
|
| +// PostVisit creates the coalesced result, which should then be simplified.
|
| +class CoalesceWalker : public Regexp::Walker<Regexp*> {
|
| + public:
|
| + CoalesceWalker() {}
|
| + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
| + Regexp** child_args, int nchild_args);
|
| + virtual Regexp* Copy(Regexp* re);
|
| + virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
| +
|
| + private:
|
| + // These functions are declared inside CoalesceWalker so that
|
| + // they can edit the private fields of the Regexps they construct.
|
| +
|
| + // Returns true if r1 and r2 can be coalesced. In particular, ensures that
|
| + // the parse flags are consistent. (They will not be checked again later.)
|
| + static bool CanCoalesce(Regexp* r1, Regexp* r2);
|
| +
|
| + // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
|
| + // will be empty match and the coalesced op. In other cases, where part of a
|
| + // literal string was removed to be coalesced, the array elements afterwards
|
| + // will be the coalesced op and the remainder of the literal string.
|
| + static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
|
| +
|
| + DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
|
| +};
|
| +
|
| +// Walker subclass used by Simplify.
|
| // The simplify walk is purely post-recursive: given the simplified children,
|
| // PostVisit creates the simplified result.
|
| // The child_args are simplified Regexp*s.
|
| @@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
| public:
|
| SimplifyWalker() {}
|
| virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
|
| - virtual Regexp* PostVisit(Regexp* re,
|
| - Regexp* parent_arg,
|
| - Regexp* pre_arg,
|
| + virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
|
| Regexp** child_args, int nchild_args);
|
| virtual Regexp* Copy(Regexp* re);
|
| virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
|
| @@ -130,7 +158,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
| // Caller must Decref return value when done with it.
|
| static Regexp* SimplifyCharClass(Regexp* re);
|
|
|
| - DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
|
| + DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
|
| };
|
|
|
| // Simplifies a regular expression, returning a new regexp.
|
| @@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
|
| // Caller must Decref() return value when done with it.
|
|
|
| Regexp* Regexp::Simplify() {
|
| - if (simple_)
|
| - return Incref();
|
| - SimplifyWalker w;
|
| - return w.Walk(this, NULL);
|
| + CoalesceWalker cw;
|
| + Regexp* cre = cw.Walk(this, NULL);
|
| + if (cre == NULL)
|
| + return cre;
|
| + SimplifyWalker sw;
|
| + Regexp* sre = sw.Walk(cre, NULL);
|
| + cre->Decref();
|
| + return sre;
|
| }
|
|
|
| #define Simplify DontCallSimplify // Avoid accidental recursion
|
|
|
| +// Utility function for PostVisit implementations that compares re->sub() with
|
| +// child_args to determine whether any child_args changed. In the common case,
|
| +// where nothing changed, calls Decref() for all child_args and returns false,
|
| +// so PostVisit must return re->Incref(). Otherwise, returns true.
|
| +static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
|
| + for (int i = 0; i < re->nsub(); i++) {
|
| + Regexp* sub = re->sub()[i];
|
| + Regexp* newsub = child_args[i];
|
| + if (newsub != sub)
|
| + return true;
|
| + }
|
| + for (int i = 0; i < re->nsub(); i++) {
|
| + Regexp* newsub = child_args[i];
|
| + newsub->Decref();
|
| + }
|
| + return false;
|
| +}
|
| +
|
| +Regexp* CoalesceWalker::Copy(Regexp* re) {
|
| + return re->Incref();
|
| +}
|
| +
|
| +Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
| + // This should never be called, since we use Walk and not
|
| + // WalkExponential.
|
| + LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
|
| + return re->Incref();
|
| +}
|
| +
|
| +Regexp* CoalesceWalker::PostVisit(Regexp* re,
|
| + Regexp* parent_arg,
|
| + Regexp* pre_arg,
|
| + Regexp** child_args,
|
| + int nchild_args) {
|
| + if (re->nsub() == 0)
|
| + return re->Incref();
|
| +
|
| + if (re->op() != kRegexpConcat) {
|
| + if (!ChildArgsChanged(re, child_args))
|
| + return re->Incref();
|
| +
|
| + // Something changed. Build a new op.
|
| + Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
| + nre->AllocSub(re->nsub());
|
| + Regexp** nre_subs = nre->sub();
|
| + for (int i = 0; i < re->nsub(); i++)
|
| + nre_subs[i] = child_args[i];
|
| + // Repeats and Captures have additional data that must be copied.
|
| + if (re->op() == kRegexpRepeat) {
|
| + nre->min_ = re->min();
|
| + nre->max_ = re->max();
|
| + } else if (re->op() == kRegexpCapture) {
|
| + nre->cap_ = re->cap();
|
| + }
|
| + return nre;
|
| + }
|
| +
|
| + bool can_coalesce = false;
|
| + for (int i = 0; i < re->nsub(); i++) {
|
| + if (i+1 < re->nsub() &&
|
| + CanCoalesce(child_args[i], child_args[i+1])) {
|
| + can_coalesce = true;
|
| + break;
|
| + }
|
| + }
|
| + if (!can_coalesce) {
|
| + if (!ChildArgsChanged(re, child_args))
|
| + return re->Incref();
|
| +
|
| + // Something changed. Build a new op.
|
| + Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
| + nre->AllocSub(re->nsub());
|
| + Regexp** nre_subs = nre->sub();
|
| + for (int i = 0; i < re->nsub(); i++)
|
| + nre_subs[i] = child_args[i];
|
| + return nre;
|
| + }
|
| +
|
| + for (int i = 0; i < re->nsub(); i++) {
|
| + if (i+1 < re->nsub() &&
|
| + CanCoalesce(child_args[i], child_args[i+1]))
|
| + DoCoalesce(&child_args[i], &child_args[i+1]);
|
| + }
|
| + // Determine how many empty matches were left by DoCoalesce.
|
| + int n = 0;
|
| + for (int i = n; i < re->nsub(); i++) {
|
| + if (child_args[i]->op() == kRegexpEmptyMatch)
|
| + n++;
|
| + }
|
| + // Build a new op.
|
| + Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
| + nre->AllocSub(re->nsub() - n);
|
| + Regexp** nre_subs = nre->sub();
|
| + for (int i = 0, j = 0; i < re->nsub(); i++) {
|
| + if (child_args[i]->op() == kRegexpEmptyMatch) {
|
| + child_args[i]->Decref();
|
| + continue;
|
| + }
|
| + nre_subs[j] = child_args[i];
|
| + j++;
|
| + }
|
| + return nre;
|
| +}
|
| +
|
| +bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
|
| + // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
|
| + // any byte.
|
| + if ((r1->op() == kRegexpStar ||
|
| + r1->op() == kRegexpPlus ||
|
| + r1->op() == kRegexpQuest ||
|
| + r1->op() == kRegexpRepeat) &&
|
| + (r1->sub()[0]->op() == kRegexpLiteral ||
|
| + r1->sub()[0]->op() == kRegexpCharClass ||
|
| + r1->sub()[0]->op() == kRegexpAnyChar ||
|
| + r1->sub()[0]->op() == kRegexpAnyByte)) {
|
| + // r2 must be a star/plus/quest/repeat of the same literal, char class,
|
| + // any char or any byte.
|
| + if ((r2->op() == kRegexpStar ||
|
| + r2->op() == kRegexpPlus ||
|
| + r2->op() == kRegexpQuest ||
|
| + r2->op() == kRegexpRepeat) &&
|
| + Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
|
| + // The parse flags must be consistent.
|
| + ((r1->parse_flags() & Regexp::NonGreedy) ==
|
| + (r2->parse_flags() & Regexp::NonGreedy))) {
|
| + return true;
|
| + }
|
| + // ... OR an occurrence of that literal, char class, any char or any byte
|
| + if (Regexp::Equal(r1->sub()[0], r2)) {
|
| + return true;
|
| + }
|
| + // ... OR a literal string that begins with that literal.
|
| + if (r1->sub()[0]->op() == kRegexpLiteral &&
|
| + r2->op() == kRegexpLiteralString &&
|
| + r2->runes()[0] == r1->sub()[0]->rune() &&
|
| + // The parse flags must be consistent.
|
| + ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
|
| + (r2->parse_flags() & Regexp::FoldCase))) {
|
| + return true;
|
| + }
|
| + }
|
| + return false;
|
| +}
|
| +
|
| +void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
|
| + Regexp* r1 = *r1ptr;
|
| + Regexp* r2 = *r2ptr;
|
| +
|
| + Regexp* nre = Regexp::Repeat(
|
| + r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
|
| +
|
| + switch (r1->op()) {
|
| + case kRegexpStar:
|
| + nre->min_ = 0;
|
| + nre->max_ = -1;
|
| + break;
|
| +
|
| + case kRegexpPlus:
|
| + nre->min_ = 1;
|
| + nre->max_ = -1;
|
| + break;
|
| +
|
| + case kRegexpQuest:
|
| + nre->min_ = 0;
|
| + nre->max_ = 1;
|
| + break;
|
| +
|
| + case kRegexpRepeat:
|
| + nre->min_ = r1->min();
|
| + nre->max_ = r1->max();
|
| + break;
|
| +
|
| + default:
|
| + LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
|
| + nre->Decref();
|
| + return;
|
| + }
|
| +
|
| + switch (r2->op()) {
|
| + case kRegexpStar:
|
| + nre->max_ = -1;
|
| + goto LeaveEmpty;
|
| +
|
| + case kRegexpPlus:
|
| + nre->min_++;
|
| + nre->max_ = -1;
|
| + goto LeaveEmpty;
|
| +
|
| + case kRegexpQuest:
|
| + if (nre->max() != -1)
|
| + nre->max_++;
|
| + goto LeaveEmpty;
|
| +
|
| + case kRegexpRepeat:
|
| + nre->min_ += r2->min();
|
| + if (r2->max() == -1)
|
| + nre->max_ = -1;
|
| + else if (nre->max() != -1)
|
| + nre->max_ += r2->max();
|
| + goto LeaveEmpty;
|
| +
|
| + case kRegexpLiteral:
|
| + case kRegexpCharClass:
|
| + case kRegexpAnyChar:
|
| + case kRegexpAnyByte:
|
| + nre->min_++;
|
| + if (nre->max() != -1)
|
| + nre->max_++;
|
| + goto LeaveEmpty;
|
| +
|
| + LeaveEmpty:
|
| + *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
|
| + *r2ptr = nre;
|
| + break;
|
| +
|
| + case kRegexpLiteralString: {
|
| + Rune r = r1->sub()[0]->rune();
|
| + // Determine how much of the literal string is removed.
|
| + // We know that we have at least one rune. :)
|
| + int n = 1;
|
| + while (n < r2->nrunes() && r2->runes()[n] == r)
|
| + n++;
|
| + nre->min_ += n;
|
| + if (nre->max() != -1)
|
| + nre->max_ += n;
|
| + if (n == r2->nrunes())
|
| + goto LeaveEmpty;
|
| + *r1ptr = nre;
|
| + *r2ptr = Regexp::LiteralString(
|
| + &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
|
| + break;
|
| + }
|
| +
|
| + default:
|
| + LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
|
| + nre->Decref();
|
| + return;
|
| + }
|
| +
|
| + r1->Decref();
|
| + r2->Decref();
|
| +}
|
| +
|
| Regexp* SimplifyWalker::Copy(Regexp* re) {
|
| return re->Incref();
|
| }
|
| @@ -163,7 +438,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
|
| }
|
|
|
| Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
|
| - if (re->simple_) {
|
| + if (re->simple()) {
|
| *stop = true;
|
| return re->Incref();
|
| }
|
| @@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
| case kRegexpConcat:
|
| case kRegexpAlternate: {
|
| // These are simple as long as the subpieces are simple.
|
| - // Two passes to avoid allocation in the common case.
|
| - bool changed = false;
|
| - Regexp** subs = re->sub();
|
| - for (int i = 0; i < re->nsub_; i++) {
|
| - Regexp* sub = subs[i];
|
| - Regexp* newsub = child_args[i];
|
| - if (newsub != sub) {
|
| - changed = true;
|
| - break;
|
| - }
|
| - }
|
| - if (!changed) {
|
| - for (int i = 0; i < re->nsub_; i++) {
|
| - Regexp* newsub = child_args[i];
|
| - newsub->Decref();
|
| - }
|
| + if (!ChildArgsChanged(re, child_args)) {
|
| re->simple_ = true;
|
| return re->Incref();
|
| }
|
| Regexp* nre = new Regexp(re->op(), re->parse_flags());
|
| - nre->AllocSub(re->nsub_);
|
| + nre->AllocSub(re->nsub());
|
| Regexp** nre_subs = nre->sub();
|
| - for (int i = 0; i <re->nsub_; i++)
|
| + for (int i = 0; i < re->nsub(); i++)
|
| nre_subs[i] = child_args[i];
|
| nre->simple_ = true;
|
| return nre;
|
| @@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
|
| Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
|
| nre->AllocSub(1);
|
| nre->sub()[0] = newsub;
|
| - nre->cap_ = re->cap_;
|
| + nre->cap_ = re->cap();
|
| nre->simple_ = true;
|
| return nre;
|
| }
|
| @@ -325,7 +585,6 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
|
| // General case: x{4,} is xxxx+
|
| Regexp* nre = new Regexp(kRegexpConcat, f);
|
| nre->AllocSub(min);
|
| - VLOG(1) << "Simplify " << min;
|
| Regexp** nre_subs = nre->sub();
|
| for (int i = 0; i < min-1; i++)
|
| nre_subs[i] = re->Incref();
|
|
|