Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(105)

Unified Diff: third_party/re2/re2/simplify.cc

Issue 1516543002: Update re2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: updated update instructions Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/simplify.cc
diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc
index faf32084e05f9f171b33fdc076d465e0ec693600..ecc60e7d8b127960534182ba03171c6bc1f0e2f0 100644
--- a/third_party/re2/re2/simplify.cc
+++ b/third_party/re2/re2/simplify.cc
@@ -61,7 +61,7 @@ bool Regexp::ComputeSimple() {
// These are simple as long as the subpieces are simple.
subs = sub();
for (int i = 0; i < nsub_; i++)
- if (!subs[i]->simple_)
+ if (!subs[i]->simple())
return false;
return true;
case kRegexpCharClass:
@@ -71,12 +71,12 @@ bool Regexp::ComputeSimple() {
return !cc_->empty() && !cc_->full();
case kRegexpCapture:
subs = sub();
- return subs[0]->simple_;
+ return subs[0]->simple();
case kRegexpStar:
case kRegexpPlus:
case kRegexpQuest:
subs = sub();
- if (!subs[0]->simple_)
+ if (!subs[0]->simple())
return false;
switch (subs[0]->op_) {
case kRegexpStar:
@@ -97,6 +97,36 @@ bool Regexp::ComputeSimple() {
}
// Walker subclass used by Simplify.
+// Coalesces runs of star/plus/quest/repeat of the same literal along with any
+// occurrences of that literal into repeats of that literal. It also works for
+// char classes, any char and any byte.
+// PostVisit creates the coalesced result, which should then be simplified.
+class CoalesceWalker : public Regexp::Walker<Regexp*> {
+ public:
+ CoalesceWalker() {}
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
+ Regexp** child_args, int nchild_args);
+ virtual Regexp* Copy(Regexp* re);
+ virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
+
+ private:
+ // These functions are declared inside CoalesceWalker so that
+ // they can edit the private fields of the Regexps they construct.
+
+ // Returns true if r1 and r2 can be coalesced. In particular, ensures that
+ // the parse flags are consistent. (They will not be checked again later.)
+ static bool CanCoalesce(Regexp* r1, Regexp* r2);
+
+ // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
+ // will be empty match and the coalesced op. In other cases, where part of a
+ // literal string was removed to be coalesced, the array elements afterwards
+ // will be the coalesced op and the remainder of the literal string.
+ static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
+
+ DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
+};
+
+// Walker subclass used by Simplify.
// The simplify walk is purely post-recursive: given the simplified children,
// PostVisit creates the simplified result.
// The child_args are simplified Regexp*s.
@@ -104,9 +134,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
public:
SimplifyWalker() {}
virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
- virtual Regexp* PostVisit(Regexp* re,
- Regexp* parent_arg,
- Regexp* pre_arg,
+ virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
Regexp** child_args, int nchild_args);
virtual Regexp* Copy(Regexp* re);
virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
@@ -130,7 +158,7 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
// Caller must Decref return value when done with it.
static Regexp* SimplifyCharClass(Regexp* re);
- DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
+ DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
};
// Simplifies a regular expression, returning a new regexp.
@@ -143,14 +171,261 @@ class SimplifyWalker : public Regexp::Walker<Regexp*> {
// Caller must Decref() return value when done with it.
Regexp* Regexp::Simplify() {
- if (simple_)
- return Incref();
- SimplifyWalker w;
- return w.Walk(this, NULL);
+ CoalesceWalker cw;
+ Regexp* cre = cw.Walk(this, NULL);
+ if (cre == NULL)
+ return cre;
+ SimplifyWalker sw;
+ Regexp* sre = sw.Walk(cre, NULL);
+ cre->Decref();
+ return sre;
}
#define Simplify DontCallSimplify // Avoid accidental recursion
+// Utility function for PostVisit implementations that compares re->sub() with
+// child_args to determine whether any child_args changed. In the common case,
+// where nothing changed, calls Decref() for all child_args and returns false,
+// so PostVisit must return re->Incref(). Otherwise, returns true.
+static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
+ for (int i = 0; i < re->nsub(); i++) {
+ Regexp* sub = re->sub()[i];
+ Regexp* newsub = child_args[i];
+ if (newsub != sub)
+ return true;
+ }
+ for (int i = 0; i < re->nsub(); i++) {
+ Regexp* newsub = child_args[i];
+ newsub->Decref();
+ }
+ return false;
+}
+
+Regexp* CoalesceWalker::Copy(Regexp* re) {
+ return re->Incref();
+}
+
+Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
+ // This should never be called, since we use Walk and not
+ // WalkExponential.
+ LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
+ return re->Incref();
+}
+
+Regexp* CoalesceWalker::PostVisit(Regexp* re,
+ Regexp* parent_arg,
+ Regexp* pre_arg,
+ Regexp** child_args,
+ int nchild_args) {
+ if (re->nsub() == 0)
+ return re->Incref();
+
+ if (re->op() != kRegexpConcat) {
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ // Repeats and Captures have additional data that must be copied.
+ if (re->op() == kRegexpRepeat) {
+ nre->min_ = re->min();
+ nre->max_ = re->max();
+ } else if (re->op() == kRegexpCapture) {
+ nre->cap_ = re->cap();
+ }
+ return nre;
+ }
+
+ bool can_coalesce = false;
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1])) {
+ can_coalesce = true;
+ break;
+ }
+ }
+ if (!can_coalesce) {
+ if (!ChildArgsChanged(re, child_args))
+ return re->Incref();
+
+ // Something changed. Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub());
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0; i < re->nsub(); i++)
+ nre_subs[i] = child_args[i];
+ return nre;
+ }
+
+ for (int i = 0; i < re->nsub(); i++) {
+ if (i+1 < re->nsub() &&
+ CanCoalesce(child_args[i], child_args[i+1]))
+ DoCoalesce(&child_args[i], &child_args[i+1]);
+ }
+ // Determine how many empty matches were left by DoCoalesce.
+ int n = 0;
+ for (int i = n; i < re->nsub(); i++) {
+ if (child_args[i]->op() == kRegexpEmptyMatch)
+ n++;
+ }
+ // Build a new op.
+ Regexp* nre = new Regexp(re->op(), re->parse_flags());
+ nre->AllocSub(re->nsub() - n);
+ Regexp** nre_subs = nre->sub();
+ for (int i = 0, j = 0; i < re->nsub(); i++) {
+ if (child_args[i]->op() == kRegexpEmptyMatch) {
+ child_args[i]->Decref();
+ continue;
+ }
+ nre_subs[j] = child_args[i];
+ j++;
+ }
+ return nre;
+}
+
+bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
+ // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
+ // any byte.
+ if ((r1->op() == kRegexpStar ||
+ r1->op() == kRegexpPlus ||
+ r1->op() == kRegexpQuest ||
+ r1->op() == kRegexpRepeat) &&
+ (r1->sub()[0]->op() == kRegexpLiteral ||
+ r1->sub()[0]->op() == kRegexpCharClass ||
+ r1->sub()[0]->op() == kRegexpAnyChar ||
+ r1->sub()[0]->op() == kRegexpAnyByte)) {
+ // r2 must be a star/plus/quest/repeat of the same literal, char class,
+ // any char or any byte.
+ if ((r2->op() == kRegexpStar ||
+ r2->op() == kRegexpPlus ||
+ r2->op() == kRegexpQuest ||
+ r2->op() == kRegexpRepeat) &&
+ Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
+ // The parse flags must be consistent.
+ ((r1->parse_flags() & Regexp::NonGreedy) ==
+ (r2->parse_flags() & Regexp::NonGreedy))) {
+ return true;
+ }
+ // ... OR an occurrence of that literal, char class, any char or any byte
+ if (Regexp::Equal(r1->sub()[0], r2)) {
+ return true;
+ }
+ // ... OR a literal string that begins with that literal.
+ if (r1->sub()[0]->op() == kRegexpLiteral &&
+ r2->op() == kRegexpLiteralString &&
+ r2->runes()[0] == r1->sub()[0]->rune() &&
+ // The parse flags must be consistent.
+ ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
+ (r2->parse_flags() & Regexp::FoldCase))) {
+ return true;
+ }
+ }
+ return false;
+}
+
+void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
+ Regexp* r1 = *r1ptr;
+ Regexp* r2 = *r2ptr;
+
+ Regexp* nre = Regexp::Repeat(
+ r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
+
+ switch (r1->op()) {
+ case kRegexpStar:
+ nre->min_ = 0;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpPlus:
+ nre->min_ = 1;
+ nre->max_ = -1;
+ break;
+
+ case kRegexpQuest:
+ nre->min_ = 0;
+ nre->max_ = 1;
+ break;
+
+ case kRegexpRepeat:
+ nre->min_ = r1->min();
+ nre->max_ = r1->max();
+ break;
+
+ default:
+ LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
+ nre->Decref();
+ return;
+ }
+
+ switch (r2->op()) {
+ case kRegexpStar:
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpPlus:
+ nre->min_++;
+ nre->max_ = -1;
+ goto LeaveEmpty;
+
+ case kRegexpQuest:
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ case kRegexpRepeat:
+ nre->min_ += r2->min();
+ if (r2->max() == -1)
+ nre->max_ = -1;
+ else if (nre->max() != -1)
+ nre->max_ += r2->max();
+ goto LeaveEmpty;
+
+ case kRegexpLiteral:
+ case kRegexpCharClass:
+ case kRegexpAnyChar:
+ case kRegexpAnyByte:
+ nre->min_++;
+ if (nre->max() != -1)
+ nre->max_++;
+ goto LeaveEmpty;
+
+ LeaveEmpty:
+ *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
+ *r2ptr = nre;
+ break;
+
+ case kRegexpLiteralString: {
+ Rune r = r1->sub()[0]->rune();
+ // Determine how much of the literal string is removed.
+ // We know that we have at least one rune. :)
+ int n = 1;
+ while (n < r2->nrunes() && r2->runes()[n] == r)
+ n++;
+ nre->min_ += n;
+ if (nre->max() != -1)
+ nre->max_ += n;
+ if (n == r2->nrunes())
+ goto LeaveEmpty;
+ *r1ptr = nre;
+ *r2ptr = Regexp::LiteralString(
+ &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
+ break;
+ }
+
+ default:
+ LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
+ nre->Decref();
+ return;
+ }
+
+ r1->Decref();
+ r2->Decref();
+}
+
Regexp* SimplifyWalker::Copy(Regexp* re) {
return re->Incref();
}
@@ -163,7 +438,7 @@ Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
}
Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
- if (re->simple_) {
+ if (re->simple()) {
*stop = true;
return re->Incref();
}
@@ -196,29 +471,14 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
case kRegexpConcat:
case kRegexpAlternate: {
// These are simple as long as the subpieces are simple.
- // Two passes to avoid allocation in the common case.
- bool changed = false;
- Regexp** subs = re->sub();
- for (int i = 0; i < re->nsub_; i++) {
- Regexp* sub = subs[i];
- Regexp* newsub = child_args[i];
- if (newsub != sub) {
- changed = true;
- break;
- }
- }
- if (!changed) {
- for (int i = 0; i < re->nsub_; i++) {
- Regexp* newsub = child_args[i];
- newsub->Decref();
- }
+ if (!ChildArgsChanged(re, child_args)) {
re->simple_ = true;
return re->Incref();
}
Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub_);
+ nre->AllocSub(re->nsub());
Regexp** nre_subs = nre->sub();
- for (int i = 0; i <re->nsub_; i++)
+ for (int i = 0; i < re->nsub(); i++)
nre_subs[i] = child_args[i];
nre->simple_ = true;
return nre;
@@ -234,7 +494,7 @@ Regexp* SimplifyWalker::PostVisit(Regexp* re,
Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
nre->AllocSub(1);
nre->sub()[0] = newsub;
- nre->cap_ = re->cap_;
+ nre->cap_ = re->cap();
nre->simple_ = true;
return nre;
}
@@ -325,7 +585,6 @@ Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
// General case: x{4,} is xxxx+
Regexp* nre = new Regexp(kRegexpConcat, f);
nre->AllocSub(min);
- VLOG(1) << "Simplify " << min;
Regexp** nre_subs = nre->sub();
for (int i = 0; i < min-1; i++)
nre_subs[i] = re->Incref();
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698