Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(252)

Unified Diff: third_party/re2/re2/simplify.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/simplify.cc
diff --git a/third_party/re2/re2/simplify.cc b/third_party/re2/re2/simplify.cc
deleted file mode 100644
index ecc60e7d8b127960534182ba03171c6bc1f0e2f0..0000000000000000000000000000000000000000
--- a/third_party/re2/re2/simplify.cc
+++ /dev/null
@@ -1,652 +0,0 @@
-// Copyright 2006 The RE2 Authors. All Rights Reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// Rewrite POSIX and other features in re
-// to use simple extended regular expression features.
-// Also sort and simplify character classes.
-
-#include "util/util.h"
-#include "re2/regexp.h"
-#include "re2/walker-inl.h"
-
-namespace re2 {
-
-// Parses the regexp src and then simplifies it and sets *dst to the
-// string representation of the simplified form. Returns true on success.
-// Returns false and sets *error (if error != NULL) on error.
-bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
- string* dst,
- RegexpStatus* status) {
- Regexp* re = Parse(src, flags, status);
- if (re == NULL)
- return false;
- Regexp* sre = re->Simplify();
- re->Decref();
- if (sre == NULL) {
- // Should not happen, since Simplify never fails.
- LOG(ERROR) << "Simplify failed on " << src;
- if (status) {
- status->set_code(kRegexpInternalError);
- status->set_error_arg(src);
- }
- return false;
- }
- *dst = sre->ToString();
- sre->Decref();
- return true;
-}
-
-// Assuming the simple_ flags on the children are accurate,
-// is this Regexp* simple?
-bool Regexp::ComputeSimple() {
- Regexp** subs;
- switch (op_) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpLiteral:
- case kRegexpLiteralString:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpBeginText:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpEndText:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpHaveMatch:
- return true;
- case kRegexpConcat:
- case kRegexpAlternate:
- // These are simple as long as the subpieces are simple.
- subs = sub();
- for (int i = 0; i < nsub_; i++)
- if (!subs[i]->simple())
- return false;
- return true;
- case kRegexpCharClass:
- // Simple as long as the char class is not empty, not full.
- if (ccb_ != NULL)
- return !ccb_->empty() && !ccb_->full();
- return !cc_->empty() && !cc_->full();
- case kRegexpCapture:
- subs = sub();
- return subs[0]->simple();
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- subs = sub();
- if (!subs[0]->simple())
- return false;
- switch (subs[0]->op_) {
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest:
- case kRegexpEmptyMatch:
- case kRegexpNoMatch:
- return false;
- default:
- break;
- }
- return true;
- case kRegexpRepeat:
- return false;
- }
- LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
- return false;
-}
-
-// Walker subclass used by Simplify.
-// Coalesces runs of star/plus/quest/repeat of the same literal along with any
-// occurrences of that literal into repeats of that literal. It also works for
-// char classes, any char and any byte.
-// PostVisit creates the coalesced result, which should then be simplified.
-class CoalesceWalker : public Regexp::Walker<Regexp*> {
- public:
- CoalesceWalker() {}
- virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
- Regexp** child_args, int nchild_args);
- virtual Regexp* Copy(Regexp* re);
- virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
-
- private:
- // These functions are declared inside CoalesceWalker so that
- // they can edit the private fields of the Regexps they construct.
-
- // Returns true if r1 and r2 can be coalesced. In particular, ensures that
- // the parse flags are consistent. (They will not be checked again later.)
- static bool CanCoalesce(Regexp* r1, Regexp* r2);
-
- // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
- // will be empty match and the coalesced op. In other cases, where part of a
- // literal string was removed to be coalesced, the array elements afterwards
- // will be the coalesced op and the remainder of the literal string.
- static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
-
- DISALLOW_COPY_AND_ASSIGN(CoalesceWalker);
-};
-
-// Walker subclass used by Simplify.
-// The simplify walk is purely post-recursive: given the simplified children,
-// PostVisit creates the simplified result.
-// The child_args are simplified Regexp*s.
-class SimplifyWalker : public Regexp::Walker<Regexp*> {
- public:
- SimplifyWalker() {}
- virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
- virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
- Regexp** child_args, int nchild_args);
- virtual Regexp* Copy(Regexp* re);
- virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
-
- private:
- // These functions are declared inside SimplifyWalker so that
- // they can edit the private fields of the Regexps they construct.
-
- // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
- // Caller must Decref return value when done with it.
- static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
-
- // Simplifies the expression re{min,max} in terms of *, +, and ?.
- // Returns a new regexp. Does not edit re. Does not consume reference to re.
- // Caller must Decref return value when done with it.
- static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
- Regexp::ParseFlags parse_flags);
-
- // Simplifies a character class by expanding any named classes
- // into rune ranges. Does not edit re. Does not consume ref to re.
- // Caller must Decref return value when done with it.
- static Regexp* SimplifyCharClass(Regexp* re);
-
- DISALLOW_COPY_AND_ASSIGN(SimplifyWalker);
-};
-
-// Simplifies a regular expression, returning a new regexp.
-// The new regexp uses traditional Unix egrep features only,
-// plus the Perl (?:) non-capturing parentheses.
-// Otherwise, no POSIX or Perl additions. The new regexp
-// captures exactly the same subexpressions (with the same indices)
-// as the original.
-// Does not edit current object.
-// Caller must Decref() return value when done with it.
-
-Regexp* Regexp::Simplify() {
- CoalesceWalker cw;
- Regexp* cre = cw.Walk(this, NULL);
- if (cre == NULL)
- return cre;
- SimplifyWalker sw;
- Regexp* sre = sw.Walk(cre, NULL);
- cre->Decref();
- return sre;
-}
-
-#define Simplify DontCallSimplify // Avoid accidental recursion
-
-// Utility function for PostVisit implementations that compares re->sub() with
-// child_args to determine whether any child_args changed. In the common case,
-// where nothing changed, calls Decref() for all child_args and returns false,
-// so PostVisit must return re->Incref(). Otherwise, returns true.
-static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
- for (int i = 0; i < re->nsub(); i++) {
- Regexp* sub = re->sub()[i];
- Regexp* newsub = child_args[i];
- if (newsub != sub)
- return true;
- }
- for (int i = 0; i < re->nsub(); i++) {
- Regexp* newsub = child_args[i];
- newsub->Decref();
- }
- return false;
-}
-
-Regexp* CoalesceWalker::Copy(Regexp* re) {
- return re->Incref();
-}
-
-Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
- // This should never be called, since we use Walk and not
- // WalkExponential.
- LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
- return re->Incref();
-}
-
-Regexp* CoalesceWalker::PostVisit(Regexp* re,
- Regexp* parent_arg,
- Regexp* pre_arg,
- Regexp** child_args,
- int nchild_args) {
- if (re->nsub() == 0)
- return re->Incref();
-
- if (re->op() != kRegexpConcat) {
- if (!ChildArgsChanged(re, child_args))
- return re->Incref();
-
- // Something changed. Build a new op.
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub());
- Regexp** nre_subs = nre->sub();
- for (int i = 0; i < re->nsub(); i++)
- nre_subs[i] = child_args[i];
- // Repeats and Captures have additional data that must be copied.
- if (re->op() == kRegexpRepeat) {
- nre->min_ = re->min();
- nre->max_ = re->max();
- } else if (re->op() == kRegexpCapture) {
- nre->cap_ = re->cap();
- }
- return nre;
- }
-
- bool can_coalesce = false;
- for (int i = 0; i < re->nsub(); i++) {
- if (i+1 < re->nsub() &&
- CanCoalesce(child_args[i], child_args[i+1])) {
- can_coalesce = true;
- break;
- }
- }
- if (!can_coalesce) {
- if (!ChildArgsChanged(re, child_args))
- return re->Incref();
-
- // Something changed. Build a new op.
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub());
- Regexp** nre_subs = nre->sub();
- for (int i = 0; i < re->nsub(); i++)
- nre_subs[i] = child_args[i];
- return nre;
- }
-
- for (int i = 0; i < re->nsub(); i++) {
- if (i+1 < re->nsub() &&
- CanCoalesce(child_args[i], child_args[i+1]))
- DoCoalesce(&child_args[i], &child_args[i+1]);
- }
- // Determine how many empty matches were left by DoCoalesce.
- int n = 0;
- for (int i = n; i < re->nsub(); i++) {
- if (child_args[i]->op() == kRegexpEmptyMatch)
- n++;
- }
- // Build a new op.
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub() - n);
- Regexp** nre_subs = nre->sub();
- for (int i = 0, j = 0; i < re->nsub(); i++) {
- if (child_args[i]->op() == kRegexpEmptyMatch) {
- child_args[i]->Decref();
- continue;
- }
- nre_subs[j] = child_args[i];
- j++;
- }
- return nre;
-}
-
-bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
- // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
- // any byte.
- if ((r1->op() == kRegexpStar ||
- r1->op() == kRegexpPlus ||
- r1->op() == kRegexpQuest ||
- r1->op() == kRegexpRepeat) &&
- (r1->sub()[0]->op() == kRegexpLiteral ||
- r1->sub()[0]->op() == kRegexpCharClass ||
- r1->sub()[0]->op() == kRegexpAnyChar ||
- r1->sub()[0]->op() == kRegexpAnyByte)) {
- // r2 must be a star/plus/quest/repeat of the same literal, char class,
- // any char or any byte.
- if ((r2->op() == kRegexpStar ||
- r2->op() == kRegexpPlus ||
- r2->op() == kRegexpQuest ||
- r2->op() == kRegexpRepeat) &&
- Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
- // The parse flags must be consistent.
- ((r1->parse_flags() & Regexp::NonGreedy) ==
- (r2->parse_flags() & Regexp::NonGreedy))) {
- return true;
- }
- // ... OR an occurrence of that literal, char class, any char or any byte
- if (Regexp::Equal(r1->sub()[0], r2)) {
- return true;
- }
- // ... OR a literal string that begins with that literal.
- if (r1->sub()[0]->op() == kRegexpLiteral &&
- r2->op() == kRegexpLiteralString &&
- r2->runes()[0] == r1->sub()[0]->rune() &&
- // The parse flags must be consistent.
- ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
- (r2->parse_flags() & Regexp::FoldCase))) {
- return true;
- }
- }
- return false;
-}
-
-void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
- Regexp* r1 = *r1ptr;
- Regexp* r2 = *r2ptr;
-
- Regexp* nre = Regexp::Repeat(
- r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
-
- switch (r1->op()) {
- case kRegexpStar:
- nre->min_ = 0;
- nre->max_ = -1;
- break;
-
- case kRegexpPlus:
- nre->min_ = 1;
- nre->max_ = -1;
- break;
-
- case kRegexpQuest:
- nre->min_ = 0;
- nre->max_ = 1;
- break;
-
- case kRegexpRepeat:
- nre->min_ = r1->min();
- nre->max_ = r1->max();
- break;
-
- default:
- LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
- nre->Decref();
- return;
- }
-
- switch (r2->op()) {
- case kRegexpStar:
- nre->max_ = -1;
- goto LeaveEmpty;
-
- case kRegexpPlus:
- nre->min_++;
- nre->max_ = -1;
- goto LeaveEmpty;
-
- case kRegexpQuest:
- if (nre->max() != -1)
- nre->max_++;
- goto LeaveEmpty;
-
- case kRegexpRepeat:
- nre->min_ += r2->min();
- if (r2->max() == -1)
- nre->max_ = -1;
- else if (nre->max() != -1)
- nre->max_ += r2->max();
- goto LeaveEmpty;
-
- case kRegexpLiteral:
- case kRegexpCharClass:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- nre->min_++;
- if (nre->max() != -1)
- nre->max_++;
- goto LeaveEmpty;
-
- LeaveEmpty:
- *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
- *r2ptr = nre;
- break;
-
- case kRegexpLiteralString: {
- Rune r = r1->sub()[0]->rune();
- // Determine how much of the literal string is removed.
- // We know that we have at least one rune. :)
- int n = 1;
- while (n < r2->nrunes() && r2->runes()[n] == r)
- n++;
- nre->min_ += n;
- if (nre->max() != -1)
- nre->max_ += n;
- if (n == r2->nrunes())
- goto LeaveEmpty;
- *r1ptr = nre;
- *r2ptr = Regexp::LiteralString(
- &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
- break;
- }
-
- default:
- LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
- nre->Decref();
- return;
- }
-
- r1->Decref();
- r2->Decref();
-}
-
-Regexp* SimplifyWalker::Copy(Regexp* re) {
- return re->Incref();
-}
-
-Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
- // This should never be called, since we use Walk and not
- // WalkExponential.
- LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
- return re->Incref();
-}
-
-Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
- if (re->simple()) {
- *stop = true;
- return re->Incref();
- }
- return NULL;
-}
-
-Regexp* SimplifyWalker::PostVisit(Regexp* re,
- Regexp* parent_arg,
- Regexp* pre_arg,
- Regexp** child_args,
- int nchild_args) {
- switch (re->op()) {
- case kRegexpNoMatch:
- case kRegexpEmptyMatch:
- case kRegexpLiteral:
- case kRegexpLiteralString:
- case kRegexpBeginLine:
- case kRegexpEndLine:
- case kRegexpBeginText:
- case kRegexpWordBoundary:
- case kRegexpNoWordBoundary:
- case kRegexpEndText:
- case kRegexpAnyChar:
- case kRegexpAnyByte:
- case kRegexpHaveMatch:
- // All these are always simple.
- re->simple_ = true;
- return re->Incref();
-
- case kRegexpConcat:
- case kRegexpAlternate: {
- // These are simple as long as the subpieces are simple.
- if (!ChildArgsChanged(re, child_args)) {
- re->simple_ = true;
- return re->Incref();
- }
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(re->nsub());
- Regexp** nre_subs = nre->sub();
- for (int i = 0; i < re->nsub(); i++)
- nre_subs[i] = child_args[i];
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpCapture: {
- Regexp* newsub = child_args[0];
- if (newsub == re->sub()[0]) {
- newsub->Decref();
- re->simple_ = true;
- return re->Incref();
- }
- Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
- nre->AllocSub(1);
- nre->sub()[0] = newsub;
- nre->cap_ = re->cap();
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpStar:
- case kRegexpPlus:
- case kRegexpQuest: {
- Regexp* newsub = child_args[0];
- // Special case: repeat the empty string as much as
- // you want, but it's still the empty string.
- if (newsub->op() == kRegexpEmptyMatch)
- return newsub;
-
- // These are simple as long as the subpiece is simple.
- if (newsub == re->sub()[0]) {
- newsub->Decref();
- re->simple_ = true;
- return re->Incref();
- }
-
- // These are also idempotent if flags are constant.
- if (re->op() == newsub->op() &&
- re->parse_flags() == newsub->parse_flags())
- return newsub;
-
- Regexp* nre = new Regexp(re->op(), re->parse_flags());
- nre->AllocSub(1);
- nre->sub()[0] = newsub;
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpRepeat: {
- Regexp* newsub = child_args[0];
- // Special case: repeat the empty string as much as
- // you want, but it's still the empty string.
- if (newsub->op() == kRegexpEmptyMatch)
- return newsub;
-
- Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
- re->parse_flags());
- newsub->Decref();
- nre->simple_ = true;
- return nre;
- }
-
- case kRegexpCharClass: {
- Regexp* nre = SimplifyCharClass(re);
- nre->simple_ = true;
- return nre;
- }
- }
-
- LOG(ERROR) << "Simplify case not handled: " << re->op();
- return re->Incref();
-}
-
-// Creates a concatenation of two Regexp, consuming refs to re1 and re2.
-// Returns a new Regexp, handing the ref to the caller.
-Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
- Regexp::ParseFlags parse_flags) {
- Regexp* re = new Regexp(kRegexpConcat, parse_flags);
- re->AllocSub(2);
- Regexp** subs = re->sub();
- subs[0] = re1;
- subs[1] = re2;
- return re;
-}
-
-// Simplifies the expression re{min,max} in terms of *, +, and ?.
-// Returns a new regexp. Does not edit re. Does not consume reference to re.
-// Caller must Decref return value when done with it.
-// The result will *not* necessarily have the right capturing parens
-// if you call ToString() and re-parse it: (x){2} becomes (x)(x),
-// but in the Regexp* representation, both (x) are marked as $1.
-Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
- Regexp::ParseFlags f) {
- // x{n,} means at least n matches of x.
- if (max == -1) {
- // Special case: x{0,} is x*
- if (min == 0)
- return Regexp::Star(re->Incref(), f);
-
- // Special case: x{1,} is x+
- if (min == 1)
- return Regexp::Plus(re->Incref(), f);
-
- // General case: x{4,} is xxxx+
- Regexp* nre = new Regexp(kRegexpConcat, f);
- nre->AllocSub(min);
- Regexp** nre_subs = nre->sub();
- for (int i = 0; i < min-1; i++)
- nre_subs[i] = re->Incref();
- nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
- return nre;
- }
-
- // Special case: (x){0} matches only empty string.
- if (min == 0 && max == 0)
- return new Regexp(kRegexpEmptyMatch, f);
-
- // Special case: x{1} is just x.
- if (min == 1 && max == 1)
- return re->Incref();
-
- // General case: x{n,m} means n copies of x and m copies of x?.
- // The machine will do less work if we nest the final m copies,
- // so that x{2,5} = xx(x(x(x)?)?)?
-
- // Build leading prefix: xx. Capturing only on the last one.
- Regexp* nre = NULL;
- if (min > 0) {
- nre = new Regexp(kRegexpConcat, f);
- nre->AllocSub(min);
- Regexp** nre_subs = nre->sub();
- for (int i = 0; i < min; i++)
- nre_subs[i] = re->Incref();
- }
-
- // Build and attach suffix: (x(x(x)?)?)?
- if (max > min) {
- Regexp* suf = Regexp::Quest(re->Incref(), f);
- for (int i = min+1; i < max; i++)
- suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
- if (nre == NULL)
- nre = suf;
- else
- nre = Concat2(nre, suf, f);
- }
-
- if (nre == NULL) {
- // Some degenerate case, like min > max, or min < max < 0.
- // This shouldn't happen, because the parser rejects such regexps.
- LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
- return new Regexp(kRegexpNoMatch, f);
- }
-
- return nre;
-}
-
-// Simplifies a character class.
-// Caller must Decref return value when done with it.
-Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
- CharClass* cc = re->cc();
-
- // Special cases
- if (cc->empty())
- return new Regexp(kRegexpNoMatch, re->parse_flags());
- if (cc->full())
- return new Regexp(kRegexpAnyChar, re->parse_flags());
-
- return re->Incref();
-}
-
-} // namespace re2
« no previous file with comments | « third_party/re2/re2/set.cc ('k') | third_party/re2/re2/stringpiece.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698