third_party/re2/re2/parse.cc - Issue 1530113002: Revert of Update re2

Unified Diff: third_party/re2/re2/parse.cc

Issue 1530113002: Revert of Update re2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: third_party/re2/re2/parse.cc

diff --git a/third_party/re2/re2/parse.cc b/third_party/re2/re2/parse.cc

index cf74f5a1abbe8b64d13e2dd326dba7336c427e25..0cf4ab411a9ce8853d5e60ebb06126b72424ca93 100644

--- a/third_party/re2/re2/parse.cc

+++ b/third_party/re2/re2/parse.cc

@@ -21,7 +21,6 @@

#include "re2/stringpiece.h"

#include "re2/unicode_casefold.h"

#include "re2/unicode_groups.h"

-#include "re2/walker-inl.h"

namespace re2 {

@@ -157,7 +156,7 @@

int ncap_; // number of capturing parens seen

int rune_max_; // maximum char value for this encoding

- DISALLOW_COPY_AND_ASSIGN(ParseState);

+ DISALLOW_EVIL_CONSTRUCTORS(ParseState);

};

// Pseudo-operators - only on parse stack.

@@ -215,8 +214,7 @@

// single characters (e.g., [.] instead of \.), and some

// analysis does better with fewer character classes.

// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.

- if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {

- re->ccb_->RemoveAbove(rune_max_);

+ if (re->op_ == kRegexpCharClass) {

if (re->ccb_->size() == 1) {

Rune r = re->ccb_->begin()->lo;

re->Decref();

@@ -242,8 +240,8 @@

// Searches the case folding tables and returns the CaseFold* that contains r.

// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.

// If there isn't one, returns NULL.

-const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {

- const CaseFold* ef = f + n;

+CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {

+ CaseFold* ef = f + n;

// Binary search for entry containing r.

while (n > 0) {

@@ -270,7 +268,7 @@

}

// Returns the result of applying the fold f to the rune r.

-Rune ApplyFold(const CaseFold *f, Rune r) {

+Rune ApplyFold(CaseFold *f, Rune r) {

switch (f->delta) {

default:

return r + f->delta;

@@ -306,7 +304,7 @@

// CycleFoldRune('?') = '?'

Rune CycleFoldRune(Rune r) {

- const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);

+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);

if (f == NULL || r < f->lo)

return r;

return ApplyFold(f, r);

@@ -329,7 +327,7 @@

return;

while (lo <= hi) {

- const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);

+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);

if (f == NULL) // lo has no fold, nor does anything above lo

break;

if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo

@@ -379,6 +377,7 @@

}

r = CycleFoldRune(r);

} while (r != r1);

+ re->ccb_->RemoveAbove(rune_max_);

return PushRegexp(re);

}

@@ -464,59 +463,6 @@

return true;

}

-// RepetitionWalker reports whether the repetition regexp is valid.

-// Valid means that the combination of the top-level repetition

-// and any inner repetitions does not exceed n copies of the

-// innermost thing.

-// This rewalks the regexp tree and is called for every repetition,

-// so we have to worry about inducing quadratic behavior in the parser.

-// We avoid this by only using RepetitionWalker when min or max >= 2.

-// In that case the depth of any >= 2 nesting can only get to 9 without

-// triggering a parse error, so each subtree can only be rewalked 9 times.

-class RepetitionWalker : public Regexp::Walker<int> {

- public:

- RepetitionWalker() {}

- virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);

- virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,

- int* child_args, int nchild_args);

- virtual int ShortVisit(Regexp* re, int parent_arg);

- private:

- DISALLOW_COPY_AND_ASSIGN(RepetitionWalker);

-};

-int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {

- int arg = parent_arg;

- if (re->op() == kRegexpRepeat) {

- int m = re->max();

- if (m < 0) {

- m = re->min();

- }

- if (m > 0) {

- arg /= m;

- }

- return arg;

-int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,

- int* child_args, int nchild_args) {

- int arg = pre_arg;

- for (int i = 0; i < nchild_args; i++) {

- if (child_args[i] < arg) {

- arg = child_args[i];

- }

- return arg;

-int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {

- // This should never be called, since we use Walk and not

- // WalkExponential.

- LOG(DFATAL) << "RepetitionWalker::ShortVisit called";

- return 0;

// Pushes a repetition regexp onto the stack.

// A valid argument for the operator must already be on the stack.

bool Regexp::ParseState::PushRepetition(int min, int max,

@@ -542,15 +488,8 @@

re->down_ = stacktop_->down_;

re->sub()[0] = FinishRegexp(stacktop_);

re->simple_ = re->ComputeSimple();

stacktop_ = re;

- if (min >= 2 || max >= 2) {

- RepetitionWalker w;

- if (w.Walk(stacktop_, 1000) == 0) {

- status_->set_code(kRegexpRepeatSize);

- status_->set_error_arg(s);

- return false;

- }

return true;

}

@@ -576,6 +515,13 @@

return PushRegexp(re);

}

+// Adds r to cc, along with r's upper case if foldascii is set.

+static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {

+ cc->AddRange(r, r);

+ if (foldascii && 'a' <= r && r <= 'z')

+ cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');

// Processes a vertical bar in the input.

bool Regexp::ParseState::DoVerticalBar() {

MaybeConcatString(-1, NoParseFlags);

@@ -589,34 +535,46 @@

Regexp* r1;

Regexp* r2;

if ((r1 = stacktop_) != NULL &&

- (r2 = r1->down_) != NULL &&

+ (r2 = stacktop_->down_) != NULL &&

r2->op() == kVerticalBar) {

+ // If above and below vertical bar are literal or char class,

+ // can merge into a single char class.

Regexp* r3;

- if ((r3 = r2->down_) != NULL &&

- (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {

- // AnyChar is above or below the vertical bar. Let it subsume

- // the other when the other is Literal, CharClass or AnyChar.

- if (r3->op() == kRegexpAnyChar &&

- (r1->op() == kRegexpLiteral ||

- r1->op() == kRegexpCharClass ||

- r1->op() == kRegexpAnyChar)) {

- // Discard r1.

- stacktop_ = r2;

- r1->Decref();

- return true;

+ if ((r1->op() == kRegexpLiteral ||

+ r1->op() == kRegexpCharClass ||

+ r1->op() == kRegexpAnyChar) &&

+ (r3 = r2->down_) != NULL) {

+ Rune rune;

+ switch (r3->op()) {

+ case kRegexpLiteral: // convert to char class

+ rune = r3->rune_;

+ r3->op_ = kRegexpCharClass;

+ r3->cc_ = NULL;

+ r3->ccb_ = new CharClassBuilder;

+ AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);

+ // fall through

+ case kRegexpCharClass:

+ if (r1->op() == kRegexpLiteral)

+ AddLiteral(r3->ccb_, r1->rune_,

+ r1->parse_flags_ & Regexp::FoldCase);

+ else if (r1->op() == kRegexpCharClass)

+ r3->ccb_->AddCharClass(r1->ccb_);

+ if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {

+ delete r3->ccb_;

+ r3->ccb_ = NULL;

+ r3->op_ = kRegexpAnyChar;

+ }

+ // fall through

+ case kRegexpAnyChar:

+ // pop r1

+ stacktop_ = r2;

+ r1->Decref();

+ return true;

+ default:

+ break;

}

- if (r1->op() == kRegexpAnyChar &&

- (r3->op() == kRegexpLiteral ||

- r3->op() == kRegexpCharClass ||

- r3->op() == kRegexpAnyChar)) {

- // Rearrange the stack and discard r3.

- r1->down_ = r3->down_;

- r2->down_ = r1;

- stacktop_ = r2;

- r3->Decref();

- return true;

- }

+ }

// Swap r1 below vertical bar (r2).

r1->down_ = r2->down_;

r2->down_ = r1;

@@ -1147,7 +1105,7 @@

if (r >= 0) {

re1->op_ = kRegexpLiteral;

re1->rune_ = r;

- re1->parse_flags_ = static_cast<uint16>(flags);

+ re1->parse_flags_ = flags;

return true;

}

@@ -1230,14 +1188,6 @@

int n;

if (fullrune(sp->data(), sp->size())) {

n = chartorune(r, sp->data());

- // Some copies of chartorune have a bug that accepts

- // encodings of values in (10FFFF, 1FFFFF] as valid.

- // Those values break the character class algorithm,

- // which assumes Runemax is the largest rune.

- if (*r > Runemax) {

- n = 1;

- *r = Runeerror;

- }

if (!(n == 1 && *r == Runeerror)) { // no decoding error

sp->remove_prefix(n);

return n;

@@ -1340,8 +1290,6 @@

}

- if (code > rune_max)

- goto BadEscape;

*rp = code;

return true;

@@ -1427,8 +1375,7 @@

BadEscape:

// Unrecognized escape sequence.

status->set_code(kRegexpBadEscape);

- status->set_error_arg(

- StringPiece(begin, static_cast<int>(s->data() - begin)));

+ status->set_error_arg(StringPiece(begin, s->data() - begin));

return false;

}

@@ -1456,8 +1403,8 @@

}

// Look for a group with the given name.

-static const UGroup* LookupGroup(const StringPiece& name,

- const UGroup *groups, int ngroups) {

+static UGroup* LookupGroup(const StringPiece& name,

+ UGroup *groups, int ngroups) {

// Simple name lookup.

for (int i = 0; i < ngroups; i++)

if (StringPiece(groups[i].name) == name)

@@ -1471,16 +1418,16 @@

static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };

// Look for a POSIX group with the given name (e.g., "[:^alpha:]")

-static const UGroup* LookupPosixGroup(const StringPiece& name) {

+static UGroup* LookupPosixGroup(const StringPiece& name) {

return LookupGroup(name, posix_groups, num_posix_groups);

}

-static const UGroup* LookupPerlGroup(const StringPiece& name) {

+static UGroup* LookupPerlGroup(const StringPiece& name) {

return LookupGroup(name, perl_groups, num_perl_groups);

}

// Look for a Unicode group with the given name (e.g., "Han")

-static const UGroup* LookupUnicodeGroup(const StringPiece& name) {

+static UGroup* LookupUnicodeGroup(const StringPiece& name) {

// Special case: "Any" means any.

if (name == StringPiece("Any"))

return &anygroup;

@@ -1488,7 +1435,7 @@

}

// Add a UGroup or its negation to the character class.

-static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,

+static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign,

Regexp::ParseFlags parse_flags) {

if (sign == +1) {

for (int i = 0; i < g->nr16; i++) {

@@ -1539,7 +1486,7 @@

// On success, sets *s to span the remainder of the string

// and returns the corresponding UGroup.

// The StringPiece must *NOT* be edited unless the call succeeds.

-const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {

+UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {

if (!(parse_flags & Regexp::PerlClasses))

return NULL;

if (s->size() < 2 || (*s)[0] != '\\')

@@ -1547,7 +1494,7 @@

// Could use StringPieceToRune, but there aren't

// any non-ASCII Perl group names.

StringPiece name(s->begin(), 2);

- const UGroup *g = LookupPerlGroup(name);

+ UGroup *g = LookupPerlGroup(name);

if (g == NULL)

return NULL;

s->remove_prefix(name.size());

@@ -1587,10 +1534,10 @@

if (c != '{') {

// Name is the bit of string we just skipped over for c.

const char* p = seq.begin() + 2;

- name = StringPiece(p, static_cast<int>(s->begin() - p));

+ name = StringPiece(p, s->begin() - p);

} else {

// Name is in braces. Look for closing }

- size_t end = s->find('}', 0);

+ int end = s->find('}', 0);

if (end == s->npos) {

if (!IsValidUTF8(seq, status))

return kParseError;

@@ -1598,21 +1545,21 @@

status->set_error_arg(seq);

return kParseError;

}

- name = StringPiece(s->begin(), static_cast<int>(end)); // without '}'

- s->remove_prefix(static_cast<int>(end) + 1); // with '}'

+ name = StringPiece(s->begin(), end); // without '}'

+ s->remove_prefix(end + 1); // with '}'

if (!IsValidUTF8(name, status))

return kParseError;

}

// Chop seq where s now begins.

- seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin()));

+ seq = StringPiece(seq.begin(), s->begin() - seq.begin());

// Look up group

if (name.size() > 0 && name[0] == '^') {

sign = -sign;

name.remove_prefix(1); // '^'

}

- const UGroup *g = LookupUnicodeGroup(name);

+ UGroup *g = LookupUnicodeGroup(name);

if (g == NULL) {

status->set_code(kRegexpBadCharRange);

status->set_error_arg(seq);

@@ -1646,9 +1593,9 @@

// Got it. Check that it's valid.

q += 2;

- StringPiece name(p, static_cast<int>(q-p));

- const UGroup *g = LookupPosixGroup(name);

+ StringPiece name(p, q-p);

+ UGroup *g = LookupPosixGroup(name);

if (g == NULL) {

status->set_code(kRegexpBadCharRange);

status->set_error_arg(name);

@@ -1700,8 +1647,7 @@

return false;

if (rr->hi < rr->lo) {

status->set_code(kRegexpBadCharRange);

- status->set_error_arg(

- StringPiece(os.data(), static_cast<int>(s->data() - os.data())));

+ status->set_error_arg(StringPiece(os.data(), s->data() - os.data()));

return false;

}

} else {

@@ -1786,7 +1732,7 @@

}

// Look for Perl character class symbols (extension).

- const UGroup *g = MaybeParsePerlCCEscape(s, flags_);

+ UGroup *g = MaybeParsePerlCCEscape(s, flags_);

if (g != NULL) {

AddUGroup(re->ccb_, g, g->sign, flags_);

continue;

@@ -1815,6 +1761,7 @@

if (negated)

re->ccb_->Negate();

+ re->ccb_->RemoveAbove(rune_max_);

*out_re = re;

return true;

@@ -1873,7 +1820,7 @@

// so that's the one we implement. One is enough.

if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {

// Pull out name.

- size_t end = t.find('>', 2);

+ int end = t.find('>', 2);

if (end == t.npos) {

if (!IsValidUTF8(*s, status_))

return false;

@@ -1883,8 +1830,8 @@

}

// t is "P<name>...", t[end] == '>'

- StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>"

- StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name"

+ StringPiece capture(t.begin()-2, end+3); // "(?P<name>"

+ StringPiece name(t.begin()+2, end-2); // "name"

if (!IsValidUTF8(name, status_))

return false;

if (!IsValidCaptureName(name)) {

@@ -1898,7 +1845,7 @@

return false;

}

- s->remove_prefix(static_cast<int>(capture.end() - s->begin()));

+ s->remove_prefix(capture.end() - s->begin());

return true;

}

@@ -1981,8 +1928,7 @@

BadPerlOp:

status_->set_code(kRegexpBadPerlOp);

- status_->set_error_arg(

- StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin())));

+ status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin()));

return false;

}

@@ -2129,13 +2075,12 @@

// a** is a syntax error, not a double-star.

// (and a++ means something else entirely, which we don't support!)

status->set_code(kRegexpRepeatOp);

- status->set_error_arg(

- StringPiece(lastunary.begin(),

- static_cast<int>(t.begin() - lastunary.begin())));

+ status->set_error_arg(StringPiece(lastunary.begin(),

+ t.begin() - lastunary.begin()));

return NULL;

}

- opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));

+ opstr.set(opstr.data(), t.data() - opstr.data());

if (!ps.PushRepeatOp(op, opstr, nongreedy))

return NULL;

isunary = opstr;

@@ -2161,13 +2106,12 @@

if (lastunary.size() > 0) {

// Not allowed to stack repetition operators.

status->set_code(kRegexpRepeatOp);

- status->set_error_arg(

- StringPiece(lastunary.begin(),

- static_cast<int>(t.begin() - lastunary.begin())));

+ status->set_error_arg(StringPiece(lastunary.begin(),

+ t.begin() - lastunary.begin()));

return NULL;

}

- opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));

+ opstr.set(opstr.data(), t.data() - opstr.data());

if (!ps.PushRepetition(lo, hi, opstr, nongreedy))

return NULL;

isunary = opstr;

@@ -2243,7 +2187,7 @@

}

- const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());

+ UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());

if (g != NULL) {

Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);

re->ccb_ = new CharClassBuilder;

« no previous file with comments | « third_party/re2/re2/onepass.cc ('k') | third_party/re2/re2/perl_groups.cc » ('j') | no next file with comments »