Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Unified Diff: third_party/re2/re2/parse.cc

Issue 1530113002: Revert of Update re2 (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « third_party/re2/re2/onepass.cc ('k') | third_party/re2/re2/perl_groups.cc » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: third_party/re2/re2/parse.cc
diff --git a/third_party/re2/re2/parse.cc b/third_party/re2/re2/parse.cc
index cf74f5a1abbe8b64d13e2dd326dba7336c427e25..0cf4ab411a9ce8853d5e60ebb06126b72424ca93 100644
--- a/third_party/re2/re2/parse.cc
+++ b/third_party/re2/re2/parse.cc
@@ -21,7 +21,6 @@
#include "re2/stringpiece.h"
#include "re2/unicode_casefold.h"
#include "re2/unicode_groups.h"
-#include "re2/walker-inl.h"
namespace re2 {
@@ -157,7 +156,7 @@
int ncap_; // number of capturing parens seen
int rune_max_; // maximum char value for this encoding
- DISALLOW_COPY_AND_ASSIGN(ParseState);
+ DISALLOW_EVIL_CONSTRUCTORS(ParseState);
};
// Pseudo-operators - only on parse stack.
@@ -215,8 +214,7 @@
// single characters (e.g., [.] instead of \.), and some
// analysis does better with fewer character classes.
// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
- if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
- re->ccb_->RemoveAbove(rune_max_);
+ if (re->op_ == kRegexpCharClass) {
if (re->ccb_->size() == 1) {
Rune r = re->ccb_->begin()->lo;
re->Decref();
@@ -242,8 +240,8 @@
// Searches the case folding tables and returns the CaseFold* that contains r.
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
// If there isn't one, returns NULL.
-const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
- const CaseFold* ef = f + n;
+CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {
+ CaseFold* ef = f + n;
// Binary search for entry containing r.
while (n > 0) {
@@ -270,7 +268,7 @@
}
// Returns the result of applying the fold f to the rune r.
-Rune ApplyFold(const CaseFold *f, Rune r) {
+Rune ApplyFold(CaseFold *f, Rune r) {
switch (f->delta) {
default:
return r + f->delta;
@@ -306,7 +304,7 @@
//
// CycleFoldRune('?') = '?'
Rune CycleFoldRune(Rune r) {
- const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
if (f == NULL || r < f->lo)
return r;
return ApplyFold(f, r);
@@ -329,7 +327,7 @@
return;
while (lo <= hi) {
- const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
+ CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
if (f == NULL) // lo has no fold, nor does anything above lo
break;
if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
@@ -379,6 +377,7 @@
}
r = CycleFoldRune(r);
} while (r != r1);
+ re->ccb_->RemoveAbove(rune_max_);
return PushRegexp(re);
}
@@ -464,59 +463,6 @@
return true;
}
-// RepetitionWalker reports whether the repetition regexp is valid.
-// Valid means that the combination of the top-level repetition
-// and any inner repetitions does not exceed n copies of the
-// innermost thing.
-// This rewalks the regexp tree and is called for every repetition,
-// so we have to worry about inducing quadratic behavior in the parser.
-// We avoid this by only using RepetitionWalker when min or max >= 2.
-// In that case the depth of any >= 2 nesting can only get to 9 without
-// triggering a parse error, so each subtree can only be rewalked 9 times.
-class RepetitionWalker : public Regexp::Walker<int> {
- public:
- RepetitionWalker() {}
- virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
- virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
- int* child_args, int nchild_args);
- virtual int ShortVisit(Regexp* re, int parent_arg);
-
- private:
- DISALLOW_COPY_AND_ASSIGN(RepetitionWalker);
-};
-
-int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
- int arg = parent_arg;
- if (re->op() == kRegexpRepeat) {
- int m = re->max();
- if (m < 0) {
- m = re->min();
- }
- if (m > 0) {
- arg /= m;
- }
- }
- return arg;
-}
-
-int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
- int* child_args, int nchild_args) {
- int arg = pre_arg;
- for (int i = 0; i < nchild_args; i++) {
- if (child_args[i] < arg) {
- arg = child_args[i];
- }
- }
- return arg;
-}
-
-int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
- // This should never be called, since we use Walk and not
- // WalkExponential.
- LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
- return 0;
-}
-
// Pushes a repetition regexp onto the stack.
// A valid argument for the operator must already be on the stack.
bool Regexp::ParseState::PushRepetition(int min, int max,
@@ -542,15 +488,8 @@
re->down_ = stacktop_->down_;
re->sub()[0] = FinishRegexp(stacktop_);
re->simple_ = re->ComputeSimple();
+
stacktop_ = re;
- if (min >= 2 || max >= 2) {
- RepetitionWalker w;
- if (w.Walk(stacktop_, 1000) == 0) {
- status_->set_code(kRegexpRepeatSize);
- status_->set_error_arg(s);
- return false;
- }
- }
return true;
}
@@ -576,6 +515,13 @@
return PushRegexp(re);
}
+// Adds r to cc, along with r's upper case if foldascii is set.
+static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
+ cc->AddRange(r, r);
+ if (foldascii && 'a' <= r && r <= 'z')
+ cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
+}
+
// Processes a vertical bar in the input.
bool Regexp::ParseState::DoVerticalBar() {
MaybeConcatString(-1, NoParseFlags);
@@ -589,34 +535,46 @@
Regexp* r1;
Regexp* r2;
if ((r1 = stacktop_) != NULL &&
- (r2 = r1->down_) != NULL &&
+ (r2 = stacktop_->down_) != NULL &&
r2->op() == kVerticalBar) {
+ // If above and below vertical bar are literal or char class,
+ // can merge into a single char class.
Regexp* r3;
- if ((r3 = r2->down_) != NULL &&
- (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
- // AnyChar is above or below the vertical bar. Let it subsume
- // the other when the other is Literal, CharClass or AnyChar.
- if (r3->op() == kRegexpAnyChar &&
- (r1->op() == kRegexpLiteral ||
- r1->op() == kRegexpCharClass ||
- r1->op() == kRegexpAnyChar)) {
- // Discard r1.
- stacktop_ = r2;
- r1->Decref();
- return true;
+ if ((r1->op() == kRegexpLiteral ||
+ r1->op() == kRegexpCharClass ||
+ r1->op() == kRegexpAnyChar) &&
+ (r3 = r2->down_) != NULL) {
+ Rune rune;
+ switch (r3->op()) {
+ case kRegexpLiteral: // convert to char class
+ rune = r3->rune_;
+ r3->op_ = kRegexpCharClass;
+ r3->cc_ = NULL;
+ r3->ccb_ = new CharClassBuilder;
+ AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
+ // fall through
+ case kRegexpCharClass:
+ if (r1->op() == kRegexpLiteral)
+ AddLiteral(r3->ccb_, r1->rune_,
+ r1->parse_flags_ & Regexp::FoldCase);
+ else if (r1->op() == kRegexpCharClass)
+ r3->ccb_->AddCharClass(r1->ccb_);
+ if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
+ delete r3->ccb_;
+ r3->ccb_ = NULL;
+ r3->op_ = kRegexpAnyChar;
+ }
+ // fall through
+ case kRegexpAnyChar:
+ // pop r1
+ stacktop_ = r2;
+ r1->Decref();
+ return true;
+ default:
+ break;
}
- if (r1->op() == kRegexpAnyChar &&
- (r3->op() == kRegexpLiteral ||
- r3->op() == kRegexpCharClass ||
- r3->op() == kRegexpAnyChar)) {
- // Rearrange the stack and discard r3.
- r1->down_ = r3->down_;
- r2->down_ = r1;
- stacktop_ = r2;
- r3->Decref();
- return true;
- }
- }
+ }
+
// Swap r1 below vertical bar (r2).
r1->down_ = r2->down_;
r2->down_ = r1;
@@ -1147,7 +1105,7 @@
if (r >= 0) {
re1->op_ = kRegexpLiteral;
re1->rune_ = r;
- re1->parse_flags_ = static_cast<uint16>(flags);
+ re1->parse_flags_ = flags;
return true;
}
@@ -1230,14 +1188,6 @@
int n;
if (fullrune(sp->data(), sp->size())) {
n = chartorune(r, sp->data());
- // Some copies of chartorune have a bug that accepts
- // encodings of values in (10FFFF, 1FFFFF] as valid.
- // Those values break the character class algorithm,
- // which assumes Runemax is the largest rune.
- if (*r > Runemax) {
- n = 1;
- *r = Runeerror;
- }
if (!(n == 1 && *r == Runeerror)) { // no decoding error
sp->remove_prefix(n);
return n;
@@ -1340,8 +1290,6 @@
}
}
}
- if (code > rune_max)
- goto BadEscape;
*rp = code;
return true;
@@ -1427,8 +1375,7 @@
BadEscape:
// Unrecognized escape sequence.
status->set_code(kRegexpBadEscape);
- status->set_error_arg(
- StringPiece(begin, static_cast<int>(s->data() - begin)));
+ status->set_error_arg(StringPiece(begin, s->data() - begin));
return false;
}
@@ -1456,8 +1403,8 @@
}
// Look for a group with the given name.
-static const UGroup* LookupGroup(const StringPiece& name,
- const UGroup *groups, int ngroups) {
+static UGroup* LookupGroup(const StringPiece& name,
+ UGroup *groups, int ngroups) {
// Simple name lookup.
for (int i = 0; i < ngroups; i++)
if (StringPiece(groups[i].name) == name)
@@ -1471,16 +1418,16 @@
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
// Look for a POSIX group with the given name (e.g., "[:^alpha:]")
-static const UGroup* LookupPosixGroup(const StringPiece& name) {
+static UGroup* LookupPosixGroup(const StringPiece& name) {
return LookupGroup(name, posix_groups, num_posix_groups);
}
-static const UGroup* LookupPerlGroup(const StringPiece& name) {
+static UGroup* LookupPerlGroup(const StringPiece& name) {
return LookupGroup(name, perl_groups, num_perl_groups);
}
// Look for a Unicode group with the given name (e.g., "Han")
-static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
+static UGroup* LookupUnicodeGroup(const StringPiece& name) {
// Special case: "Any" means any.
if (name == StringPiece("Any"))
return &anygroup;
@@ -1488,7 +1435,7 @@
}
// Add a UGroup or its negation to the character class.
-static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
+static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign,
Regexp::ParseFlags parse_flags) {
if (sign == +1) {
for (int i = 0; i < g->nr16; i++) {
@@ -1539,7 +1486,7 @@
// On success, sets *s to span the remainder of the string
// and returns the corresponding UGroup.
// The StringPiece must *NOT* be edited unless the call succeeds.
-const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
+UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
if (!(parse_flags & Regexp::PerlClasses))
return NULL;
if (s->size() < 2 || (*s)[0] != '\\')
@@ -1547,7 +1494,7 @@
// Could use StringPieceToRune, but there aren't
// any non-ASCII Perl group names.
StringPiece name(s->begin(), 2);
- const UGroup *g = LookupPerlGroup(name);
+ UGroup *g = LookupPerlGroup(name);
if (g == NULL)
return NULL;
s->remove_prefix(name.size());
@@ -1587,10 +1534,10 @@
if (c != '{') {
// Name is the bit of string we just skipped over for c.
const char* p = seq.begin() + 2;
- name = StringPiece(p, static_cast<int>(s->begin() - p));
+ name = StringPiece(p, s->begin() - p);
} else {
// Name is in braces. Look for closing }
- size_t end = s->find('}', 0);
+ int end = s->find('}', 0);
if (end == s->npos) {
if (!IsValidUTF8(seq, status))
return kParseError;
@@ -1598,21 +1545,21 @@
status->set_error_arg(seq);
return kParseError;
}
- name = StringPiece(s->begin(), static_cast<int>(end)); // without '}'
- s->remove_prefix(static_cast<int>(end) + 1); // with '}'
+ name = StringPiece(s->begin(), end); // without '}'
+ s->remove_prefix(end + 1); // with '}'
if (!IsValidUTF8(name, status))
return kParseError;
}
// Chop seq where s now begins.
- seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin()));
+ seq = StringPiece(seq.begin(), s->begin() - seq.begin());
// Look up group
if (name.size() > 0 && name[0] == '^') {
sign = -sign;
name.remove_prefix(1); // '^'
}
- const UGroup *g = LookupUnicodeGroup(name);
+ UGroup *g = LookupUnicodeGroup(name);
if (g == NULL) {
status->set_code(kRegexpBadCharRange);
status->set_error_arg(seq);
@@ -1646,9 +1593,9 @@
// Got it. Check that it's valid.
q += 2;
- StringPiece name(p, static_cast<int>(q-p));
-
- const UGroup *g = LookupPosixGroup(name);
+ StringPiece name(p, q-p);
+
+ UGroup *g = LookupPosixGroup(name);
if (g == NULL) {
status->set_code(kRegexpBadCharRange);
status->set_error_arg(name);
@@ -1700,8 +1647,7 @@
return false;
if (rr->hi < rr->lo) {
status->set_code(kRegexpBadCharRange);
- status->set_error_arg(
- StringPiece(os.data(), static_cast<int>(s->data() - os.data())));
+ status->set_error_arg(StringPiece(os.data(), s->data() - os.data()));
return false;
}
} else {
@@ -1786,7 +1732,7 @@
}
// Look for Perl character class symbols (extension).
- const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
+ UGroup *g = MaybeParsePerlCCEscape(s, flags_);
if (g != NULL) {
AddUGroup(re->ccb_, g, g->sign, flags_);
continue;
@@ -1815,6 +1761,7 @@
if (negated)
re->ccb_->Negate();
+ re->ccb_->RemoveAbove(rune_max_);
*out_re = re;
return true;
@@ -1873,7 +1820,7 @@
// so that's the one we implement. One is enough.
if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
// Pull out name.
- size_t end = t.find('>', 2);
+ int end = t.find('>', 2);
if (end == t.npos) {
if (!IsValidUTF8(*s, status_))
return false;
@@ -1883,8 +1830,8 @@
}
// t is "P<name>...", t[end] == '>'
- StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>"
- StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name"
+ StringPiece capture(t.begin()-2, end+3); // "(?P<name>"
+ StringPiece name(t.begin()+2, end-2); // "name"
if (!IsValidUTF8(name, status_))
return false;
if (!IsValidCaptureName(name)) {
@@ -1898,7 +1845,7 @@
return false;
}
- s->remove_prefix(static_cast<int>(capture.end() - s->begin()));
+ s->remove_prefix(capture.end() - s->begin());
return true;
}
@@ -1981,8 +1928,7 @@
BadPerlOp:
status_->set_code(kRegexpBadPerlOp);
- status_->set_error_arg(
- StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin())));
+ status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin()));
return false;
}
@@ -2129,13 +2075,12 @@
// a** is a syntax error, not a double-star.
// (and a++ means something else entirely, which we don't support!)
status->set_code(kRegexpRepeatOp);
- status->set_error_arg(
- StringPiece(lastunary.begin(),
- static_cast<int>(t.begin() - lastunary.begin())));
+ status->set_error_arg(StringPiece(lastunary.begin(),
+ t.begin() - lastunary.begin()));
return NULL;
}
}
- opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));
+ opstr.set(opstr.data(), t.data() - opstr.data());
if (!ps.PushRepeatOp(op, opstr, nongreedy))
return NULL;
isunary = opstr;
@@ -2161,13 +2106,12 @@
if (lastunary.size() > 0) {
// Not allowed to stack repetition operators.
status->set_code(kRegexpRepeatOp);
- status->set_error_arg(
- StringPiece(lastunary.begin(),
- static_cast<int>(t.begin() - lastunary.begin())));
+ status->set_error_arg(StringPiece(lastunary.begin(),
+ t.begin() - lastunary.begin()));
return NULL;
}
}
- opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));
+ opstr.set(opstr.data(), t.data() - opstr.data());
if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
return NULL;
isunary = opstr;
@@ -2243,7 +2187,7 @@
}
}
- const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
+ UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
if (g != NULL) {
Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
re->ccb_ = new CharClassBuilder;
« no previous file with comments | « third_party/re2/re2/onepass.cc ('k') | third_party/re2/re2/perl_groups.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698