| Index: third_party/re2/re2/parse.cc
|
| diff --git a/third_party/re2/re2/parse.cc b/third_party/re2/re2/parse.cc
|
| index cf74f5a1abbe8b64d13e2dd326dba7336c427e25..0cf4ab411a9ce8853d5e60ebb06126b72424ca93 100644
|
| --- a/third_party/re2/re2/parse.cc
|
| +++ b/third_party/re2/re2/parse.cc
|
| @@ -21,7 +21,6 @@
|
| #include "re2/stringpiece.h"
|
| #include "re2/unicode_casefold.h"
|
| #include "re2/unicode_groups.h"
|
| -#include "re2/walker-inl.h"
|
|
|
| namespace re2 {
|
|
|
| @@ -157,7 +156,7 @@
|
| int ncap_; // number of capturing parens seen
|
| int rune_max_; // maximum char value for this encoding
|
|
|
| - DISALLOW_COPY_AND_ASSIGN(ParseState);
|
| + DISALLOW_EVIL_CONSTRUCTORS(ParseState);
|
| };
|
|
|
| // Pseudo-operators - only on parse stack.
|
| @@ -215,8 +214,7 @@
|
| // single characters (e.g., [.] instead of \.), and some
|
| // analysis does better with fewer character classes.
|
| // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.
|
| - if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {
|
| - re->ccb_->RemoveAbove(rune_max_);
|
| + if (re->op_ == kRegexpCharClass) {
|
| if (re->ccb_->size() == 1) {
|
| Rune r = re->ccb_->begin()->lo;
|
| re->Decref();
|
| @@ -242,8 +240,8 @@
|
| // Searches the case folding tables and returns the CaseFold* that contains r.
|
| // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.
|
| // If there isn't one, returns NULL.
|
| -const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {
|
| - const CaseFold* ef = f + n;
|
| +CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) {
|
| + CaseFold* ef = f + n;
|
|
|
| // Binary search for entry containing r.
|
| while (n > 0) {
|
| @@ -270,7 +268,7 @@
|
| }
|
|
|
| // Returns the result of applying the fold f to the rune r.
|
| -Rune ApplyFold(const CaseFold *f, Rune r) {
|
| +Rune ApplyFold(CaseFold *f, Rune r) {
|
| switch (f->delta) {
|
| default:
|
| return r + f->delta;
|
| @@ -306,7 +304,7 @@
|
| //
|
| // CycleFoldRune('?') = '?'
|
| Rune CycleFoldRune(Rune r) {
|
| - const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
|
| + CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);
|
| if (f == NULL || r < f->lo)
|
| return r;
|
| return ApplyFold(f, r);
|
| @@ -329,7 +327,7 @@
|
| return;
|
|
|
| while (lo <= hi) {
|
| - const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
|
| + CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo);
|
| if (f == NULL) // lo has no fold, nor does anything above lo
|
| break;
|
| if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo
|
| @@ -379,6 +377,7 @@
|
| }
|
| r = CycleFoldRune(r);
|
| } while (r != r1);
|
| + re->ccb_->RemoveAbove(rune_max_);
|
| return PushRegexp(re);
|
| }
|
|
|
| @@ -464,59 +463,6 @@
|
| return true;
|
| }
|
|
|
| -// RepetitionWalker reports whether the repetition regexp is valid.
|
| -// Valid means that the combination of the top-level repetition
|
| -// and any inner repetitions does not exceed n copies of the
|
| -// innermost thing.
|
| -// This rewalks the regexp tree and is called for every repetition,
|
| -// so we have to worry about inducing quadratic behavior in the parser.
|
| -// We avoid this by only using RepetitionWalker when min or max >= 2.
|
| -// In that case the depth of any >= 2 nesting can only get to 9 without
|
| -// triggering a parse error, so each subtree can only be rewalked 9 times.
|
| -class RepetitionWalker : public Regexp::Walker<int> {
|
| - public:
|
| - RepetitionWalker() {}
|
| - virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);
|
| - virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
| - int* child_args, int nchild_args);
|
| - virtual int ShortVisit(Regexp* re, int parent_arg);
|
| -
|
| - private:
|
| - DISALLOW_COPY_AND_ASSIGN(RepetitionWalker);
|
| -};
|
| -
|
| -int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {
|
| - int arg = parent_arg;
|
| - if (re->op() == kRegexpRepeat) {
|
| - int m = re->max();
|
| - if (m < 0) {
|
| - m = re->min();
|
| - }
|
| - if (m > 0) {
|
| - arg /= m;
|
| - }
|
| - }
|
| - return arg;
|
| -}
|
| -
|
| -int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,
|
| - int* child_args, int nchild_args) {
|
| - int arg = pre_arg;
|
| - for (int i = 0; i < nchild_args; i++) {
|
| - if (child_args[i] < arg) {
|
| - arg = child_args[i];
|
| - }
|
| - }
|
| - return arg;
|
| -}
|
| -
|
| -int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {
|
| - // This should never be called, since we use Walk and not
|
| - // WalkExponential.
|
| - LOG(DFATAL) << "RepetitionWalker::ShortVisit called";
|
| - return 0;
|
| -}
|
| -
|
| // Pushes a repetition regexp onto the stack.
|
| // A valid argument for the operator must already be on the stack.
|
| bool Regexp::ParseState::PushRepetition(int min, int max,
|
| @@ -542,15 +488,8 @@
|
| re->down_ = stacktop_->down_;
|
| re->sub()[0] = FinishRegexp(stacktop_);
|
| re->simple_ = re->ComputeSimple();
|
| +
|
| stacktop_ = re;
|
| - if (min >= 2 || max >= 2) {
|
| - RepetitionWalker w;
|
| - if (w.Walk(stacktop_, 1000) == 0) {
|
| - status_->set_code(kRegexpRepeatSize);
|
| - status_->set_error_arg(s);
|
| - return false;
|
| - }
|
| - }
|
| return true;
|
| }
|
|
|
| @@ -576,6 +515,13 @@
|
| return PushRegexp(re);
|
| }
|
|
|
| +// Adds r to cc, along with r's upper case if foldascii is set.
|
| +static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) {
|
| + cc->AddRange(r, r);
|
| + if (foldascii && 'a' <= r && r <= 'z')
|
| + cc->AddRange(r + 'A' - 'a', r + 'A' - 'a');
|
| +}
|
| +
|
| // Processes a vertical bar in the input.
|
| bool Regexp::ParseState::DoVerticalBar() {
|
| MaybeConcatString(-1, NoParseFlags);
|
| @@ -589,34 +535,46 @@
|
| Regexp* r1;
|
| Regexp* r2;
|
| if ((r1 = stacktop_) != NULL &&
|
| - (r2 = r1->down_) != NULL &&
|
| + (r2 = stacktop_->down_) != NULL &&
|
| r2->op() == kVerticalBar) {
|
| + // If above and below vertical bar are literal or char class,
|
| + // can merge into a single char class.
|
| Regexp* r3;
|
| - if ((r3 = r2->down_) != NULL &&
|
| - (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) {
|
| - // AnyChar is above or below the vertical bar. Let it subsume
|
| - // the other when the other is Literal, CharClass or AnyChar.
|
| - if (r3->op() == kRegexpAnyChar &&
|
| - (r1->op() == kRegexpLiteral ||
|
| - r1->op() == kRegexpCharClass ||
|
| - r1->op() == kRegexpAnyChar)) {
|
| - // Discard r1.
|
| - stacktop_ = r2;
|
| - r1->Decref();
|
| - return true;
|
| + if ((r1->op() == kRegexpLiteral ||
|
| + r1->op() == kRegexpCharClass ||
|
| + r1->op() == kRegexpAnyChar) &&
|
| + (r3 = r2->down_) != NULL) {
|
| + Rune rune;
|
| + switch (r3->op()) {
|
| + case kRegexpLiteral: // convert to char class
|
| + rune = r3->rune_;
|
| + r3->op_ = kRegexpCharClass;
|
| + r3->cc_ = NULL;
|
| + r3->ccb_ = new CharClassBuilder;
|
| + AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase);
|
| + // fall through
|
| + case kRegexpCharClass:
|
| + if (r1->op() == kRegexpLiteral)
|
| + AddLiteral(r3->ccb_, r1->rune_,
|
| + r1->parse_flags_ & Regexp::FoldCase);
|
| + else if (r1->op() == kRegexpCharClass)
|
| + r3->ccb_->AddCharClass(r1->ccb_);
|
| + if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) {
|
| + delete r3->ccb_;
|
| + r3->ccb_ = NULL;
|
| + r3->op_ = kRegexpAnyChar;
|
| + }
|
| + // fall through
|
| + case kRegexpAnyChar:
|
| + // pop r1
|
| + stacktop_ = r2;
|
| + r1->Decref();
|
| + return true;
|
| + default:
|
| + break;
|
| }
|
| - if (r1->op() == kRegexpAnyChar &&
|
| - (r3->op() == kRegexpLiteral ||
|
| - r3->op() == kRegexpCharClass ||
|
| - r3->op() == kRegexpAnyChar)) {
|
| - // Rearrange the stack and discard r3.
|
| - r1->down_ = r3->down_;
|
| - r2->down_ = r1;
|
| - stacktop_ = r2;
|
| - r3->Decref();
|
| - return true;
|
| - }
|
| - }
|
| + }
|
| +
|
| // Swap r1 below vertical bar (r2).
|
| r1->down_ = r2->down_;
|
| r2->down_ = r1;
|
| @@ -1147,7 +1105,7 @@
|
| if (r >= 0) {
|
| re1->op_ = kRegexpLiteral;
|
| re1->rune_ = r;
|
| - re1->parse_flags_ = static_cast<uint16>(flags);
|
| + re1->parse_flags_ = flags;
|
| return true;
|
| }
|
|
|
| @@ -1230,14 +1188,6 @@
|
| int n;
|
| if (fullrune(sp->data(), sp->size())) {
|
| n = chartorune(r, sp->data());
|
| - // Some copies of chartorune have a bug that accepts
|
| - // encodings of values in (10FFFF, 1FFFFF] as valid.
|
| - // Those values break the character class algorithm,
|
| - // which assumes Runemax is the largest rune.
|
| - if (*r > Runemax) {
|
| - n = 1;
|
| - *r = Runeerror;
|
| - }
|
| if (!(n == 1 && *r == Runeerror)) { // no decoding error
|
| sp->remove_prefix(n);
|
| return n;
|
| @@ -1340,8 +1290,6 @@
|
| }
|
| }
|
| }
|
| - if (code > rune_max)
|
| - goto BadEscape;
|
| *rp = code;
|
| return true;
|
|
|
| @@ -1427,8 +1375,7 @@
|
| BadEscape:
|
| // Unrecognized escape sequence.
|
| status->set_code(kRegexpBadEscape);
|
| - status->set_error_arg(
|
| - StringPiece(begin, static_cast<int>(s->data() - begin)));
|
| + status->set_error_arg(StringPiece(begin, s->data() - begin));
|
| return false;
|
| }
|
|
|
| @@ -1456,8 +1403,8 @@
|
| }
|
|
|
| // Look for a group with the given name.
|
| -static const UGroup* LookupGroup(const StringPiece& name,
|
| - const UGroup *groups, int ngroups) {
|
| +static UGroup* LookupGroup(const StringPiece& name,
|
| + UGroup *groups, int ngroups) {
|
| // Simple name lookup.
|
| for (int i = 0; i < ngroups; i++)
|
| if (StringPiece(groups[i].name) == name)
|
| @@ -1471,16 +1418,16 @@
|
| static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };
|
|
|
| // Look for a POSIX group with the given name (e.g., "[:^alpha:]")
|
| -static const UGroup* LookupPosixGroup(const StringPiece& name) {
|
| +static UGroup* LookupPosixGroup(const StringPiece& name) {
|
| return LookupGroup(name, posix_groups, num_posix_groups);
|
| }
|
|
|
| -static const UGroup* LookupPerlGroup(const StringPiece& name) {
|
| +static UGroup* LookupPerlGroup(const StringPiece& name) {
|
| return LookupGroup(name, perl_groups, num_perl_groups);
|
| }
|
|
|
| // Look for a Unicode group with the given name (e.g., "Han")
|
| -static const UGroup* LookupUnicodeGroup(const StringPiece& name) {
|
| +static UGroup* LookupUnicodeGroup(const StringPiece& name) {
|
| // Special case: "Any" means any.
|
| if (name == StringPiece("Any"))
|
| return &anygroup;
|
| @@ -1488,7 +1435,7 @@
|
| }
|
|
|
| // Add a UGroup or its negation to the character class.
|
| -static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign,
|
| +static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign,
|
| Regexp::ParseFlags parse_flags) {
|
| if (sign == +1) {
|
| for (int i = 0; i < g->nr16; i++) {
|
| @@ -1539,7 +1486,7 @@
|
| // On success, sets *s to span the remainder of the string
|
| // and returns the corresponding UGroup.
|
| // The StringPiece must *NOT* be edited unless the call succeeds.
|
| -const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
|
| +UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) {
|
| if (!(parse_flags & Regexp::PerlClasses))
|
| return NULL;
|
| if (s->size() < 2 || (*s)[0] != '\\')
|
| @@ -1547,7 +1494,7 @@
|
| // Could use StringPieceToRune, but there aren't
|
| // any non-ASCII Perl group names.
|
| StringPiece name(s->begin(), 2);
|
| - const UGroup *g = LookupPerlGroup(name);
|
| + UGroup *g = LookupPerlGroup(name);
|
| if (g == NULL)
|
| return NULL;
|
| s->remove_prefix(name.size());
|
| @@ -1587,10 +1534,10 @@
|
| if (c != '{') {
|
| // Name is the bit of string we just skipped over for c.
|
| const char* p = seq.begin() + 2;
|
| - name = StringPiece(p, static_cast<int>(s->begin() - p));
|
| + name = StringPiece(p, s->begin() - p);
|
| } else {
|
| // Name is in braces. Look for closing }
|
| - size_t end = s->find('}', 0);
|
| + int end = s->find('}', 0);
|
| if (end == s->npos) {
|
| if (!IsValidUTF8(seq, status))
|
| return kParseError;
|
| @@ -1598,21 +1545,21 @@
|
| status->set_error_arg(seq);
|
| return kParseError;
|
| }
|
| - name = StringPiece(s->begin(), static_cast<int>(end)); // without '}'
|
| - s->remove_prefix(static_cast<int>(end) + 1); // with '}'
|
| + name = StringPiece(s->begin(), end); // without '}'
|
| + s->remove_prefix(end + 1); // with '}'
|
| if (!IsValidUTF8(name, status))
|
| return kParseError;
|
| }
|
|
|
| // Chop seq where s now begins.
|
| - seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin()));
|
| + seq = StringPiece(seq.begin(), s->begin() - seq.begin());
|
|
|
| // Look up group
|
| if (name.size() > 0 && name[0] == '^') {
|
| sign = -sign;
|
| name.remove_prefix(1); // '^'
|
| }
|
| - const UGroup *g = LookupUnicodeGroup(name);
|
| + UGroup *g = LookupUnicodeGroup(name);
|
| if (g == NULL) {
|
| status->set_code(kRegexpBadCharRange);
|
| status->set_error_arg(seq);
|
| @@ -1646,9 +1593,9 @@
|
|
|
| // Got it. Check that it's valid.
|
| q += 2;
|
| - StringPiece name(p, static_cast<int>(q-p));
|
| -
|
| - const UGroup *g = LookupPosixGroup(name);
|
| + StringPiece name(p, q-p);
|
| +
|
| + UGroup *g = LookupPosixGroup(name);
|
| if (g == NULL) {
|
| status->set_code(kRegexpBadCharRange);
|
| status->set_error_arg(name);
|
| @@ -1700,8 +1647,7 @@
|
| return false;
|
| if (rr->hi < rr->lo) {
|
| status->set_code(kRegexpBadCharRange);
|
| - status->set_error_arg(
|
| - StringPiece(os.data(), static_cast<int>(s->data() - os.data())));
|
| + status->set_error_arg(StringPiece(os.data(), s->data() - os.data()));
|
| return false;
|
| }
|
| } else {
|
| @@ -1786,7 +1732,7 @@
|
| }
|
|
|
| // Look for Perl character class symbols (extension).
|
| - const UGroup *g = MaybeParsePerlCCEscape(s, flags_);
|
| + UGroup *g = MaybeParsePerlCCEscape(s, flags_);
|
| if (g != NULL) {
|
| AddUGroup(re->ccb_, g, g->sign, flags_);
|
| continue;
|
| @@ -1815,6 +1761,7 @@
|
|
|
| if (negated)
|
| re->ccb_->Negate();
|
| + re->ccb_->RemoveAbove(rune_max_);
|
|
|
| *out_re = re;
|
| return true;
|
| @@ -1873,7 +1820,7 @@
|
| // so that's the one we implement. One is enough.
|
| if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {
|
| // Pull out name.
|
| - size_t end = t.find('>', 2);
|
| + int end = t.find('>', 2);
|
| if (end == t.npos) {
|
| if (!IsValidUTF8(*s, status_))
|
| return false;
|
| @@ -1883,8 +1830,8 @@
|
| }
|
|
|
| // t is "P<name>...", t[end] == '>'
|
| - StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>"
|
| - StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name"
|
| + StringPiece capture(t.begin()-2, end+3); // "(?P<name>"
|
| + StringPiece name(t.begin()+2, end-2); // "name"
|
| if (!IsValidUTF8(name, status_))
|
| return false;
|
| if (!IsValidCaptureName(name)) {
|
| @@ -1898,7 +1845,7 @@
|
| return false;
|
| }
|
|
|
| - s->remove_prefix(static_cast<int>(capture.end() - s->begin()));
|
| + s->remove_prefix(capture.end() - s->begin());
|
| return true;
|
| }
|
|
|
| @@ -1981,8 +1928,7 @@
|
|
|
| BadPerlOp:
|
| status_->set_code(kRegexpBadPerlOp);
|
| - status_->set_error_arg(
|
| - StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin())));
|
| + status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin()));
|
| return false;
|
| }
|
|
|
| @@ -2129,13 +2075,12 @@
|
| // a** is a syntax error, not a double-star.
|
| // (and a++ means something else entirely, which we don't support!)
|
| status->set_code(kRegexpRepeatOp);
|
| - status->set_error_arg(
|
| - StringPiece(lastunary.begin(),
|
| - static_cast<int>(t.begin() - lastunary.begin())));
|
| + status->set_error_arg(StringPiece(lastunary.begin(),
|
| + t.begin() - lastunary.begin()));
|
| return NULL;
|
| }
|
| }
|
| - opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));
|
| + opstr.set(opstr.data(), t.data() - opstr.data());
|
| if (!ps.PushRepeatOp(op, opstr, nongreedy))
|
| return NULL;
|
| isunary = opstr;
|
| @@ -2161,13 +2106,12 @@
|
| if (lastunary.size() > 0) {
|
| // Not allowed to stack repetition operators.
|
| status->set_code(kRegexpRepeatOp);
|
| - status->set_error_arg(
|
| - StringPiece(lastunary.begin(),
|
| - static_cast<int>(t.begin() - lastunary.begin())));
|
| + status->set_error_arg(StringPiece(lastunary.begin(),
|
| + t.begin() - lastunary.begin()));
|
| return NULL;
|
| }
|
| }
|
| - opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));
|
| + opstr.set(opstr.data(), t.data() - opstr.data());
|
| if (!ps.PushRepetition(lo, hi, opstr, nongreedy))
|
| return NULL;
|
| isunary = opstr;
|
| @@ -2243,7 +2187,7 @@
|
| }
|
| }
|
|
|
| - const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
|
| + UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());
|
| if (g != NULL) {
|
| Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);
|
| re->ccb_ = new CharClassBuilder;
|
|
|