Index: third_party/re2/re2/parse.cc |
diff --git a/third_party/re2/re2/parse.cc b/third_party/re2/re2/parse.cc |
index 0cf4ab411a9ce8853d5e60ebb06126b72424ca93..cf74f5a1abbe8b64d13e2dd326dba7336c427e25 100644 |
--- a/third_party/re2/re2/parse.cc |
+++ b/third_party/re2/re2/parse.cc |
@@ -21,6 +21,7 @@ |
#include "re2/stringpiece.h" |
#include "re2/unicode_casefold.h" |
#include "re2/unicode_groups.h" |
+#include "re2/walker-inl.h" |
namespace re2 { |
@@ -156,7 +157,7 @@ private: |
int ncap_; // number of capturing parens seen |
int rune_max_; // maximum char value for this encoding |
- DISALLOW_EVIL_CONSTRUCTORS(ParseState); |
+ DISALLOW_COPY_AND_ASSIGN(ParseState); |
}; |
// Pseudo-operators - only on parse stack. |
@@ -214,7 +215,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { |
// single characters (e.g., [.] instead of \.), and some |
// analysis does better with fewer character classes. |
// Similarly, [Aa] can be rewritten as a literal A with ASCII case folding. |
- if (re->op_ == kRegexpCharClass) { |
+ if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) { |
+ re->ccb_->RemoveAbove(rune_max_); |
if (re->ccb_->size() == 1) { |
Rune r = re->ccb_->begin()->lo; |
re->Decref(); |
@@ -240,8 +242,8 @@ bool Regexp::ParseState::PushRegexp(Regexp* re) { |
// Searches the case folding tables and returns the CaseFold* that contains r. |
// If there isn't one, returns the CaseFold* with smallest f->lo bigger than r. |
// If there isn't one, returns NULL. |
-CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { |
- CaseFold* ef = f + n; |
+const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) { |
+ const CaseFold* ef = f + n; |
// Binary search for entry containing r. |
while (n > 0) { |
@@ -268,7 +270,7 @@ CaseFold* LookupCaseFold(CaseFold *f, int n, Rune r) { |
} |
// Returns the result of applying the fold f to the rune r. |
-Rune ApplyFold(CaseFold *f, Rune r) { |
+Rune ApplyFold(const CaseFold *f, Rune r) { |
switch (f->delta) { |
default: |
return r + f->delta; |
@@ -304,7 +306,7 @@ Rune ApplyFold(CaseFold *f, Rune r) { |
// |
// CycleFoldRune('?') = '?' |
Rune CycleFoldRune(Rune r) { |
- CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); |
+ const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r); |
if (f == NULL || r < f->lo) |
return r; |
return ApplyFold(f, r); |
@@ -327,7 +329,7 @@ static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) { |
return; |
while (lo <= hi) { |
- CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); |
+ const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, lo); |
if (f == NULL) // lo has no fold, nor does anything above lo |
break; |
if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo |
@@ -377,7 +379,6 @@ bool Regexp::ParseState::PushLiteral(Rune r) { |
} |
r = CycleFoldRune(r); |
} while (r != r1); |
- re->ccb_->RemoveAbove(rune_max_); |
return PushRegexp(re); |
} |
@@ -463,6 +464,59 @@ bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s, |
return true; |
} |
+// RepetitionWalker reports whether the repetition regexp is valid. |
+// Valid means that the combination of the top-level repetition |
+// and any inner repetitions does not exceed n copies of the |
+// innermost thing. |
+// This rewalks the regexp tree and is called for every repetition, |
+// so we have to worry about inducing quadratic behavior in the parser. |
+// We avoid this by only using RepetitionWalker when min or max >= 2. |
+// In that case the depth of any >= 2 nesting can only get to 9 without |
+// triggering a parse error, so each subtree can only be rewalked 9 times. |
+class RepetitionWalker : public Regexp::Walker<int> { |
+ public: |
+ RepetitionWalker() {} |
+ virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); |
+ virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, |
+ int* child_args, int nchild_args); |
+ virtual int ShortVisit(Regexp* re, int parent_arg); |
+ |
+ private: |
+ DISALLOW_COPY_AND_ASSIGN(RepetitionWalker); |
+}; |
+ |
+int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { |
+ int arg = parent_arg; |
+ if (re->op() == kRegexpRepeat) { |
+ int m = re->max(); |
+ if (m < 0) { |
+ m = re->min(); |
+ } |
+ if (m > 0) { |
+ arg /= m; |
+ } |
+ } |
+ return arg; |
+} |
+ |
+int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, |
+ int* child_args, int nchild_args) { |
+ int arg = pre_arg; |
+ for (int i = 0; i < nchild_args; i++) { |
+ if (child_args[i] < arg) { |
+ arg = child_args[i]; |
+ } |
+ } |
+ return arg; |
+} |
+ |
+int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) { |
+ // This should never be called, since we use Walk and not |
+ // WalkExponential. |
+ LOG(DFATAL) << "RepetitionWalker::ShortVisit called"; |
+ return 0; |
+} |
+ |
// Pushes a repetition regexp onto the stack. |
// A valid argument for the operator must already be on the stack. |
bool Regexp::ParseState::PushRepetition(int min, int max, |
@@ -488,8 +542,15 @@ bool Regexp::ParseState::PushRepetition(int min, int max, |
re->down_ = stacktop_->down_; |
re->sub()[0] = FinishRegexp(stacktop_); |
re->simple_ = re->ComputeSimple(); |
- |
stacktop_ = re; |
+ if (min >= 2 || max >= 2) { |
+ RepetitionWalker w; |
+ if (w.Walk(stacktop_, 1000) == 0) { |
+ status_->set_code(kRegexpRepeatSize); |
+ status_->set_error_arg(s); |
+ return false; |
+ } |
+ } |
return true; |
} |
@@ -515,13 +576,6 @@ bool Regexp::ParseState::DoLeftParenNoCapture() { |
return PushRegexp(re); |
} |
-// Adds r to cc, along with r's upper case if foldascii is set. |
-static void AddLiteral(CharClassBuilder* cc, Rune r, bool foldascii) { |
- cc->AddRange(r, r); |
- if (foldascii && 'a' <= r && r <= 'z') |
- cc->AddRange(r + 'A' - 'a', r + 'A' - 'a'); |
-} |
- |
// Processes a vertical bar in the input. |
bool Regexp::ParseState::DoVerticalBar() { |
MaybeConcatString(-1, NoParseFlags); |
@@ -535,46 +589,34 @@ bool Regexp::ParseState::DoVerticalBar() { |
Regexp* r1; |
Regexp* r2; |
if ((r1 = stacktop_) != NULL && |
- (r2 = stacktop_->down_) != NULL && |
+ (r2 = r1->down_) != NULL && |
r2->op() == kVerticalBar) { |
- // If above and below vertical bar are literal or char class, |
- // can merge into a single char class. |
Regexp* r3; |
- if ((r1->op() == kRegexpLiteral || |
- r1->op() == kRegexpCharClass || |
- r1->op() == kRegexpAnyChar) && |
- (r3 = r2->down_) != NULL) { |
- Rune rune; |
- switch (r3->op()) { |
- case kRegexpLiteral: // convert to char class |
- rune = r3->rune_; |
- r3->op_ = kRegexpCharClass; |
- r3->cc_ = NULL; |
- r3->ccb_ = new CharClassBuilder; |
- AddLiteral(r3->ccb_, rune, r3->parse_flags_ & Regexp::FoldCase); |
- // fall through |
- case kRegexpCharClass: |
- if (r1->op() == kRegexpLiteral) |
- AddLiteral(r3->ccb_, r1->rune_, |
- r1->parse_flags_ & Regexp::FoldCase); |
- else if (r1->op() == kRegexpCharClass) |
- r3->ccb_->AddCharClass(r1->ccb_); |
- if (r1->op() == kRegexpAnyChar || r3->ccb_->full()) { |
- delete r3->ccb_; |
- r3->ccb_ = NULL; |
- r3->op_ = kRegexpAnyChar; |
- } |
- // fall through |
- case kRegexpAnyChar: |
- // pop r1 |
- stacktop_ = r2; |
- r1->Decref(); |
- return true; |
- default: |
- break; |
+ if ((r3 = r2->down_) != NULL && |
+ (r1->op() == kRegexpAnyChar || r3->op() == kRegexpAnyChar)) { |
+ // AnyChar is above or below the vertical bar. Let it subsume |
+ // the other when the other is Literal, CharClass or AnyChar. |
+ if (r3->op() == kRegexpAnyChar && |
+ (r1->op() == kRegexpLiteral || |
+ r1->op() == kRegexpCharClass || |
+ r1->op() == kRegexpAnyChar)) { |
+ // Discard r1. |
+ stacktop_ = r2; |
+ r1->Decref(); |
+ return true; |
+ } |
+ if (r1->op() == kRegexpAnyChar && |
+ (r3->op() == kRegexpLiteral || |
+ r3->op() == kRegexpCharClass || |
+ r3->op() == kRegexpAnyChar)) { |
+ // Rearrange the stack and discard r3. |
+ r1->down_ = r3->down_; |
+ r2->down_ = r1; |
+ stacktop_ = r2; |
+ r3->Decref(); |
+ return true; |
} |
} |
- |
// Swap r1 below vertical bar (r2). |
r1->down_ = r2->down_; |
r2->down_ = r1; |
@@ -1105,7 +1147,7 @@ bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) { |
if (r >= 0) { |
re1->op_ = kRegexpLiteral; |
re1->rune_ = r; |
- re1->parse_flags_ = flags; |
+ re1->parse_flags_ = static_cast<uint16>(flags); |
return true; |
} |
@@ -1188,6 +1230,14 @@ static int StringPieceToRune(Rune *r, StringPiece *sp, RegexpStatus* status) { |
int n; |
if (fullrune(sp->data(), sp->size())) { |
n = chartorune(r, sp->data()); |
+ // Some copies of chartorune have a bug that accepts |
+ // encodings of values in (10FFFF, 1FFFFF] as valid. |
+ // Those values break the character class algorithm, |
+ // which assumes Runemax is the largest rune. |
+ if (*r > Runemax) { |
+ n = 1; |
+ *r = Runeerror; |
+ } |
if (!(n == 1 && *r == Runeerror)) { // no decoding error |
sp->remove_prefix(n); |
return n; |
@@ -1290,6 +1340,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, |
} |
} |
} |
+ if (code > rune_max) |
+ goto BadEscape; |
*rp = code; |
return true; |
@@ -1375,7 +1427,8 @@ static bool ParseEscape(StringPiece* s, Rune* rp, |
BadEscape: |
// Unrecognized escape sequence. |
status->set_code(kRegexpBadEscape); |
- status->set_error_arg(StringPiece(begin, s->data() - begin)); |
+ status->set_error_arg( |
+ StringPiece(begin, static_cast<int>(s->data() - begin))); |
return false; |
} |
@@ -1403,8 +1456,8 @@ void CharClassBuilder::AddRangeFlags( |
} |
// Look for a group with the given name. |
-static UGroup* LookupGroup(const StringPiece& name, |
- UGroup *groups, int ngroups) { |
+static const UGroup* LookupGroup(const StringPiece& name, |
+ const UGroup *groups, int ngroups) { |
// Simple name lookup. |
for (int i = 0; i < ngroups; i++) |
if (StringPiece(groups[i].name) == name) |
@@ -1418,16 +1471,16 @@ static URange32 any32[] = { { 65536, Runemax } }; |
static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 }; |
// Look for a POSIX group with the given name (e.g., "[:^alpha:]") |
-static UGroup* LookupPosixGroup(const StringPiece& name) { |
+static const UGroup* LookupPosixGroup(const StringPiece& name) { |
return LookupGroup(name, posix_groups, num_posix_groups); |
} |
-static UGroup* LookupPerlGroup(const StringPiece& name) { |
+static const UGroup* LookupPerlGroup(const StringPiece& name) { |
return LookupGroup(name, perl_groups, num_perl_groups); |
} |
// Look for a Unicode group with the given name (e.g., "Han") |
-static UGroup* LookupUnicodeGroup(const StringPiece& name) { |
+static const UGroup* LookupUnicodeGroup(const StringPiece& name) { |
// Special case: "Any" means any. |
if (name == StringPiece("Any")) |
return &anygroup; |
@@ -1435,7 +1488,7 @@ static UGroup* LookupUnicodeGroup(const StringPiece& name) { |
} |
// Add a UGroup or its negation to the character class. |
-static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, |
+static void AddUGroup(CharClassBuilder *cc, const UGroup *g, int sign, |
Regexp::ParseFlags parse_flags) { |
if (sign == +1) { |
for (int i = 0; i < g->nr16; i++) { |
@@ -1486,7 +1539,7 @@ static void AddUGroup(CharClassBuilder *cc, UGroup *g, int sign, |
// On success, sets *s to span the remainder of the string |
// and returns the corresponding UGroup. |
// The StringPiece must *NOT* be edited unless the call succeeds. |
-UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { |
+const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { |
if (!(parse_flags & Regexp::PerlClasses)) |
return NULL; |
if (s->size() < 2 || (*s)[0] != '\\') |
@@ -1494,7 +1547,7 @@ UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_flags) { |
// Could use StringPieceToRune, but there aren't |
// any non-ASCII Perl group names. |
StringPiece name(s->begin(), 2); |
- UGroup *g = LookupPerlGroup(name); |
+ const UGroup *g = LookupPerlGroup(name); |
if (g == NULL) |
return NULL; |
s->remove_prefix(name.size()); |
@@ -1534,10 +1587,10 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, |
if (c != '{') { |
// Name is the bit of string we just skipped over for c. |
const char* p = seq.begin() + 2; |
- name = StringPiece(p, s->begin() - p); |
+ name = StringPiece(p, static_cast<int>(s->begin() - p)); |
} else { |
// Name is in braces. Look for closing } |
- int end = s->find('}', 0); |
+ size_t end = s->find('}', 0); |
if (end == s->npos) { |
if (!IsValidUTF8(seq, status)) |
return kParseError; |
@@ -1545,21 +1598,21 @@ ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags, |
status->set_error_arg(seq); |
return kParseError; |
} |
- name = StringPiece(s->begin(), end); // without '}' |
- s->remove_prefix(end + 1); // with '}' |
+ name = StringPiece(s->begin(), static_cast<int>(end)); // without '}' |
+ s->remove_prefix(static_cast<int>(end) + 1); // with '}' |
if (!IsValidUTF8(name, status)) |
return kParseError; |
} |
// Chop seq where s now begins. |
- seq = StringPiece(seq.begin(), s->begin() - seq.begin()); |
+ seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin())); |
// Look up group |
if (name.size() > 0 && name[0] == '^') { |
sign = -sign; |
name.remove_prefix(1); // '^' |
} |
- UGroup *g = LookupUnicodeGroup(name); |
+ const UGroup *g = LookupUnicodeGroup(name); |
if (g == NULL) { |
status->set_code(kRegexpBadCharRange); |
status->set_error_arg(seq); |
@@ -1593,9 +1646,9 @@ static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags, |
// Got it. Check that it's valid. |
q += 2; |
- StringPiece name(p, q-p); |
+ StringPiece name(p, static_cast<int>(q-p)); |
- UGroup *g = LookupPosixGroup(name); |
+ const UGroup *g = LookupPosixGroup(name); |
if (g == NULL) { |
status->set_code(kRegexpBadCharRange); |
status->set_error_arg(name); |
@@ -1647,7 +1700,8 @@ bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr, |
return false; |
if (rr->hi < rr->lo) { |
status->set_code(kRegexpBadCharRange); |
- status->set_error_arg(StringPiece(os.data(), s->data() - os.data())); |
+ status->set_error_arg( |
+ StringPiece(os.data(), static_cast<int>(s->data() - os.data()))); |
return false; |
} |
} else { |
@@ -1732,7 +1786,7 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, |
} |
// Look for Perl character class symbols (extension). |
- UGroup *g = MaybeParsePerlCCEscape(s, flags_); |
+ const UGroup *g = MaybeParsePerlCCEscape(s, flags_); |
if (g != NULL) { |
AddUGroup(re->ccb_, g, g->sign, flags_); |
continue; |
@@ -1761,7 +1815,6 @@ bool Regexp::ParseState::ParseCharClass(StringPiece* s, |
if (negated) |
re->ccb_->Negate(); |
- re->ccb_->RemoveAbove(rune_max_); |
*out_re = re; |
return true; |
@@ -1820,7 +1873,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { |
// so that's the one we implement. One is enough. |
if (t.size() > 2 && t[0] == 'P' && t[1] == '<') { |
// Pull out name. |
- int end = t.find('>', 2); |
+ size_t end = t.find('>', 2); |
if (end == t.npos) { |
if (!IsValidUTF8(*s, status_)) |
return false; |
@@ -1830,8 +1883,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { |
} |
// t is "P<name>...", t[end] == '>' |
- StringPiece capture(t.begin()-2, end+3); // "(?P<name>" |
- StringPiece name(t.begin()+2, end-2); // "name" |
+ StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>" |
+ StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name" |
if (!IsValidUTF8(name, status_)) |
return false; |
if (!IsValidCaptureName(name)) { |
@@ -1845,7 +1898,7 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { |
return false; |
} |
- s->remove_prefix(capture.end() - s->begin()); |
+ s->remove_prefix(static_cast<int>(capture.end() - s->begin())); |
return true; |
} |
@@ -1928,7 +1981,8 @@ bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) { |
BadPerlOp: |
status_->set_code(kRegexpBadPerlOp); |
- status_->set_error_arg(StringPiece(s->begin(), t.begin() - s->begin())); |
+ status_->set_error_arg( |
+ StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin()))); |
return false; |
} |
@@ -2075,12 +2129,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, |
// a** is a syntax error, not a double-star. |
// (and a++ means something else entirely, which we don't support!) |
status->set_code(kRegexpRepeatOp); |
- status->set_error_arg(StringPiece(lastunary.begin(), |
- t.begin() - lastunary.begin())); |
+ status->set_error_arg( |
+ StringPiece(lastunary.begin(), |
+ static_cast<int>(t.begin() - lastunary.begin()))); |
return NULL; |
} |
} |
- opstr.set(opstr.data(), t.data() - opstr.data()); |
+ opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); |
if (!ps.PushRepeatOp(op, opstr, nongreedy)) |
return NULL; |
isunary = opstr; |
@@ -2106,12 +2161,13 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, |
if (lastunary.size() > 0) { |
// Not allowed to stack repetition operators. |
status->set_code(kRegexpRepeatOp); |
- status->set_error_arg(StringPiece(lastunary.begin(), |
- t.begin() - lastunary.begin())); |
+ status->set_error_arg( |
+ StringPiece(lastunary.begin(), |
+ static_cast<int>(t.begin() - lastunary.begin()))); |
return NULL; |
} |
} |
- opstr.set(opstr.data(), t.data() - opstr.data()); |
+ opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data())); |
if (!ps.PushRepetition(lo, hi, opstr, nongreedy)) |
return NULL; |
isunary = opstr; |
@@ -2187,7 +2243,7 @@ Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags, |
} |
} |
- UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); |
+ const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags()); |
if (g != NULL) { |
Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase); |
re->ccb_ = new CharClassBuilder; |