| Index: third_party/re2/re2/mimics_pcre.cc
|
| diff --git a/third_party/re2/re2/mimics_pcre.cc b/third_party/re2/re2/mimics_pcre.cc
|
| new file mode 100644
|
| index 0000000000000000000000000000000000000000..fc6dd4ad5941f9b1810464a1f3155ccadc48aa60
|
| --- /dev/null
|
| +++ b/third_party/re2/re2/mimics_pcre.cc
|
| @@ -0,0 +1,185 @@
|
| +// Copyright 2008 The RE2 Authors. All Rights Reserved.
|
| +// Use of this source code is governed by a BSD-style
|
| +// license that can be found in the LICENSE file.
|
| +
|
| +// Determine whether this library should match PCRE exactly
|
| +// for a particular Regexp. (If so, the testing framework can
|
| +// check that it does.)
|
| +//
|
| +// This library matches PCRE except in these cases:
|
| +// * the regexp contains a repetition of an empty string,
|
| +// like (a*)* or (a*)+. In this case, PCRE will treat
|
| +// the repetition sequence as ending with an empty string,
|
| +// while this library does not.
|
| +// * Perl and PCRE differ on whether \v matches \n.
|
| +// For historical reasons, this library implements the Perl behavior.
|
| +// * Perl and PCRE allow $ in one-line mode to match either the very
|
| +// end of the text or just before a \n at the end of the text.
|
| +// This library requires it to match only the end of the text.
|
| +// * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
|
| +// match the end of the text if the last character is a \n.
|
| +// This library does allow it.
|
| +//
|
| +// Regexp::MimicsPCRE checks for any of these conditions.
|
| +
|
| +#include "util/util.h"
|
| +#include "re2/regexp.h"
|
| +#include "re2/walker-inl.h"
|
| +
|
| +namespace re2 {
|
| +
|
| +// Returns whether re might match an empty string.
|
| +static bool CanBeEmptyString(Regexp *re);
|
| +
|
| +// Walker class to compute whether library handles a regexp
|
| +// exactly as PCRE would. See comment at top for conditions.
|
| +
|
| +class PCREWalker : public Regexp::Walker<bool> {
|
| + public:
|
| + PCREWalker() {}
|
| + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
|
| + int nchild_args);
|
| +
|
| + bool ShortVisit(Regexp* re, bool a) {
|
| + // Should never be called: we use Walk not WalkExponential.
|
| + LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
| + return a;
|
| + }
|
| +};
|
| +
|
| +// Called after visiting each of re's children and accumulating
|
| +// the return values in child_args. So child_args contains whether
|
| +// this library mimics PCRE for those subexpressions.
|
| +bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
| + bool* child_args, int nchild_args) {
|
| + // If children failed, so do we.
|
| + for (int i = 0; i < nchild_args; i++)
|
| + if (!child_args[i])
|
| + return false;
|
| +
|
| + // Otherwise look for other reasons to fail.
|
| + switch (re->op()) {
|
| + // Look for repeated empty string.
|
| + case kRegexpStar:
|
| + case kRegexpPlus:
|
| + case kRegexpQuest:
|
| + if (CanBeEmptyString(re->sub()[0]))
|
| + return false;
|
| + break;
|
| + case kRegexpRepeat:
|
| + if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
|
| + return false;
|
| + break;
|
| +
|
| + // Look for \v
|
| + case kRegexpLiteral:
|
| + if (re->rune() == '\v')
|
| + return false;
|
| + break;
|
| +
|
| + // Look for $ in single-line mode.
|
| + case kRegexpEndText:
|
| + case kRegexpEmptyMatch:
|
| + if (re->parse_flags() & Regexp::WasDollar)
|
| + return false;
|
| + break;
|
| +
|
| + // Look for ^ in multi-line mode.
|
| + case kRegexpBeginLine:
|
| + // No condition: in single-line mode ^ becomes kRegexpBeginText.
|
| + return false;
|
| +
|
| + default:
|
| + break;
|
| + }
|
| +
|
| + // Not proven guilty.
|
| + return true;
|
| +}
|
| +
|
| +// Returns whether this regexp's behavior will mimic PCRE's exactly.
|
| +bool Regexp::MimicsPCRE() {
|
| + PCREWalker w;
|
| + return w.Walk(this, true);
|
| +}
|
| +
|
| +
|
| +// Walker class to compute whether a Regexp can match an empty string.
|
| +// It is okay to overestimate. For example, \b\B cannot match an empty
|
| +// string, because \b and \B are mutually exclusive, but this isn't
|
| +// that smart and will say it can. Spurious empty strings
|
| +// will reduce the number of regexps we sanity check against PCRE,
|
| +// but they won't break anything.
|
| +
|
| +class EmptyStringWalker : public Regexp::Walker<bool> {
|
| + public:
|
| + EmptyStringWalker() { }
|
| + bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
| + bool* child_args, int nchild_args);
|
| +
|
| + bool ShortVisit(Regexp* re, bool a) {
|
| + // Should never be called: we use Walk not WalkExponential.
|
| + LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
|
| + return a;
|
| + }
|
| +
|
| + private:
|
| + DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
|
| +};
|
| +
|
| +// Called after visiting re's children. child_args contains the return
|
| +// value from each of the children's PostVisits (i.e., whether each child
|
| +// can match an empty string). Returns whether this clause can match an
|
| +// empty string.
|
| +bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
|
| + bool* child_args, int nchild_args) {
|
| + switch (re->op()) {
|
| + case kRegexpNoMatch: // never empty
|
| + case kRegexpLiteral:
|
| + case kRegexpAnyChar:
|
| + case kRegexpAnyByte:
|
| + case kRegexpCharClass:
|
| + case kRegexpLiteralString:
|
| + return false;
|
| +
|
| + case kRegexpEmptyMatch: // always empty
|
| + case kRegexpBeginLine: // always empty, when they match
|
| + case kRegexpEndLine:
|
| + case kRegexpNoWordBoundary:
|
| + case kRegexpWordBoundary:
|
| + case kRegexpBeginText:
|
| + case kRegexpEndText:
|
| + case kRegexpStar: // can always be empty
|
| + case kRegexpQuest:
|
| + case kRegexpHaveMatch:
|
| + return true;
|
| +
|
| + case kRegexpConcat: // can be empty if all children can
|
| + for (int i = 0; i < nchild_args; i++)
|
| + if (!child_args[i])
|
| + return false;
|
| + return true;
|
| +
|
| + case kRegexpAlternate: // can be empty if any child can
|
| + for (int i = 0; i < nchild_args; i++)
|
| + if (child_args[i])
|
| + return true;
|
| + return false;
|
| +
|
| + case kRegexpPlus: // can be empty if the child can
|
| + case kRegexpCapture:
|
| + return child_args[0];
|
| +
|
| + case kRegexpRepeat: // can be empty if child can or is x{0}
|
| + return child_args[0] || re->min() == 0;
|
| + }
|
| + return false;
|
| +}
|
| +
|
| +// Returns whether re can match an empty string.
|
| +static bool CanBeEmptyString(Regexp* re) {
|
| + EmptyStringWalker w;
|
| + return w.Walk(re, true);
|
| +}
|
| +
|
| +} // namespace re2
|
|
|