| OLD | NEW |
| (Empty) |
| 1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Determine whether this library should match PCRE exactly | |
| 6 // for a particular Regexp. (If so, the testing framework can | |
| 7 // check that it does.) | |
| 8 // | |
| 9 // This library matches PCRE except in these cases: | |
| 10 // * the regexp contains a repetition of an empty string, | |
| 11 // like (a*)* or (a*)+. In this case, PCRE will treat | |
| 12 // the repetition sequence as ending with an empty string, | |
| 13 // while this library does not. | |
| 14 // * Perl and PCRE differ on whether \v matches \n. | |
| 15 // For historical reasons, this library implements the Perl behavior. | |
| 16 // * Perl and PCRE allow $ in one-line mode to match either the very | |
| 17 // end of the text or just before a \n at the end of the text. | |
| 18 // This library requires it to match only the end of the text. | |
| 19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to | |
| 20 // match the end of the text if the last character is a \n. | |
| 21 // This library does allow it. | |
| 22 // | |
| 23 // Regexp::MimicsPCRE checks for any of these conditions. | |
| 24 | |
| 25 #include "util/util.h" | |
| 26 #include "re2/regexp.h" | |
| 27 #include "re2/walker-inl.h" | |
| 28 | |
| 29 namespace re2 { | |
| 30 | |
| 31 // Returns whether re might match an empty string. | |
| 32 static bool CanBeEmptyString(Regexp *re); | |
| 33 | |
| 34 // Walker class to compute whether library handles a regexp | |
| 35 // exactly as PCRE would. See comment at top for conditions. | |
| 36 | |
| 37 class PCREWalker : public Regexp::Walker<bool> { | |
| 38 public: | |
| 39 PCREWalker() {} | |
| 40 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, | |
| 41 int nchild_args); | |
| 42 | |
| 43 bool ShortVisit(Regexp* re, bool a) { | |
| 44 // Should never be called: we use Walk not WalkExponential. | |
| 45 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; | |
| 46 return a; | |
| 47 } | |
| 48 }; | |
| 49 | |
| 50 // Called after visiting each of re's children and accumulating | |
| 51 // the return values in child_args. So child_args contains whether | |
| 52 // this library mimics PCRE for those subexpressions. | |
| 53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
| 54 bool* child_args, int nchild_args) { | |
| 55 // If children failed, so do we. | |
| 56 for (int i = 0; i < nchild_args; i++) | |
| 57 if (!child_args[i]) | |
| 58 return false; | |
| 59 | |
| 60 // Otherwise look for other reasons to fail. | |
| 61 switch (re->op()) { | |
| 62 // Look for repeated empty string. | |
| 63 case kRegexpStar: | |
| 64 case kRegexpPlus: | |
| 65 case kRegexpQuest: | |
| 66 if (CanBeEmptyString(re->sub()[0])) | |
| 67 return false; | |
| 68 break; | |
| 69 case kRegexpRepeat: | |
| 70 if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) | |
| 71 return false; | |
| 72 break; | |
| 73 | |
| 74 // Look for \v | |
| 75 case kRegexpLiteral: | |
| 76 if (re->rune() == '\v') | |
| 77 return false; | |
| 78 break; | |
| 79 | |
| 80 // Look for $ in single-line mode. | |
| 81 case kRegexpEndText: | |
| 82 case kRegexpEmptyMatch: | |
| 83 if (re->parse_flags() & Regexp::WasDollar) | |
| 84 return false; | |
| 85 break; | |
| 86 | |
| 87 // Look for ^ in multi-line mode. | |
| 88 case kRegexpBeginLine: | |
| 89 // No condition: in single-line mode ^ becomes kRegexpBeginText. | |
| 90 return false; | |
| 91 | |
| 92 default: | |
| 93 break; | |
| 94 } | |
| 95 | |
| 96 // Not proven guilty. | |
| 97 return true; | |
| 98 } | |
| 99 | |
| 100 // Returns whether this regexp's behavior will mimic PCRE's exactly. | |
| 101 bool Regexp::MimicsPCRE() { | |
| 102 PCREWalker w; | |
| 103 return w.Walk(this, true); | |
| 104 } | |
| 105 | |
| 106 | |
| 107 // Walker class to compute whether a Regexp can match an empty string. | |
| 108 // It is okay to overestimate. For example, \b\B cannot match an empty | |
| 109 // string, because \b and \B are mutually exclusive, but this isn't | |
| 110 // that smart and will say it can. Spurious empty strings | |
| 111 // will reduce the number of regexps we sanity check against PCRE, | |
| 112 // but they won't break anything. | |
| 113 | |
| 114 class EmptyStringWalker : public Regexp::Walker<bool> { | |
| 115 public: | |
| 116 EmptyStringWalker() { } | |
| 117 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
| 118 bool* child_args, int nchild_args); | |
| 119 | |
| 120 bool ShortVisit(Regexp* re, bool a) { | |
| 121 // Should never be called: we use Walk not WalkExponential. | |
| 122 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; | |
| 123 return a; | |
| 124 } | |
| 125 | |
| 126 private: | |
| 127 DISALLOW_COPY_AND_ASSIGN(EmptyStringWalker); | |
| 128 }; | |
| 129 | |
| 130 // Called after visiting re's children. child_args contains the return | |
| 131 // value from each of the children's PostVisits (i.e., whether each child | |
| 132 // can match an empty string). Returns whether this clause can match an | |
| 133 // empty string. | |
| 134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
| 135 bool* child_args, int nchild_args) { | |
| 136 switch (re->op()) { | |
| 137 case kRegexpNoMatch: // never empty | |
| 138 case kRegexpLiteral: | |
| 139 case kRegexpAnyChar: | |
| 140 case kRegexpAnyByte: | |
| 141 case kRegexpCharClass: | |
| 142 case kRegexpLiteralString: | |
| 143 return false; | |
| 144 | |
| 145 case kRegexpEmptyMatch: // always empty | |
| 146 case kRegexpBeginLine: // always empty, when they match | |
| 147 case kRegexpEndLine: | |
| 148 case kRegexpNoWordBoundary: | |
| 149 case kRegexpWordBoundary: | |
| 150 case kRegexpBeginText: | |
| 151 case kRegexpEndText: | |
| 152 case kRegexpStar: // can always be empty | |
| 153 case kRegexpQuest: | |
| 154 case kRegexpHaveMatch: | |
| 155 return true; | |
| 156 | |
| 157 case kRegexpConcat: // can be empty if all children can | |
| 158 for (int i = 0; i < nchild_args; i++) | |
| 159 if (!child_args[i]) | |
| 160 return false; | |
| 161 return true; | |
| 162 | |
| 163 case kRegexpAlternate: // can be empty if any child can | |
| 164 for (int i = 0; i < nchild_args; i++) | |
| 165 if (child_args[i]) | |
| 166 return true; | |
| 167 return false; | |
| 168 | |
| 169 case kRegexpPlus: // can be empty if the child can | |
| 170 case kRegexpCapture: | |
| 171 return child_args[0]; | |
| 172 | |
| 173 case kRegexpRepeat: // can be empty if child can or is x{0} | |
| 174 return child_args[0] || re->min() == 0; | |
| 175 } | |
| 176 return false; | |
| 177 } | |
| 178 | |
| 179 // Returns whether re can match an empty string. | |
| 180 static bool CanBeEmptyString(Regexp* re) { | |
| 181 EmptyStringWalker w; | |
| 182 return w.Walk(re, true); | |
| 183 } | |
| 184 | |
| 185 } // namespace re2 | |
| OLD | NEW |