OLD | NEW |
| (Empty) |
1 // Copyright 2008 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Determine whether this library should match PCRE exactly | |
6 // for a particular Regexp. (If so, the testing framework can | |
7 // check that it does.) | |
8 // | |
9 // This library matches PCRE except in these cases: | |
10 // * the regexp contains a repetition of an empty string, | |
11 // like (a*)* or (a*)+. In this case, PCRE will treat | |
12 // the repetition sequence as ending with an empty string, | |
13 // while this library does not. | |
14 // * Perl and PCRE differ on whether \v matches \n. | |
15 // For historical reasons, this library implements the Perl behavior. | |
16 // * Perl and PCRE allow $ in one-line mode to match either the very | |
17 // end of the text or just before a \n at the end of the text. | |
18 // This library requires it to match only the end of the text. | |
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to | |
20 // match the end of the text if the last character is a \n. | |
21 // This library does allow it. | |
22 // | |
23 // Regexp::MimicsPCRE checks for any of these conditions. | |
24 | |
25 #include "util/util.h" | |
26 #include "re2/regexp.h" | |
27 #include "re2/walker-inl.h" | |
28 | |
29 namespace re2 { | |
30 | |
31 // Returns whether re might match an empty string. | |
32 static bool CanBeEmptyString(Regexp *re); | |
33 | |
34 // Walker class to compute whether library handles a regexp | |
35 // exactly as PCRE would. See comment at top for conditions. | |
36 | |
37 class PCREWalker : public Regexp::Walker<bool> { | |
38 public: | |
39 PCREWalker() {} | |
40 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args, | |
41 int nchild_args); | |
42 | |
43 bool ShortVisit(Regexp* re, bool a) { | |
44 // Should never be called: we use Walk not WalkExponential. | |
45 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; | |
46 return a; | |
47 } | |
48 }; | |
49 | |
50 // Called after visiting each of re's children and accumulating | |
51 // the return values in child_args. So child_args contains whether | |
52 // this library mimics PCRE for those subexpressions. | |
53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
54 bool* child_args, int nchild_args) { | |
55 // If children failed, so do we. | |
56 for (int i = 0; i < nchild_args; i++) | |
57 if (!child_args[i]) | |
58 return false; | |
59 | |
60 // Otherwise look for other reasons to fail. | |
61 switch (re->op()) { | |
62 // Look for repeated empty string. | |
63 case kRegexpStar: | |
64 case kRegexpPlus: | |
65 case kRegexpQuest: | |
66 if (CanBeEmptyString(re->sub()[0])) | |
67 return false; | |
68 break; | |
69 case kRegexpRepeat: | |
70 if (re->max() == -1 && CanBeEmptyString(re->sub()[0])) | |
71 return false; | |
72 break; | |
73 | |
74 // Look for \v | |
75 case kRegexpLiteral: | |
76 if (re->rune() == '\v') | |
77 return false; | |
78 break; | |
79 | |
80 // Look for $ in single-line mode. | |
81 case kRegexpEndText: | |
82 case kRegexpEmptyMatch: | |
83 if (re->parse_flags() & Regexp::WasDollar) | |
84 return false; | |
85 break; | |
86 | |
87 // Look for ^ in multi-line mode. | |
88 case kRegexpBeginLine: | |
89 // No condition: in single-line mode ^ becomes kRegexpBeginText. | |
90 return false; | |
91 | |
92 default: | |
93 break; | |
94 } | |
95 | |
96 // Not proven guilty. | |
97 return true; | |
98 } | |
99 | |
100 // Returns whether this regexp's behavior will mimic PCRE's exactly. | |
101 bool Regexp::MimicsPCRE() { | |
102 PCREWalker w; | |
103 return w.Walk(this, true); | |
104 } | |
105 | |
106 | |
107 // Walker class to compute whether a Regexp can match an empty string. | |
108 // It is okay to overestimate. For example, \b\B cannot match an empty | |
109 // string, because \b and \B are mutually exclusive, but this isn't | |
110 // that smart and will say it can. Spurious empty strings | |
111 // will reduce the number of regexps we sanity check against PCRE, | |
112 // but they won't break anything. | |
113 | |
114 class EmptyStringWalker : public Regexp::Walker<bool> { | |
115 public: | |
116 EmptyStringWalker() { } | |
117 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
118 bool* child_args, int nchild_args); | |
119 | |
120 bool ShortVisit(Regexp* re, bool a) { | |
121 // Should never be called: we use Walk not WalkExponential. | |
122 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called"; | |
123 return a; | |
124 } | |
125 | |
126 private: | |
127 DISALLOW_COPY_AND_ASSIGN(EmptyStringWalker); | |
128 }; | |
129 | |
130 // Called after visiting re's children. child_args contains the return | |
131 // value from each of the children's PostVisits (i.e., whether each child | |
132 // can match an empty string). Returns whether this clause can match an | |
133 // empty string. | |
134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg, | |
135 bool* child_args, int nchild_args) { | |
136 switch (re->op()) { | |
137 case kRegexpNoMatch: // never empty | |
138 case kRegexpLiteral: | |
139 case kRegexpAnyChar: | |
140 case kRegexpAnyByte: | |
141 case kRegexpCharClass: | |
142 case kRegexpLiteralString: | |
143 return false; | |
144 | |
145 case kRegexpEmptyMatch: // always empty | |
146 case kRegexpBeginLine: // always empty, when they match | |
147 case kRegexpEndLine: | |
148 case kRegexpNoWordBoundary: | |
149 case kRegexpWordBoundary: | |
150 case kRegexpBeginText: | |
151 case kRegexpEndText: | |
152 case kRegexpStar: // can always be empty | |
153 case kRegexpQuest: | |
154 case kRegexpHaveMatch: | |
155 return true; | |
156 | |
157 case kRegexpConcat: // can be empty if all children can | |
158 for (int i = 0; i < nchild_args; i++) | |
159 if (!child_args[i]) | |
160 return false; | |
161 return true; | |
162 | |
163 case kRegexpAlternate: // can be empty if any child can | |
164 for (int i = 0; i < nchild_args; i++) | |
165 if (child_args[i]) | |
166 return true; | |
167 return false; | |
168 | |
169 case kRegexpPlus: // can be empty if the child can | |
170 case kRegexpCapture: | |
171 return child_args[0]; | |
172 | |
173 case kRegexpRepeat: // can be empty if child can or is x{0} | |
174 return child_args[0] || re->min() == 0; | |
175 } | |
176 return false; | |
177 } | |
178 | |
179 // Returns whether re can match an empty string. | |
180 static bool CanBeEmptyString(Regexp* re) { | |
181 EmptyStringWalker w; | |
182 return w.Walk(re, true); | |
183 } | |
184 | |
185 } // namespace re2 | |
OLD | NEW |