Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(12)

Side by Side Diff: third_party/re2/re2/testing/simplify_test.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/testing/set_test.cc ('k') | third_party/re2/re2/testing/string_generator.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Test simplify.cc.
6
7 #include <string>
8 #include <vector>
9 #include "util/test.h"
10 #include "re2/regexp.h"
11
12 namespace re2 {
13
14 struct Test {
15 const char* regexp;
16 const char* simplified;
17 };
18
19 static Test tests[] = {
20 // Already-simple constructs
21 { "a", "a" },
22 { "ab", "ab" },
23 { "a|b", "[a-b]" },
24 { "ab|cd", "ab|cd" },
25 { "(ab)*", "(ab)*" },
26 { "(ab)+", "(ab)+" },
27 { "(ab)?", "(ab)?" },
28 { ".", "." },
29 { "^", "^" },
30 { "$", "$" },
31 { "[ac]", "[ac]" },
32 { "[^ac]", "[^ac]" },
33
34 // Posix character classes
35 { "[[:alnum:]]", "[0-9A-Za-z]" },
36 { "[[:alpha:]]", "[A-Za-z]" },
37 { "[[:blank:]]", "[\\t ]" },
38 { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },
39 { "[[:digit:]]", "[0-9]" },
40 { "[[:graph:]]", "[!-~]" },
41 { "[[:lower:]]", "[a-z]" },
42 { "[[:print:]]", "[ -~]" },
43 { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },
44 { "[[:space:]]" , "[\\t-\\r ]" },
45 { "[[:upper:]]", "[A-Z]" },
46 { "[[:xdigit:]]", "[0-9A-Fa-f]" },
47
48 // Perl character classes
49 { "\\d", "[0-9]" },
50 { "\\s", "[\\t-\\n\\f-\\r ]" },
51 { "\\w", "[0-9A-Z_a-z]" },
52 { "\\D", "[^0-9]" },
53 { "\\S", "[^\\t-\\n\\f-\\r ]" },
54 { "\\W", "[^0-9A-Z_a-z]" },
55 { "[\\d]", "[0-9]" },
56 { "[\\s]", "[\\t-\\n\\f-\\r ]" },
57 { "[\\w]", "[0-9A-Z_a-z]" },
58 { "[\\D]", "[^0-9]" },
59 { "[\\S]", "[^\\t-\\n\\f-\\r ]" },
60 { "[\\W]", "[^0-9A-Z_a-z]" },
61
62 // Posix repetitions
63 { "a{1}", "a" },
64 { "a{2}", "aa" },
65 { "a{5}", "aaaaa" },
66 { "a{0,1}", "a?" },
67 // The next three are illegible because Simplify inserts (?:)
68 // parens instead of () parens to avoid creating extra
69 // captured subexpressions. The comments show a version fewer parens.
70 { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?
71 { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?
72 { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?
73 { "a{0,2}", "(?:aa?)?" }, // (aa?)?
74 { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?
75 { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?
76 { "a{0,}", "a*" },
77 { "a{1,}", "a+" },
78 { "a{2,}", "aa+" },
79 { "a{5,}", "aaaaa+" },
80
81 // Test that operators simplify their arguments.
82 // (Simplify used to not simplify arguments to a {} repeat.)
83 { "(?:a{1,}){1,}", "a+" },
84 { "(a{1,}b{1,})", "(a+b+)" },
85 { "a{1,}|b{1,}", "a+|b+" },
86 { "(?:a{1,})*", "(?:a+)*" },
87 { "(?:a{1,})+", "a+" },
88 { "(?:a{1,})?", "(?:a+)?" },
89 { "a{0}", "" },
90
91 // Character class simplification
92 { "[ab]", "[a-b]" },
93 { "[a-za-za-z]", "[a-z]" },
94 { "[A-Za-zA-Za-z]", "[A-Za-z]" },
95 { "[ABCDEFGH]", "[A-H]" },
96 { "[AB-CD-EF-GH]", "[A-H]" },
97 { "[W-ZP-XE-R]", "[E-Z]" },
98 { "[a-ee-gg-m]", "[a-m]" },
99 { "[a-ea-ha-m]", "[a-m]" },
100 { "[a-ma-ha-e]", "[a-m]" },
101 { "[a-zA-Z0-9 -~]", "[ -~]" },
102
103 // Empty character classes
104 { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },
105
106 // Full character classes
107 { "[[:cntrl:][:^cntrl:]]", "." },
108
109 // Unicode case folding.
110 { "(?i)A", "[Aa]" },
111 { "(?i)a", "[Aa]" },
112 { "(?i)K", "[Kk\\x{212a}]" },
113 { "(?i)k", "[Kk\\x{212a}]" },
114 { "(?i)\\x{212a}", "[Kk\\x{212a}]" },
115 { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },
116 { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },
117 { "(?i)[\\x00-\\x{10ffff}]", "." },
118
119 // Empty string as a regular expression.
120 // Empty string must be preserved inside parens in order
121 // to make submatches work right, so these are less
122 // interesting than they used to be. ToString inserts
123 // explicit (?:) in place of non-parenthesized empty strings,
124 // to make them easier to spot for other parsers.
125 { "(a|b|)", "([a-b]|(?:))" },
126 { "(|)", "()" },
127 { "a()", "a()" },
128 { "(()|())", "(()|())" },
129 { "(a|)", "(a|(?:))" },
130 { "ab()cd()", "ab()cd()" },
131 { "()", "()" },
132 { "()*", "()*" },
133 { "()+", "()+" },
134 { "()?" , "()?" },
135 { "(){0}", "" },
136 { "(){1}", "()" },
137 { "(){1,}", "()+" },
138 { "(){0,2}", "(?:()()?)?" },
139
140 // Test that coalescing occurs and that the resulting repeats are simplified.
141 // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:
142 { "a*a*", "a*" },
143 { "a*a+", "a+" },
144 { "a*a?", "a*" },
145 { "a*a{2}", "aa+" },
146 { "a*a{2,}", "aa+" },
147 { "a*a{2,3}", "aa+" },
148 { "a+a*", "a+" },
149 { "a+a+", "aa+" },
150 { "a+a?", "a+" },
151 { "a+a{2}", "aaa+" },
152 { "a+a{2,}", "aaa+" },
153 { "a+a{2,3}", "aaa+" },
154 { "a?a*", "a*" },
155 { "a?a+", "a+" },
156 { "a?a?", "(?:aa?)?" },
157 { "a?a{2}", "aaa?" },
158 { "a?a{2,}", "aa+" },
159 { "a?a{2,3}", "aa(?:aa?)?" },
160 { "a{2}a*", "aa+" },
161 { "a{2}a+", "aaa+" },
162 { "a{2}a?", "aaa?" },
163 { "a{2}a{2}", "aaaa" },
164 { "a{2}a{2,}", "aaaa+" },
165 { "a{2}a{2,3}", "aaaaa?" },
166 { "a{2,}a*", "aa+" },
167 { "a{2,}a+", "aaa+" },
168 { "a{2,}a?", "aa+" },
169 { "a{2,}a{2}", "aaaa+" },
170 { "a{2,}a{2,}", "aaaa+" },
171 { "a{2,}a{2,3}", "aaaa+" },
172 { "a{2,3}a*", "aa+" },
173 { "a{2,3}a+", "aaa+" },
174 { "a{2,3}a?", "aa(?:aa?)?" },
175 { "a{2,3}a{2}", "aaaaa?" },
176 { "a{2,3}a{2,}", "aaaa+" },
177 { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },
178 // With a char class, any char and any byte:
179 { "\\d*\\d*", "[0-9]*" },
180 { ".*.*", ".*" },
181 { "\\C*\\C*", "\\C*" },
182 // FoldCase works, but must be consistent:
183 { "(?i)A*a*", "[Aa]*" },
184 { "(?i)a+A+", "[Aa][Aa]+" },
185 { "(?i)A*(?-i)a*", "[Aa]*a*" },
186 { "(?i)a+(?-i)A+", "[Aa]+A+" },
187 // NonGreedy works, but must be consistent:
188 { "a*?a*?", "a*?" },
189 { "a+?a+?", "aa+?" },
190 { "a*?a*", "a*?a*" },
191 { "a+a+?", "a+a+?" },
192 // The second element is the literal, char class, any char or any byte:
193 { "a*a", "a+" },
194 { "\\d*\\d", "[0-9]+" },
195 { ".*.", ".+" },
196 { "\\C*\\C", "\\C+" },
197 // FoldCase works, but must be consistent:
198 { "(?i)A*a", "[Aa]+" },
199 { "(?i)a+A", "[Aa][Aa]+" },
200 { "(?i)A*(?-i)a", "[Aa]*a" },
201 { "(?i)a+(?-i)A", "[Aa]+A" },
202 // The second element is a literal string that begins with the literal:
203 { "a*aa", "aa+" },
204 { "a*aab", "aa+b" },
205 // FoldCase works, but must be consistent:
206 { "(?i)a*aa", "[Aa][Aa]+" },
207 { "(?i)a*aab", "[Aa][Aa]+[Bb]" },
208 { "(?i)a*(?-i)aa", "[Aa]*aa" },
209 { "(?i)a*(?-i)aab", "[Aa]*aab" },
210 // Negative tests with mismatching ops:
211 { "a*b*", "a*b*" },
212 { "\\d*\\D*", "[0-9]*[^0-9]*" },
213 { "a+b", "a+b" },
214 { "\\d+\\D", "[0-9]+[^0-9]" },
215 { "a?bb", "a?bb" },
216 // Negative tests with capturing groups:
217 { "(a*)a*", "(a*)a*" },
218 { "a+(a)", "a+(a)" },
219 { "(a?)(aa)", "(a?)(aa)" },
220 // Just for fun:
221 { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },
222
223 // During coalescing, the child of the repeat changes, so we build a new
224 // repeat. The new repeat must have the min and max of the old repeat.
225 // Failure to copy them results in min=0 and max=0 -> empty match.
226 { "(?:a*aab){2}", "aa+baa+b" },
227
228 // During coalescing, the child of the capture changes, so we build a new
229 // capture. The new capture must have the cap of the old capture.
230 // Failure to copy it results in cap=0 -> ToString() logs a fatal error.
231 { "(a*aab)", "(aa+b)" },
232 };
233
234 TEST(TestSimplify, SimpleRegexps) {
235 for (int i = 0; i < arraysize(tests); i++) {
236 RegexpStatus status;
237 VLOG(1) << "Testing " << tests[i].regexp;
238 Regexp* re = Regexp::Parse(tests[i].regexp,
239 Regexp::MatchNL | (Regexp::LikePerl &
240 ~Regexp::OneLine),
241 &status);
242 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();
243 Regexp* sre = re->Simplify();
244 CHECK(sre != NULL);
245
246 // Check that already-simple regexps don't allocate new ones.
247 if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {
248 CHECK(re == sre) << " " << tests[i].regexp
249 << " " << re->ToString() << " " << sre->ToString();
250 }
251
252 EXPECT_EQ(tests[i].simplified, sre->ToString())
253 << " " << tests[i].regexp << " " << sre->Dump();
254
255 re->Decref();
256 sre->Decref();
257 }
258 }
259
260 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/testing/set_test.cc ('k') | third_party/re2/re2/testing/string_generator.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698