| OLD | NEW |
| (Empty) |
| 1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
| 2 // Use of this source code is governed by a BSD-style | |
| 3 // license that can be found in the LICENSE file. | |
| 4 | |
| 5 // Test simplify.cc. | |
| 6 | |
| 7 #include <string> | |
| 8 #include <vector> | |
| 9 #include "util/test.h" | |
| 10 #include "re2/regexp.h" | |
| 11 | |
| 12 namespace re2 { | |
| 13 | |
| 14 struct Test { | |
| 15 const char* regexp; | |
| 16 const char* simplified; | |
| 17 }; | |
| 18 | |
| 19 static Test tests[] = { | |
| 20 // Already-simple constructs | |
| 21 { "a", "a" }, | |
| 22 { "ab", "ab" }, | |
| 23 { "a|b", "[a-b]" }, | |
| 24 { "ab|cd", "ab|cd" }, | |
| 25 { "(ab)*", "(ab)*" }, | |
| 26 { "(ab)+", "(ab)+" }, | |
| 27 { "(ab)?", "(ab)?" }, | |
| 28 { ".", "." }, | |
| 29 { "^", "^" }, | |
| 30 { "$", "$" }, | |
| 31 { "[ac]", "[ac]" }, | |
| 32 { "[^ac]", "[^ac]" }, | |
| 33 | |
| 34 // Posix character classes | |
| 35 { "[[:alnum:]]", "[0-9A-Za-z]" }, | |
| 36 { "[[:alpha:]]", "[A-Za-z]" }, | |
| 37 { "[[:blank:]]", "[\\t ]" }, | |
| 38 { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" }, | |
| 39 { "[[:digit:]]", "[0-9]" }, | |
| 40 { "[[:graph:]]", "[!-~]" }, | |
| 41 { "[[:lower:]]", "[a-z]" }, | |
| 42 { "[[:print:]]", "[ -~]" }, | |
| 43 { "[[:punct:]]", "[!-/:-@\\[-`{-~]" }, | |
| 44 { "[[:space:]]" , "[\\t-\\r ]" }, | |
| 45 { "[[:upper:]]", "[A-Z]" }, | |
| 46 { "[[:xdigit:]]", "[0-9A-Fa-f]" }, | |
| 47 | |
| 48 // Perl character classes | |
| 49 { "\\d", "[0-9]" }, | |
| 50 { "\\s", "[\\t-\\n\\f-\\r ]" }, | |
| 51 { "\\w", "[0-9A-Z_a-z]" }, | |
| 52 { "\\D", "[^0-9]" }, | |
| 53 { "\\S", "[^\\t-\\n\\f-\\r ]" }, | |
| 54 { "\\W", "[^0-9A-Z_a-z]" }, | |
| 55 { "[\\d]", "[0-9]" }, | |
| 56 { "[\\s]", "[\\t-\\n\\f-\\r ]" }, | |
| 57 { "[\\w]", "[0-9A-Z_a-z]" }, | |
| 58 { "[\\D]", "[^0-9]" }, | |
| 59 { "[\\S]", "[^\\t-\\n\\f-\\r ]" }, | |
| 60 { "[\\W]", "[^0-9A-Z_a-z]" }, | |
| 61 | |
| 62 // Posix repetitions | |
| 63 { "a{1}", "a" }, | |
| 64 { "a{2}", "aa" }, | |
| 65 { "a{5}", "aaaaa" }, | |
| 66 { "a{0,1}", "a?" }, | |
| 67 // The next three are illegible because Simplify inserts (?:) | |
| 68 // parens instead of () parens to avoid creating extra | |
| 69 // captured subexpressions. The comments show a version fewer parens. | |
| 70 { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)? | |
| 71 { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)? | |
| 72 { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)? | |
| 73 { "a{0,2}", "(?:aa?)?" }, // (aa?)? | |
| 74 { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)? | |
| 75 { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)? | |
| 76 { "a{0,}", "a*" }, | |
| 77 { "a{1,}", "a+" }, | |
| 78 { "a{2,}", "aa+" }, | |
| 79 { "a{5,}", "aaaaa+" }, | |
| 80 | |
| 81 // Test that operators simplify their arguments. | |
| 82 // (Simplify used to not simplify arguments to a {} repeat.) | |
| 83 { "(?:a{1,}){1,}", "a+" }, | |
| 84 { "(a{1,}b{1,})", "(a+b+)" }, | |
| 85 { "a{1,}|b{1,}", "a+|b+" }, | |
| 86 { "(?:a{1,})*", "(?:a+)*" }, | |
| 87 { "(?:a{1,})+", "a+" }, | |
| 88 { "(?:a{1,})?", "(?:a+)?" }, | |
| 89 { "a{0}", "" }, | |
| 90 | |
| 91 // Character class simplification | |
| 92 { "[ab]", "[a-b]" }, | |
| 93 { "[a-za-za-z]", "[a-z]" }, | |
| 94 { "[A-Za-zA-Za-z]", "[A-Za-z]" }, | |
| 95 { "[ABCDEFGH]", "[A-H]" }, | |
| 96 { "[AB-CD-EF-GH]", "[A-H]" }, | |
| 97 { "[W-ZP-XE-R]", "[E-Z]" }, | |
| 98 { "[a-ee-gg-m]", "[a-m]" }, | |
| 99 { "[a-ea-ha-m]", "[a-m]" }, | |
| 100 { "[a-ma-ha-e]", "[a-m]" }, | |
| 101 { "[a-zA-Z0-9 -~]", "[ -~]" }, | |
| 102 | |
| 103 // Empty character classes | |
| 104 { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" }, | |
| 105 | |
| 106 // Full character classes | |
| 107 { "[[:cntrl:][:^cntrl:]]", "." }, | |
| 108 | |
| 109 // Unicode case folding. | |
| 110 { "(?i)A", "[Aa]" }, | |
| 111 { "(?i)a", "[Aa]" }, | |
| 112 { "(?i)K", "[Kk\\x{212a}]" }, | |
| 113 { "(?i)k", "[Kk\\x{212a}]" }, | |
| 114 { "(?i)\\x{212a}", "[Kk\\x{212a}]" }, | |
| 115 { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" }, | |
| 116 { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" }, | |
| 117 { "(?i)[\\x00-\\x{10ffff}]", "." }, | |
| 118 | |
| 119 // Empty string as a regular expression. | |
| 120 // Empty string must be preserved inside parens in order | |
| 121 // to make submatches work right, so these are less | |
| 122 // interesting than they used to be. ToString inserts | |
| 123 // explicit (?:) in place of non-parenthesized empty strings, | |
| 124 // to make them easier to spot for other parsers. | |
| 125 { "(a|b|)", "([a-b]|(?:))" }, | |
| 126 { "(|)", "()" }, | |
| 127 { "a()", "a()" }, | |
| 128 { "(()|())", "(()|())" }, | |
| 129 { "(a|)", "(a|(?:))" }, | |
| 130 { "ab()cd()", "ab()cd()" }, | |
| 131 { "()", "()" }, | |
| 132 { "()*", "()*" }, | |
| 133 { "()+", "()+" }, | |
| 134 { "()?" , "()?" }, | |
| 135 { "(){0}", "" }, | |
| 136 { "(){1}", "()" }, | |
| 137 { "(){1,}", "()+" }, | |
| 138 { "(){0,2}", "(?:()()?)?" }, | |
| 139 | |
| 140 // Test that coalescing occurs and that the resulting repeats are simplified. | |
| 141 // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal: | |
| 142 { "a*a*", "a*" }, | |
| 143 { "a*a+", "a+" }, | |
| 144 { "a*a?", "a*" }, | |
| 145 { "a*a{2}", "aa+" }, | |
| 146 { "a*a{2,}", "aa+" }, | |
| 147 { "a*a{2,3}", "aa+" }, | |
| 148 { "a+a*", "a+" }, | |
| 149 { "a+a+", "aa+" }, | |
| 150 { "a+a?", "a+" }, | |
| 151 { "a+a{2}", "aaa+" }, | |
| 152 { "a+a{2,}", "aaa+" }, | |
| 153 { "a+a{2,3}", "aaa+" }, | |
| 154 { "a?a*", "a*" }, | |
| 155 { "a?a+", "a+" }, | |
| 156 { "a?a?", "(?:aa?)?" }, | |
| 157 { "a?a{2}", "aaa?" }, | |
| 158 { "a?a{2,}", "aa+" }, | |
| 159 { "a?a{2,3}", "aa(?:aa?)?" }, | |
| 160 { "a{2}a*", "aa+" }, | |
| 161 { "a{2}a+", "aaa+" }, | |
| 162 { "a{2}a?", "aaa?" }, | |
| 163 { "a{2}a{2}", "aaaa" }, | |
| 164 { "a{2}a{2,}", "aaaa+" }, | |
| 165 { "a{2}a{2,3}", "aaaaa?" }, | |
| 166 { "a{2,}a*", "aa+" }, | |
| 167 { "a{2,}a+", "aaa+" }, | |
| 168 { "a{2,}a?", "aa+" }, | |
| 169 { "a{2,}a{2}", "aaaa+" }, | |
| 170 { "a{2,}a{2,}", "aaaa+" }, | |
| 171 { "a{2,}a{2,3}", "aaaa+" }, | |
| 172 { "a{2,3}a*", "aa+" }, | |
| 173 { "a{2,3}a+", "aaa+" }, | |
| 174 { "a{2,3}a?", "aa(?:aa?)?" }, | |
| 175 { "a{2,3}a{2}", "aaaaa?" }, | |
| 176 { "a{2,3}a{2,}", "aaaa+" }, | |
| 177 { "a{2,3}a{2,3}", "aaaa(?:aa?)?" }, | |
| 178 // With a char class, any char and any byte: | |
| 179 { "\\d*\\d*", "[0-9]*" }, | |
| 180 { ".*.*", ".*" }, | |
| 181 { "\\C*\\C*", "\\C*" }, | |
| 182 // FoldCase works, but must be consistent: | |
| 183 { "(?i)A*a*", "[Aa]*" }, | |
| 184 { "(?i)a+A+", "[Aa][Aa]+" }, | |
| 185 { "(?i)A*(?-i)a*", "[Aa]*a*" }, | |
| 186 { "(?i)a+(?-i)A+", "[Aa]+A+" }, | |
| 187 // NonGreedy works, but must be consistent: | |
| 188 { "a*?a*?", "a*?" }, | |
| 189 { "a+?a+?", "aa+?" }, | |
| 190 { "a*?a*", "a*?a*" }, | |
| 191 { "a+a+?", "a+a+?" }, | |
| 192 // The second element is the literal, char class, any char or any byte: | |
| 193 { "a*a", "a+" }, | |
| 194 { "\\d*\\d", "[0-9]+" }, | |
| 195 { ".*.", ".+" }, | |
| 196 { "\\C*\\C", "\\C+" }, | |
| 197 // FoldCase works, but must be consistent: | |
| 198 { "(?i)A*a", "[Aa]+" }, | |
| 199 { "(?i)a+A", "[Aa][Aa]+" }, | |
| 200 { "(?i)A*(?-i)a", "[Aa]*a" }, | |
| 201 { "(?i)a+(?-i)A", "[Aa]+A" }, | |
| 202 // The second element is a literal string that begins with the literal: | |
| 203 { "a*aa", "aa+" }, | |
| 204 { "a*aab", "aa+b" }, | |
| 205 // FoldCase works, but must be consistent: | |
| 206 { "(?i)a*aa", "[Aa][Aa]+" }, | |
| 207 { "(?i)a*aab", "[Aa][Aa]+[Bb]" }, | |
| 208 { "(?i)a*(?-i)aa", "[Aa]*aa" }, | |
| 209 { "(?i)a*(?-i)aab", "[Aa]*aab" }, | |
| 210 // Negative tests with mismatching ops: | |
| 211 { "a*b*", "a*b*" }, | |
| 212 { "\\d*\\D*", "[0-9]*[^0-9]*" }, | |
| 213 { "a+b", "a+b" }, | |
| 214 { "\\d+\\D", "[0-9]+[^0-9]" }, | |
| 215 { "a?bb", "a?bb" }, | |
| 216 // Negative tests with capturing groups: | |
| 217 { "(a*)a*", "(a*)a*" }, | |
| 218 { "a+(a)", "a+(a)" }, | |
| 219 { "(a?)(aa)", "(a?)(aa)" }, | |
| 220 // Just for fun: | |
| 221 { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" }, | |
| 222 | |
| 223 // During coalescing, the child of the repeat changes, so we build a new | |
| 224 // repeat. The new repeat must have the min and max of the old repeat. | |
| 225 // Failure to copy them results in min=0 and max=0 -> empty match. | |
| 226 { "(?:a*aab){2}", "aa+baa+b" }, | |
| 227 | |
| 228 // During coalescing, the child of the capture changes, so we build a new | |
| 229 // capture. The new capture must have the cap of the old capture. | |
| 230 // Failure to copy it results in cap=0 -> ToString() logs a fatal error. | |
| 231 { "(a*aab)", "(aa+b)" }, | |
| 232 }; | |
| 233 | |
| 234 TEST(TestSimplify, SimpleRegexps) { | |
| 235 for (int i = 0; i < arraysize(tests); i++) { | |
| 236 RegexpStatus status; | |
| 237 VLOG(1) << "Testing " << tests[i].regexp; | |
| 238 Regexp* re = Regexp::Parse(tests[i].regexp, | |
| 239 Regexp::MatchNL | (Regexp::LikePerl & | |
| 240 ~Regexp::OneLine), | |
| 241 &status); | |
| 242 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text(); | |
| 243 Regexp* sre = re->Simplify(); | |
| 244 CHECK(sre != NULL); | |
| 245 | |
| 246 // Check that already-simple regexps don't allocate new ones. | |
| 247 if (strcmp(tests[i].regexp, tests[i].simplified) == 0) { | |
| 248 CHECK(re == sre) << " " << tests[i].regexp | |
| 249 << " " << re->ToString() << " " << sre->ToString(); | |
| 250 } | |
| 251 | |
| 252 EXPECT_EQ(tests[i].simplified, sre->ToString()) | |
| 253 << " " << tests[i].regexp << " " << sre->Dump(); | |
| 254 | |
| 255 re->Decref(); | |
| 256 sre->Decref(); | |
| 257 } | |
| 258 } | |
| 259 | |
| 260 } // namespace re2 | |
| OLD | NEW |