third_party/re2/re2/testing/simplify_test.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/testing/simplify_test.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Test simplify.cc.

6

7 #include <string>

8 #include <vector>

9 #include "util/test.h"

10 #include "re2/regexp.h"

11

12 namespace re2 {

13

14 struct Test {

15 const char* regexp;

16 const char* simplified;

17 };

18

19 static Test tests[] = {

20 // Already-simple constructs

21 { "a", "a" },

22 { "ab", "ab" },

23 { "a\|b", "[a-b]" },

24 { "ab\|cd", "ab\|cd" },

25 { "(ab)", "(ab)" },

26 { "(ab)+", "(ab)+" },

27 { "(ab)?", "(ab)?" },

28 { ".", "." },

29 { "^", "^" },

30 { "$", "$" },

31 { "[ac]", "[ac]" },

32 { "[^ac]", "[^ac]" },

33

34 // Posix character classes

35 { "[[:alnum:]]", "[0-9A-Za-z]" },

36 { "[[:alpha:]]", "[A-Za-z]" },

37 { "[[:blank:]]", "[\\t ]" },

38 { "[[:cntrl:]]", "[\\x00-\\x1f\\x7f]" },

39 { "[[:digit:]]", "[0-9]" },

40 { "[[:graph:]]", "[!-~]" },

41 { "[[:lower:]]", "[a-z]" },

42 { "[[:print:]]", "[ -~]" },

43 { "[[:punct:]]", "[!-/:-@\\[-`{-~]" },

44 { "[[:space:]]" , "[\\t-\\r ]" },

45 { "[[:upper:]]", "[A-Z]" },

46 { "[[:xdigit:]]", "[0-9A-Fa-f]" },

47

48 // Perl character classes

49 { "\\d", "[0-9]" },

50 { "\\s", "[\\t-\\n\\f-\\r ]" },

51 { "\\w", "[0-9A-Z_a-z]" },

52 { "\\D", "[^0-9]" },

53 { "\\S", "[^\\t-\\n\\f-\\r ]" },

54 { "\\W", "[^0-9A-Z_a-z]" },

55 { "[\\d]", "[0-9]" },

56 { "[\\s]", "[\\t-\\n\\f-\\r ]" },

57 { "[\\w]", "[0-9A-Z_a-z]" },

58 { "[\\D]", "[^0-9]" },

59 { "[\\S]", "[^\\t-\\n\\f-\\r ]" },

60 { "[\\W]", "[^0-9A-Z_a-z]" },

61

62 // Posix repetitions

63 { "a{1}", "a" },

64 { "a{2}", "aa" },

65 { "a{5}", "aaaaa" },

66 { "a{0,1}", "a?" },

67 // The next three are illegible because Simplify inserts (?:)

68 // parens instead of () parens to avoid creating extra

69 // captured subexpressions. The comments show a version fewer parens.

70 { "(a){0,2}", "(?:(a)(a)?)?" }, // (aa?)?

71 { "(a){0,4}", "(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // (a(a(aa?)?)?)?

72 { "(a){2,6}", "(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?" }, // aa(a(a(aa?)?)?)?

73 { "a{0,2}", "(?:aa?)?" }, // (aa?)?

74 { "a{0,4}", "(?:a(?:a(?:aa?)?)?)?" }, // (a(a(aa?)?)?)?

75 { "a{2,6}", "aa(?:a(?:a(?:aa?)?)?)?" }, // aa(a(a(aa?)?)?)?

76 { "a{0,}", "a*" },

77 { "a{1,}", "a+" },

78 { "a{2,}", "aa+" },

79 { "a{5,}", "aaaaa+" },

80

81 // Test that operators simplify their arguments.

82 // (Simplify used to not simplify arguments to a {} repeat.)

83 { "(?:a{1,}){1,}", "a+" },

84 { "(a{1,}b{1,})", "(a+b+)" },

85 { "a{1,}\|b{1,}", "a+\|b+" },

86 { "(?:a{1,})", "(?:a+)" },

87 { "(?:a{1,})+", "a+" },

88 { "(?:a{1,})?", "(?:a+)?" },

89 { "a{0}", "" },

90

91 // Character class simplification

92 { "[ab]", "[a-b]" },

93 { "[a-za-za-z]", "[a-z]" },

94 { "[A-Za-zA-Za-z]", "[A-Za-z]" },

95 { "[ABCDEFGH]", "[A-H]" },

96 { "[AB-CD-EF-GH]", "[A-H]" },

97 { "[W-ZP-XE-R]", "[E-Z]" },

98 { "[a-ee-gg-m]", "[a-m]" },

99 { "[a-ea-ha-m]", "[a-m]" },

100 { "[a-ma-ha-e]", "[a-m]" },

101 { "[a-zA-Z0-9 -~]", "[ -~]" },

102

103 // Empty character classes

104 { "[^[:cntrl:][:^cntrl:]]", "[^\\x00-\\x{10ffff}]" },

105

106 // Full character classes

107 { "[[:cntrl:][:^cntrl:]]", "." },

108

109 // Unicode case folding.

110 { "(?i)A", "[Aa]" },

111 { "(?i)a", "[Aa]" },

112 { "(?i)K", "[Kk\\x{212a}]" },

113 { "(?i)k", "[Kk\\x{212a}]" },

114 { "(?i)\\x{212a}", "[Kk\\x{212a}]" },

115 { "(?i)[a-z]", "[A-Za-z\\x{17f}\\x{212a}]" },

116 { "(?i)[\\x00-\\x{FFFD}]", "[\\x00-\\x{fffd}]" },

117 { "(?i)[\\x00-\\x{10ffff}]", "." },

118

119 // Empty string as a regular expression.

120 // Empty string must be preserved inside parens in order

121 // to make submatches work right, so these are less

122 // interesting than they used to be. ToString inserts

123 // explicit (?:) in place of non-parenthesized empty strings,

124 // to make them easier to spot for other parsers.

125 { "(a\|b\|)", "([a-b]\|(?:))" },

126 { "(\|)", "()" },

127 { "a()", "a()" },

128 { "(()\|())", "(()\|())" },

129 { "(a\|)", "(a\|(?:))" },

130 { "ab()cd()", "ab()cd()" },

131 { "()", "()" },

132 { "()", "()" },

133 { "()+", "()+" },

134 { "()?" , "()?" },

135 { "(){0}", "" },

136 { "(){1}", "()" },

137 { "(){1,}", "()+" },

138 { "(){0,2}", "(?:()()?)?" },

139

140 // Test that coalescing occurs and that the resulting repeats are simplified.

141 // Two-op combinations of *, +, ?, {n}, {n,} and {n,m} with a literal:

142 { "aa", "a*" },

143 { "a*a+", "a+" },

144 { "aa?", "a" },

145 { "a*a{2}", "aa+" },

146 { "a*a{2,}", "aa+" },

147 { "a*a{2,3}", "aa+" },

148 { "a+a*", "a+" },

149 { "a+a+", "aa+" },

150 { "a+a?", "a+" },

151 { "a+a{2}", "aaa+" },

152 { "a+a{2,}", "aaa+" },

153 { "a+a{2,3}", "aaa+" },

154 { "a?a", "a" },

155 { "a?a+", "a+" },

156 { "a?a?", "(?:aa?)?" },

157 { "a?a{2}", "aaa?" },

158 { "a?a{2,}", "aa+" },

159 { "a?a{2,3}", "aa(?:aa?)?" },

160 { "a{2}a*", "aa+" },

161 { "a{2}a+", "aaa+" },

162 { "a{2}a?", "aaa?" },

163 { "a{2}a{2}", "aaaa" },

164 { "a{2}a{2,}", "aaaa+" },

165 { "a{2}a{2,3}", "aaaaa?" },

166 { "a{2,}a*", "aa+" },

167 { "a{2,}a+", "aaa+" },

168 { "a{2,}a?", "aa+" },

169 { "a{2,}a{2}", "aaaa+" },

170 { "a{2,}a{2,}", "aaaa+" },

171 { "a{2,}a{2,3}", "aaaa+" },

172 { "a{2,3}a*", "aa+" },

173 { "a{2,3}a+", "aaa+" },

174 { "a{2,3}a?", "aa(?:aa?)?" },

175 { "a{2,3}a{2}", "aaaaa?" },

176 { "a{2,3}a{2,}", "aaaa+" },

177 { "a{2,3}a{2,3}", "aaaa(?:aa?)?" },

178 // With a char class, any char and any byte:

179 { "\\d\\d", "[0-9]*" },

180 { "..", ".*" },

181 { "\\C\\C", "\\C*" },

182 // FoldCase works, but must be consistent:

183 { "(?i)Aa", "[Aa]*" },

184 { "(?i)a+A+", "[Aa][Aa]+" },

185 { "(?i)A(?-i)a", "[Aa]a" },

186 { "(?i)a+(?-i)A+", "[Aa]+A+" },

187 // NonGreedy works, but must be consistent:

188 { "a?a?", "a*?" },

189 { "a+?a+?", "aa+?" },

190 { "a?a", "a?a" },

191 { "a+a+?", "a+a+?" },

192 // The second element is the literal, char class, any char or any byte:

193 { "a*a", "a+" },

194 { "\\d*\\d", "[0-9]+" },

195 { ".*.", ".+" },

196 { "\\C*\\C", "\\C+" },

197 // FoldCase works, but must be consistent:

198 { "(?i)A*a", "[Aa]+" },

199 { "(?i)a+A", "[Aa][Aa]+" },

200 { "(?i)A(?-i)a", "[Aa]a" },

201 { "(?i)a+(?-i)A", "[Aa]+A" },

202 // The second element is a literal string that begins with the literal:

203 { "a*aa", "aa+" },

204 { "a*aab", "aa+b" },

205 // FoldCase works, but must be consistent:

206 { "(?i)a*aa", "[Aa][Aa]+" },

207 { "(?i)a*aab", "[Aa][Aa]+[Bb]" },

208 { "(?i)a(?-i)aa", "[Aa]aa" },

209 { "(?i)a(?-i)aab", "[Aa]aab" },

210 // Negative tests with mismatching ops:

211 { "ab", "ab" },

212 { "\\d\\D", "[0-9][^0-9]" },

213 { "a+b", "a+b" },

214 { "\\d+\\D", "[0-9]+[^0-9]" },

215 { "a?bb", "a?bb" },

216 // Negative tests with capturing groups:

217 { "(a)a", "(a)a" },

218 { "a+(a)", "a+(a)" },

219 { "(a?)(aa)", "(a?)(aa)" },

220 // Just for fun:

221 { "aa*aa+aa?aa{2}aaa{2,}aaa{2,3}a", "aaaaaaaaaaaaaaaa+" },

222

223 // During coalescing, the child of the repeat changes, so we build a new

224 // repeat. The new repeat must have the min and max of the old repeat.

225 // Failure to copy them results in min=0 and max=0 -> empty match.

226 { "(?:a*aab){2}", "aa+baa+b" },

227

228 // During coalescing, the child of the capture changes, so we build a new

229 // capture. The new capture must have the cap of the old capture.

230 // Failure to copy it results in cap=0 -> ToString() logs a fatal error.

231 { "(a*aab)", "(aa+b)" },

232 };

233

234 TEST(TestSimplify, SimpleRegexps) {

235 for (int i = 0; i < arraysize(tests); i++) {

236 RegexpStatus status;

237 VLOG(1) << "Testing " << tests[i].regexp;

238 Regexp* re = Regexp::Parse(tests[i].regexp,

239 Regexp::MatchNL \| (Regexp::LikePerl &

240 ~Regexp::OneLine),

241 &status);

242 CHECK(re != NULL) << " " << tests[i].regexp << " " << status.Text();

243 Regexp* sre = re->Simplify();

244 CHECK(sre != NULL);

245

246 // Check that already-simple regexps don't allocate new ones.

247 if (strcmp(tests[i].regexp, tests[i].simplified) == 0) {

248 CHECK(re == sre) << " " << tests[i].regexp

249 << " " << re->ToString() << " " << sre->ToString();

250 }

251

252 EXPECT_EQ(tests[i].simplified, sre->ToString())

253 << " " << tests[i].regexp << " " << sre->Dump();

254

255 re->Decref();

256 sre->Decref();

257 }

258 }

259

260 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/testing/set_test.cc ('k') | third_party/re2/re2/testing/string_generator.h » ('j') | no next file with comments »