OLD | NEW |
| (Empty) |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Format a regular expression structure as a string. | |
6 // Tested by parse_test.cc | |
7 | |
8 #include "util/util.h" | |
9 #include "re2/regexp.h" | |
10 #include "re2/walker-inl.h" | |
11 | |
12 namespace re2 { | |
13 | |
14 enum { | |
15 PrecAtom, | |
16 PrecUnary, | |
17 PrecConcat, | |
18 PrecAlternate, | |
19 PrecEmpty, | |
20 PrecParen, | |
21 PrecToplevel, | |
22 }; | |
23 | |
24 // Helper function. See description below. | |
25 static void AppendCCRange(string* t, Rune lo, Rune hi); | |
26 | |
27 // Walker to generate string in s_. | |
28 // The arg pointers are actually integers giving the | |
29 // context precedence. | |
30 // The child_args are always NULL. | |
31 class ToStringWalker : public Regexp::Walker<int> { | |
32 public: | |
33 explicit ToStringWalker(string* t) : t_(t) {} | |
34 | |
35 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); | |
36 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, | |
37 int* child_args, int nchild_args); | |
38 virtual int ShortVisit(Regexp* re, int parent_arg) { | |
39 return 0; | |
40 } | |
41 | |
42 private: | |
43 string* t_; // The string the walker appends to. | |
44 | |
45 DISALLOW_COPY_AND_ASSIGN(ToStringWalker); | |
46 }; | |
47 | |
48 string Regexp::ToString() { | |
49 string t; | |
50 ToStringWalker w(&t); | |
51 w.WalkExponential(this, PrecToplevel, 100000); | |
52 if (w.stopped_early()) | |
53 t += " [truncated]"; | |
54 return t; | |
55 } | |
56 | |
57 #define ToString DontCallToString // Avoid accidental recursion. | |
58 | |
59 // Visits re before children are processed. | |
60 // Appends ( if needed and passes new precedence to children. | |
61 int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { | |
62 int prec = parent_arg; | |
63 int nprec = PrecAtom; | |
64 | |
65 switch (re->op()) { | |
66 case kRegexpNoMatch: | |
67 case kRegexpEmptyMatch: | |
68 case kRegexpLiteral: | |
69 case kRegexpAnyChar: | |
70 case kRegexpAnyByte: | |
71 case kRegexpBeginLine: | |
72 case kRegexpEndLine: | |
73 case kRegexpBeginText: | |
74 case kRegexpEndText: | |
75 case kRegexpWordBoundary: | |
76 case kRegexpNoWordBoundary: | |
77 case kRegexpCharClass: | |
78 case kRegexpHaveMatch: | |
79 nprec = PrecAtom; | |
80 break; | |
81 | |
82 case kRegexpConcat: | |
83 case kRegexpLiteralString: | |
84 if (prec < PrecConcat) | |
85 t_->append("(?:"); | |
86 nprec = PrecConcat; | |
87 break; | |
88 | |
89 case kRegexpAlternate: | |
90 if (prec < PrecAlternate) | |
91 t_->append("(?:"); | |
92 nprec = PrecAlternate; | |
93 break; | |
94 | |
95 case kRegexpCapture: | |
96 t_->append("("); | |
97 if (re->cap() == 0) | |
98 LOG(DFATAL) << "kRegexpCapture cap() == 0"; | |
99 if (re->name()) { | |
100 t_->append("?P<"); | |
101 t_->append(*re->name()); | |
102 t_->append(">"); | |
103 } | |
104 nprec = PrecParen; | |
105 break; | |
106 | |
107 case kRegexpStar: | |
108 case kRegexpPlus: | |
109 case kRegexpQuest: | |
110 case kRegexpRepeat: | |
111 if (prec < PrecUnary) | |
112 t_->append("(?:"); | |
113 // The subprecedence here is PrecAtom instead of PrecUnary | |
114 // because PCRE treats two unary ops in a row as a parse error. | |
115 nprec = PrecAtom; | |
116 break; | |
117 } | |
118 | |
119 return nprec; | |
120 } | |
121 | |
122 static void AppendLiteral(string *t, Rune r, bool foldcase) { | |
123 if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { | |
124 t->append(1, '\\'); | |
125 t->append(1, static_cast<char>(r)); | |
126 } else if (foldcase && 'a' <= r && r <= 'z') { | |
127 if ('a' <= r && r <= 'z') | |
128 r += 'A' - 'a'; | |
129 t->append(1, '['); | |
130 t->append(1, static_cast<char>(r)); | |
131 t->append(1, static_cast<char>(r) + 'a' - 'A'); | |
132 t->append(1, ']'); | |
133 } else { | |
134 AppendCCRange(t, r, r); | |
135 } | |
136 } | |
137 | |
138 // Visits re after children are processed. | |
139 // For childless regexps, all the work is done here. | |
140 // For regexps with children, append any unary suffixes or ). | |
141 int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, | |
142 int* child_args, int nchild_args) { | |
143 int prec = parent_arg; | |
144 switch (re->op()) { | |
145 case kRegexpNoMatch: | |
146 // There's no simple symbol for "no match", but | |
147 // [^0-Runemax] excludes everything. | |
148 t_->append("[^\\x00-\\x{10ffff}]"); | |
149 break; | |
150 | |
151 case kRegexpEmptyMatch: | |
152 // Append (?:) to make empty string visible, | |
153 // unless this is already being parenthesized. | |
154 if (prec < PrecEmpty) | |
155 t_->append("(?:)"); | |
156 break; | |
157 | |
158 case kRegexpLiteral: | |
159 AppendLiteral(t_, re->rune(), | |
160 (re->parse_flags() & Regexp::FoldCase) != 0); | |
161 break; | |
162 | |
163 case kRegexpLiteralString: | |
164 for (int i = 0; i < re->nrunes(); i++) | |
165 AppendLiteral(t_, re->runes()[i], | |
166 (re->parse_flags() & Regexp::FoldCase) != 0); | |
167 if (prec < PrecConcat) | |
168 t_->append(")"); | |
169 break; | |
170 | |
171 case kRegexpConcat: | |
172 if (prec < PrecConcat) | |
173 t_->append(")"); | |
174 break; | |
175 | |
176 case kRegexpAlternate: | |
177 // Clumsy but workable: the children all appended | | |
178 // at the end of their strings, so just remove the last one. | |
179 if ((*t_)[t_->size()-1] == '|') | |
180 t_->erase(t_->size()-1); | |
181 else | |
182 LOG(DFATAL) << "Bad final char: " << t_; | |
183 if (prec < PrecAlternate) | |
184 t_->append(")"); | |
185 break; | |
186 | |
187 case kRegexpStar: | |
188 t_->append("*"); | |
189 if (re->parse_flags() & Regexp::NonGreedy) | |
190 t_->append("?"); | |
191 if (prec < PrecUnary) | |
192 t_->append(")"); | |
193 break; | |
194 | |
195 case kRegexpPlus: | |
196 t_->append("+"); | |
197 if (re->parse_flags() & Regexp::NonGreedy) | |
198 t_->append("?"); | |
199 if (prec < PrecUnary) | |
200 t_->append(")"); | |
201 break; | |
202 | |
203 case kRegexpQuest: | |
204 t_->append("?"); | |
205 if (re->parse_flags() & Regexp::NonGreedy) | |
206 t_->append("?"); | |
207 if (prec < PrecUnary) | |
208 t_->append(")"); | |
209 break; | |
210 | |
211 case kRegexpRepeat: | |
212 if (re->max() == -1) | |
213 t_->append(StringPrintf("{%d,}", re->min())); | |
214 else if (re->min() == re->max()) | |
215 t_->append(StringPrintf("{%d}", re->min())); | |
216 else | |
217 t_->append(StringPrintf("{%d,%d}", re->min(), re->max())); | |
218 if (re->parse_flags() & Regexp::NonGreedy) | |
219 t_->append("?"); | |
220 if (prec < PrecUnary) | |
221 t_->append(")"); | |
222 break; | |
223 | |
224 case kRegexpAnyChar: | |
225 t_->append("."); | |
226 break; | |
227 | |
228 case kRegexpAnyByte: | |
229 t_->append("\\C"); | |
230 break; | |
231 | |
232 case kRegexpBeginLine: | |
233 t_->append("^"); | |
234 break; | |
235 | |
236 case kRegexpEndLine: | |
237 t_->append("$"); | |
238 break; | |
239 | |
240 case kRegexpBeginText: | |
241 t_->append("(?-m:^)"); | |
242 break; | |
243 | |
244 case kRegexpEndText: | |
245 if (re->parse_flags() & Regexp::WasDollar) | |
246 t_->append("(?-m:$)"); | |
247 else | |
248 t_->append("\\z"); | |
249 break; | |
250 | |
251 case kRegexpWordBoundary: | |
252 t_->append("\\b"); | |
253 break; | |
254 | |
255 case kRegexpNoWordBoundary: | |
256 t_->append("\\B"); | |
257 break; | |
258 | |
259 case kRegexpCharClass: { | |
260 if (re->cc()->size() == 0) { | |
261 t_->append("[^\\x00-\\x{10ffff}]"); | |
262 break; | |
263 } | |
264 t_->append("["); | |
265 // Heuristic: show class as negated if it contains the | |
266 // non-character 0xFFFE. | |
267 CharClass* cc = re->cc(); | |
268 if (cc->Contains(0xFFFE)) { | |
269 cc = cc->Negate(); | |
270 t_->append("^"); | |
271 } | |
272 for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) | |
273 AppendCCRange(t_, i->lo, i->hi); | |
274 if (cc != re->cc()) | |
275 cc->Delete(); | |
276 t_->append("]"); | |
277 break; | |
278 } | |
279 | |
280 case kRegexpCapture: | |
281 t_->append(")"); | |
282 break; | |
283 | |
284 case kRegexpHaveMatch: | |
285 // There's no syntax accepted by the parser to generate | |
286 // this node (it is generated by RE2::Set) so make something | |
287 // up that is readable but won't compile. | |
288 t_->append("(?HaveMatch:%d)", re->match_id()); | |
289 break; | |
290 } | |
291 | |
292 // If the parent is an alternation, append the | for it. | |
293 if (prec == PrecAlternate) | |
294 t_->append("|"); | |
295 | |
296 return 0; | |
297 } | |
298 | |
299 // Appends a rune for use in a character class to the string t. | |
300 static void AppendCCChar(string* t, Rune r) { | |
301 if (0x20 <= r && r <= 0x7E) { | |
302 if (strchr("[]^-\\", r)) | |
303 t->append("\\"); | |
304 t->append(1, static_cast<char>(r)); | |
305 return; | |
306 } | |
307 switch (r) { | |
308 default: | |
309 break; | |
310 | |
311 case '\r': | |
312 t->append("\\r"); | |
313 return; | |
314 | |
315 case '\t': | |
316 t->append("\\t"); | |
317 return; | |
318 | |
319 case '\n': | |
320 t->append("\\n"); | |
321 return; | |
322 | |
323 case '\f': | |
324 t->append("\\f"); | |
325 return; | |
326 } | |
327 | |
328 if (r < 0x100) { | |
329 StringAppendF(t, "\\x%02x", static_cast<int>(r)); | |
330 return; | |
331 } | |
332 StringAppendF(t, "\\x{%x}", static_cast<int>(r)); | |
333 } | |
334 | |
335 static void AppendCCRange(string* t, Rune lo, Rune hi) { | |
336 if (lo > hi) | |
337 return; | |
338 AppendCCChar(t, lo); | |
339 if (lo < hi) { | |
340 t->append("-"); | |
341 AppendCCChar(t, hi); | |
342 } | |
343 } | |
344 | |
345 } // namespace re2 | |
OLD | NEW |