OLD | NEW |
| (Empty) |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Rewrite POSIX and other features in re | |
6 // to use simple extended regular expression features. | |
7 // Also sort and simplify character classes. | |
8 | |
9 #include "util/util.h" | |
10 #include "re2/regexp.h" | |
11 #include "re2/walker-inl.h" | |
12 | |
13 namespace re2 { | |
14 | |
15 // Parses the regexp src and then simplifies it and sets *dst to the | |
16 // string representation of the simplified form. Returns true on success. | |
17 // Returns false and sets *error (if error != NULL) on error. | |
18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags, | |
19 string* dst, | |
20 RegexpStatus* status) { | |
21 Regexp* re = Parse(src, flags, status); | |
22 if (re == NULL) | |
23 return false; | |
24 Regexp* sre = re->Simplify(); | |
25 re->Decref(); | |
26 if (sre == NULL) { | |
27 // Should not happen, since Simplify never fails. | |
28 LOG(ERROR) << "Simplify failed on " << src; | |
29 if (status) { | |
30 status->set_code(kRegexpInternalError); | |
31 status->set_error_arg(src); | |
32 } | |
33 return false; | |
34 } | |
35 *dst = sre->ToString(); | |
36 sre->Decref(); | |
37 return true; | |
38 } | |
39 | |
40 // Assuming the simple_ flags on the children are accurate, | |
41 // is this Regexp* simple? | |
42 bool Regexp::ComputeSimple() { | |
43 Regexp** subs; | |
44 switch (op_) { | |
45 case kRegexpNoMatch: | |
46 case kRegexpEmptyMatch: | |
47 case kRegexpLiteral: | |
48 case kRegexpLiteralString: | |
49 case kRegexpBeginLine: | |
50 case kRegexpEndLine: | |
51 case kRegexpBeginText: | |
52 case kRegexpWordBoundary: | |
53 case kRegexpNoWordBoundary: | |
54 case kRegexpEndText: | |
55 case kRegexpAnyChar: | |
56 case kRegexpAnyByte: | |
57 case kRegexpHaveMatch: | |
58 return true; | |
59 case kRegexpConcat: | |
60 case kRegexpAlternate: | |
61 // These are simple as long as the subpieces are simple. | |
62 subs = sub(); | |
63 for (int i = 0; i < nsub_; i++) | |
64 if (!subs[i]->simple()) | |
65 return false; | |
66 return true; | |
67 case kRegexpCharClass: | |
68 // Simple as long as the char class is not empty, not full. | |
69 if (ccb_ != NULL) | |
70 return !ccb_->empty() && !ccb_->full(); | |
71 return !cc_->empty() && !cc_->full(); | |
72 case kRegexpCapture: | |
73 subs = sub(); | |
74 return subs[0]->simple(); | |
75 case kRegexpStar: | |
76 case kRegexpPlus: | |
77 case kRegexpQuest: | |
78 subs = sub(); | |
79 if (!subs[0]->simple()) | |
80 return false; | |
81 switch (subs[0]->op_) { | |
82 case kRegexpStar: | |
83 case kRegexpPlus: | |
84 case kRegexpQuest: | |
85 case kRegexpEmptyMatch: | |
86 case kRegexpNoMatch: | |
87 return false; | |
88 default: | |
89 break; | |
90 } | |
91 return true; | |
92 case kRegexpRepeat: | |
93 return false; | |
94 } | |
95 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_; | |
96 return false; | |
97 } | |
98 | |
99 // Walker subclass used by Simplify. | |
100 // Coalesces runs of star/plus/quest/repeat of the same literal along with any | |
101 // occurrences of that literal into repeats of that literal. It also works for | |
102 // char classes, any char and any byte. | |
103 // PostVisit creates the coalesced result, which should then be simplified. | |
104 class CoalesceWalker : public Regexp::Walker<Regexp*> { | |
105 public: | |
106 CoalesceWalker() {} | |
107 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, | |
108 Regexp** child_args, int nchild_args); | |
109 virtual Regexp* Copy(Regexp* re); | |
110 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); | |
111 | |
112 private: | |
113 // These functions are declared inside CoalesceWalker so that | |
114 // they can edit the private fields of the Regexps they construct. | |
115 | |
116 // Returns true if r1 and r2 can be coalesced. In particular, ensures that | |
117 // the parse flags are consistent. (They will not be checked again later.) | |
118 static bool CanCoalesce(Regexp* r1, Regexp* r2); | |
119 | |
120 // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards | |
121 // will be empty match and the coalesced op. In other cases, where part of a | |
122 // literal string was removed to be coalesced, the array elements afterwards | |
123 // will be the coalesced op and the remainder of the literal string. | |
124 static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr); | |
125 | |
126 DISALLOW_COPY_AND_ASSIGN(CoalesceWalker); | |
127 }; | |
128 | |
129 // Walker subclass used by Simplify. | |
130 // The simplify walk is purely post-recursive: given the simplified children, | |
131 // PostVisit creates the simplified result. | |
132 // The child_args are simplified Regexp*s. | |
133 class SimplifyWalker : public Regexp::Walker<Regexp*> { | |
134 public: | |
135 SimplifyWalker() {} | |
136 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop); | |
137 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg, | |
138 Regexp** child_args, int nchild_args); | |
139 virtual Regexp* Copy(Regexp* re); | |
140 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg); | |
141 | |
142 private: | |
143 // These functions are declared inside SimplifyWalker so that | |
144 // they can edit the private fields of the Regexps they construct. | |
145 | |
146 // Creates a concatenation of two Regexp, consuming refs to re1 and re2. | |
147 // Caller must Decref return value when done with it. | |
148 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags); | |
149 | |
150 // Simplifies the expression re{min,max} in terms of *, +, and ?. | |
151 // Returns a new regexp. Does not edit re. Does not consume reference to re. | |
152 // Caller must Decref return value when done with it. | |
153 static Regexp* SimplifyRepeat(Regexp* re, int min, int max, | |
154 Regexp::ParseFlags parse_flags); | |
155 | |
156 // Simplifies a character class by expanding any named classes | |
157 // into rune ranges. Does not edit re. Does not consume ref to re. | |
158 // Caller must Decref return value when done with it. | |
159 static Regexp* SimplifyCharClass(Regexp* re); | |
160 | |
161 DISALLOW_COPY_AND_ASSIGN(SimplifyWalker); | |
162 }; | |
163 | |
164 // Simplifies a regular expression, returning a new regexp. | |
165 // The new regexp uses traditional Unix egrep features only, | |
166 // plus the Perl (?:) non-capturing parentheses. | |
167 // Otherwise, no POSIX or Perl additions. The new regexp | |
168 // captures exactly the same subexpressions (with the same indices) | |
169 // as the original. | |
170 // Does not edit current object. | |
171 // Caller must Decref() return value when done with it. | |
172 | |
173 Regexp* Regexp::Simplify() { | |
174 CoalesceWalker cw; | |
175 Regexp* cre = cw.Walk(this, NULL); | |
176 if (cre == NULL) | |
177 return cre; | |
178 SimplifyWalker sw; | |
179 Regexp* sre = sw.Walk(cre, NULL); | |
180 cre->Decref(); | |
181 return sre; | |
182 } | |
183 | |
184 #define Simplify DontCallSimplify // Avoid accidental recursion | |
185 | |
186 // Utility function for PostVisit implementations that compares re->sub() with | |
187 // child_args to determine whether any child_args changed. In the common case, | |
188 // where nothing changed, calls Decref() for all child_args and returns false, | |
189 // so PostVisit must return re->Incref(). Otherwise, returns true. | |
190 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) { | |
191 for (int i = 0; i < re->nsub(); i++) { | |
192 Regexp* sub = re->sub()[i]; | |
193 Regexp* newsub = child_args[i]; | |
194 if (newsub != sub) | |
195 return true; | |
196 } | |
197 for (int i = 0; i < re->nsub(); i++) { | |
198 Regexp* newsub = child_args[i]; | |
199 newsub->Decref(); | |
200 } | |
201 return false; | |
202 } | |
203 | |
204 Regexp* CoalesceWalker::Copy(Regexp* re) { | |
205 return re->Incref(); | |
206 } | |
207 | |
208 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { | |
209 // This should never be called, since we use Walk and not | |
210 // WalkExponential. | |
211 LOG(DFATAL) << "CoalesceWalker::ShortVisit called"; | |
212 return re->Incref(); | |
213 } | |
214 | |
215 Regexp* CoalesceWalker::PostVisit(Regexp* re, | |
216 Regexp* parent_arg, | |
217 Regexp* pre_arg, | |
218 Regexp** child_args, | |
219 int nchild_args) { | |
220 if (re->nsub() == 0) | |
221 return re->Incref(); | |
222 | |
223 if (re->op() != kRegexpConcat) { | |
224 if (!ChildArgsChanged(re, child_args)) | |
225 return re->Incref(); | |
226 | |
227 // Something changed. Build a new op. | |
228 Regexp* nre = new Regexp(re->op(), re->parse_flags()); | |
229 nre->AllocSub(re->nsub()); | |
230 Regexp** nre_subs = nre->sub(); | |
231 for (int i = 0; i < re->nsub(); i++) | |
232 nre_subs[i] = child_args[i]; | |
233 // Repeats and Captures have additional data that must be copied. | |
234 if (re->op() == kRegexpRepeat) { | |
235 nre->min_ = re->min(); | |
236 nre->max_ = re->max(); | |
237 } else if (re->op() == kRegexpCapture) { | |
238 nre->cap_ = re->cap(); | |
239 } | |
240 return nre; | |
241 } | |
242 | |
243 bool can_coalesce = false; | |
244 for (int i = 0; i < re->nsub(); i++) { | |
245 if (i+1 < re->nsub() && | |
246 CanCoalesce(child_args[i], child_args[i+1])) { | |
247 can_coalesce = true; | |
248 break; | |
249 } | |
250 } | |
251 if (!can_coalesce) { | |
252 if (!ChildArgsChanged(re, child_args)) | |
253 return re->Incref(); | |
254 | |
255 // Something changed. Build a new op. | |
256 Regexp* nre = new Regexp(re->op(), re->parse_flags()); | |
257 nre->AllocSub(re->nsub()); | |
258 Regexp** nre_subs = nre->sub(); | |
259 for (int i = 0; i < re->nsub(); i++) | |
260 nre_subs[i] = child_args[i]; | |
261 return nre; | |
262 } | |
263 | |
264 for (int i = 0; i < re->nsub(); i++) { | |
265 if (i+1 < re->nsub() && | |
266 CanCoalesce(child_args[i], child_args[i+1])) | |
267 DoCoalesce(&child_args[i], &child_args[i+1]); | |
268 } | |
269 // Determine how many empty matches were left by DoCoalesce. | |
270 int n = 0; | |
271 for (int i = n; i < re->nsub(); i++) { | |
272 if (child_args[i]->op() == kRegexpEmptyMatch) | |
273 n++; | |
274 } | |
275 // Build a new op. | |
276 Regexp* nre = new Regexp(re->op(), re->parse_flags()); | |
277 nre->AllocSub(re->nsub() - n); | |
278 Regexp** nre_subs = nre->sub(); | |
279 for (int i = 0, j = 0; i < re->nsub(); i++) { | |
280 if (child_args[i]->op() == kRegexpEmptyMatch) { | |
281 child_args[i]->Decref(); | |
282 continue; | |
283 } | |
284 nre_subs[j] = child_args[i]; | |
285 j++; | |
286 } | |
287 return nre; | |
288 } | |
289 | |
290 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) { | |
291 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or | |
292 // any byte. | |
293 if ((r1->op() == kRegexpStar || | |
294 r1->op() == kRegexpPlus || | |
295 r1->op() == kRegexpQuest || | |
296 r1->op() == kRegexpRepeat) && | |
297 (r1->sub()[0]->op() == kRegexpLiteral || | |
298 r1->sub()[0]->op() == kRegexpCharClass || | |
299 r1->sub()[0]->op() == kRegexpAnyChar || | |
300 r1->sub()[0]->op() == kRegexpAnyByte)) { | |
301 // r2 must be a star/plus/quest/repeat of the same literal, char class, | |
302 // any char or any byte. | |
303 if ((r2->op() == kRegexpStar || | |
304 r2->op() == kRegexpPlus || | |
305 r2->op() == kRegexpQuest || | |
306 r2->op() == kRegexpRepeat) && | |
307 Regexp::Equal(r1->sub()[0], r2->sub()[0]) && | |
308 // The parse flags must be consistent. | |
309 ((r1->parse_flags() & Regexp::NonGreedy) == | |
310 (r2->parse_flags() & Regexp::NonGreedy))) { | |
311 return true; | |
312 } | |
313 // ... OR an occurrence of that literal, char class, any char or any byte | |
314 if (Regexp::Equal(r1->sub()[0], r2)) { | |
315 return true; | |
316 } | |
317 // ... OR a literal string that begins with that literal. | |
318 if (r1->sub()[0]->op() == kRegexpLiteral && | |
319 r2->op() == kRegexpLiteralString && | |
320 r2->runes()[0] == r1->sub()[0]->rune() && | |
321 // The parse flags must be consistent. | |
322 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) == | |
323 (r2->parse_flags() & Regexp::FoldCase))) { | |
324 return true; | |
325 } | |
326 } | |
327 return false; | |
328 } | |
329 | |
330 void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) { | |
331 Regexp* r1 = *r1ptr; | |
332 Regexp* r2 = *r2ptr; | |
333 | |
334 Regexp* nre = Regexp::Repeat( | |
335 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0); | |
336 | |
337 switch (r1->op()) { | |
338 case kRegexpStar: | |
339 nre->min_ = 0; | |
340 nre->max_ = -1; | |
341 break; | |
342 | |
343 case kRegexpPlus: | |
344 nre->min_ = 1; | |
345 nre->max_ = -1; | |
346 break; | |
347 | |
348 case kRegexpQuest: | |
349 nre->min_ = 0; | |
350 nre->max_ = 1; | |
351 break; | |
352 | |
353 case kRegexpRepeat: | |
354 nre->min_ = r1->min(); | |
355 nre->max_ = r1->max(); | |
356 break; | |
357 | |
358 default: | |
359 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op(); | |
360 nre->Decref(); | |
361 return; | |
362 } | |
363 | |
364 switch (r2->op()) { | |
365 case kRegexpStar: | |
366 nre->max_ = -1; | |
367 goto LeaveEmpty; | |
368 | |
369 case kRegexpPlus: | |
370 nre->min_++; | |
371 nre->max_ = -1; | |
372 goto LeaveEmpty; | |
373 | |
374 case kRegexpQuest: | |
375 if (nre->max() != -1) | |
376 nre->max_++; | |
377 goto LeaveEmpty; | |
378 | |
379 case kRegexpRepeat: | |
380 nre->min_ += r2->min(); | |
381 if (r2->max() == -1) | |
382 nre->max_ = -1; | |
383 else if (nre->max() != -1) | |
384 nre->max_ += r2->max(); | |
385 goto LeaveEmpty; | |
386 | |
387 case kRegexpLiteral: | |
388 case kRegexpCharClass: | |
389 case kRegexpAnyChar: | |
390 case kRegexpAnyByte: | |
391 nre->min_++; | |
392 if (nre->max() != -1) | |
393 nre->max_++; | |
394 goto LeaveEmpty; | |
395 | |
396 LeaveEmpty: | |
397 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags); | |
398 *r2ptr = nre; | |
399 break; | |
400 | |
401 case kRegexpLiteralString: { | |
402 Rune r = r1->sub()[0]->rune(); | |
403 // Determine how much of the literal string is removed. | |
404 // We know that we have at least one rune. :) | |
405 int n = 1; | |
406 while (n < r2->nrunes() && r2->runes()[n] == r) | |
407 n++; | |
408 nre->min_ += n; | |
409 if (nre->max() != -1) | |
410 nre->max_ += n; | |
411 if (n == r2->nrunes()) | |
412 goto LeaveEmpty; | |
413 *r1ptr = nre; | |
414 *r2ptr = Regexp::LiteralString( | |
415 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags()); | |
416 break; | |
417 } | |
418 | |
419 default: | |
420 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op(); | |
421 nre->Decref(); | |
422 return; | |
423 } | |
424 | |
425 r1->Decref(); | |
426 r2->Decref(); | |
427 } | |
428 | |
429 Regexp* SimplifyWalker::Copy(Regexp* re) { | |
430 return re->Incref(); | |
431 } | |
432 | |
433 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) { | |
434 // This should never be called, since we use Walk and not | |
435 // WalkExponential. | |
436 LOG(DFATAL) << "SimplifyWalker::ShortVisit called"; | |
437 return re->Incref(); | |
438 } | |
439 | |
440 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) { | |
441 if (re->simple()) { | |
442 *stop = true; | |
443 return re->Incref(); | |
444 } | |
445 return NULL; | |
446 } | |
447 | |
448 Regexp* SimplifyWalker::PostVisit(Regexp* re, | |
449 Regexp* parent_arg, | |
450 Regexp* pre_arg, | |
451 Regexp** child_args, | |
452 int nchild_args) { | |
453 switch (re->op()) { | |
454 case kRegexpNoMatch: | |
455 case kRegexpEmptyMatch: | |
456 case kRegexpLiteral: | |
457 case kRegexpLiteralString: | |
458 case kRegexpBeginLine: | |
459 case kRegexpEndLine: | |
460 case kRegexpBeginText: | |
461 case kRegexpWordBoundary: | |
462 case kRegexpNoWordBoundary: | |
463 case kRegexpEndText: | |
464 case kRegexpAnyChar: | |
465 case kRegexpAnyByte: | |
466 case kRegexpHaveMatch: | |
467 // All these are always simple. | |
468 re->simple_ = true; | |
469 return re->Incref(); | |
470 | |
471 case kRegexpConcat: | |
472 case kRegexpAlternate: { | |
473 // These are simple as long as the subpieces are simple. | |
474 if (!ChildArgsChanged(re, child_args)) { | |
475 re->simple_ = true; | |
476 return re->Incref(); | |
477 } | |
478 Regexp* nre = new Regexp(re->op(), re->parse_flags()); | |
479 nre->AllocSub(re->nsub()); | |
480 Regexp** nre_subs = nre->sub(); | |
481 for (int i = 0; i < re->nsub(); i++) | |
482 nre_subs[i] = child_args[i]; | |
483 nre->simple_ = true; | |
484 return nre; | |
485 } | |
486 | |
487 case kRegexpCapture: { | |
488 Regexp* newsub = child_args[0]; | |
489 if (newsub == re->sub()[0]) { | |
490 newsub->Decref(); | |
491 re->simple_ = true; | |
492 return re->Incref(); | |
493 } | |
494 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags()); | |
495 nre->AllocSub(1); | |
496 nre->sub()[0] = newsub; | |
497 nre->cap_ = re->cap(); | |
498 nre->simple_ = true; | |
499 return nre; | |
500 } | |
501 | |
502 case kRegexpStar: | |
503 case kRegexpPlus: | |
504 case kRegexpQuest: { | |
505 Regexp* newsub = child_args[0]; | |
506 // Special case: repeat the empty string as much as | |
507 // you want, but it's still the empty string. | |
508 if (newsub->op() == kRegexpEmptyMatch) | |
509 return newsub; | |
510 | |
511 // These are simple as long as the subpiece is simple. | |
512 if (newsub == re->sub()[0]) { | |
513 newsub->Decref(); | |
514 re->simple_ = true; | |
515 return re->Incref(); | |
516 } | |
517 | |
518 // These are also idempotent if flags are constant. | |
519 if (re->op() == newsub->op() && | |
520 re->parse_flags() == newsub->parse_flags()) | |
521 return newsub; | |
522 | |
523 Regexp* nre = new Regexp(re->op(), re->parse_flags()); | |
524 nre->AllocSub(1); | |
525 nre->sub()[0] = newsub; | |
526 nre->simple_ = true; | |
527 return nre; | |
528 } | |
529 | |
530 case kRegexpRepeat: { | |
531 Regexp* newsub = child_args[0]; | |
532 // Special case: repeat the empty string as much as | |
533 // you want, but it's still the empty string. | |
534 if (newsub->op() == kRegexpEmptyMatch) | |
535 return newsub; | |
536 | |
537 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_, | |
538 re->parse_flags()); | |
539 newsub->Decref(); | |
540 nre->simple_ = true; | |
541 return nre; | |
542 } | |
543 | |
544 case kRegexpCharClass: { | |
545 Regexp* nre = SimplifyCharClass(re); | |
546 nre->simple_ = true; | |
547 return nre; | |
548 } | |
549 } | |
550 | |
551 LOG(ERROR) << "Simplify case not handled: " << re->op(); | |
552 return re->Incref(); | |
553 } | |
554 | |
555 // Creates a concatenation of two Regexp, consuming refs to re1 and re2. | |
556 // Returns a new Regexp, handing the ref to the caller. | |
557 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2, | |
558 Regexp::ParseFlags parse_flags) { | |
559 Regexp* re = new Regexp(kRegexpConcat, parse_flags); | |
560 re->AllocSub(2); | |
561 Regexp** subs = re->sub(); | |
562 subs[0] = re1; | |
563 subs[1] = re2; | |
564 return re; | |
565 } | |
566 | |
567 // Simplifies the expression re{min,max} in terms of *, +, and ?. | |
568 // Returns a new regexp. Does not edit re. Does not consume reference to re. | |
569 // Caller must Decref return value when done with it. | |
570 // The result will *not* necessarily have the right capturing parens | |
571 // if you call ToString() and re-parse it: (x){2} becomes (x)(x), | |
572 // but in the Regexp* representation, both (x) are marked as $1. | |
573 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max, | |
574 Regexp::ParseFlags f) { | |
575 // x{n,} means at least n matches of x. | |
576 if (max == -1) { | |
577 // Special case: x{0,} is x* | |
578 if (min == 0) | |
579 return Regexp::Star(re->Incref(), f); | |
580 | |
581 // Special case: x{1,} is x+ | |
582 if (min == 1) | |
583 return Regexp::Plus(re->Incref(), f); | |
584 | |
585 // General case: x{4,} is xxxx+ | |
586 Regexp* nre = new Regexp(kRegexpConcat, f); | |
587 nre->AllocSub(min); | |
588 Regexp** nre_subs = nre->sub(); | |
589 for (int i = 0; i < min-1; i++) | |
590 nre_subs[i] = re->Incref(); | |
591 nre_subs[min-1] = Regexp::Plus(re->Incref(), f); | |
592 return nre; | |
593 } | |
594 | |
595 // Special case: (x){0} matches only empty string. | |
596 if (min == 0 && max == 0) | |
597 return new Regexp(kRegexpEmptyMatch, f); | |
598 | |
599 // Special case: x{1} is just x. | |
600 if (min == 1 && max == 1) | |
601 return re->Incref(); | |
602 | |
603 // General case: x{n,m} means n copies of x and m copies of x?. | |
604 // The machine will do less work if we nest the final m copies, | |
605 // so that x{2,5} = xx(x(x(x)?)?)? | |
606 | |
607 // Build leading prefix: xx. Capturing only on the last one. | |
608 Regexp* nre = NULL; | |
609 if (min > 0) { | |
610 nre = new Regexp(kRegexpConcat, f); | |
611 nre->AllocSub(min); | |
612 Regexp** nre_subs = nre->sub(); | |
613 for (int i = 0; i < min; i++) | |
614 nre_subs[i] = re->Incref(); | |
615 } | |
616 | |
617 // Build and attach suffix: (x(x(x)?)?)? | |
618 if (max > min) { | |
619 Regexp* suf = Regexp::Quest(re->Incref(), f); | |
620 for (int i = min+1; i < max; i++) | |
621 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f); | |
622 if (nre == NULL) | |
623 nre = suf; | |
624 else | |
625 nre = Concat2(nre, suf, f); | |
626 } | |
627 | |
628 if (nre == NULL) { | |
629 // Some degenerate case, like min > max, or min < max < 0. | |
630 // This shouldn't happen, because the parser rejects such regexps. | |
631 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " <<
max; | |
632 return new Regexp(kRegexpNoMatch, f); | |
633 } | |
634 | |
635 return nre; | |
636 } | |
637 | |
638 // Simplifies a character class. | |
639 // Caller must Decref return value when done with it. | |
640 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) { | |
641 CharClass* cc = re->cc(); | |
642 | |
643 // Special cases | |
644 if (cc->empty()) | |
645 return new Regexp(kRegexpNoMatch, re->parse_flags()); | |
646 if (cc->full()) | |
647 return new Regexp(kRegexpAnyChar, re->parse_flags()); | |
648 | |
649 return re->Incref(); | |
650 } | |
651 | |
652 } // namespace re2 | |
OLD | NEW |