OLD | NEW |
| (Empty) |
1 // Copyright 2006 The RE2 Authors. All Rights Reserved. | |
2 // Use of this source code is governed by a BSD-style | |
3 // license that can be found in the LICENSE file. | |
4 | |
5 // Regular expression representation. | |
6 // Tested by parse_test.cc | |
7 | |
8 #include "util/util.h" | |
9 #include "re2/regexp.h" | |
10 #include "re2/stringpiece.h" | |
11 #include "re2/walker-inl.h" | |
12 | |
13 namespace re2 { | |
14 | |
15 // Constructor. Allocates vectors as appropriate for operator. | |
16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) | |
17 : op_(static_cast<uint8>(op)), | |
18 simple_(false), | |
19 parse_flags_(static_cast<uint16>(parse_flags)), | |
20 ref_(1), | |
21 nsub_(0), | |
22 down_(NULL) { | |
23 subone_ = NULL; | |
24 memset(the_union_, 0, sizeof the_union_); | |
25 } | |
26 | |
27 // Destructor. Assumes already cleaned up children. | |
28 // Private: use Decref() instead of delete to destroy Regexps. | |
29 // Can't call Decref on the sub-Regexps here because | |
30 // that could cause arbitrarily deep recursion, so | |
31 // required Decref() to have handled them for us. | |
32 Regexp::~Regexp() { | |
33 if (nsub_ > 0) | |
34 LOG(DFATAL) << "Regexp not destroyed."; | |
35 | |
36 switch (op_) { | |
37 default: | |
38 break; | |
39 case kRegexpCapture: | |
40 delete name_; | |
41 break; | |
42 case kRegexpLiteralString: | |
43 delete[] runes_; | |
44 break; | |
45 case kRegexpCharClass: | |
46 if (cc_) | |
47 cc_->Delete(); | |
48 delete ccb_; | |
49 break; | |
50 } | |
51 } | |
52 | |
53 // If it's possible to destroy this regexp without recurring, | |
54 // do so and return true. Else return false. | |
55 bool Regexp::QuickDestroy() { | |
56 if (nsub_ == 0) { | |
57 delete this; | |
58 return true; | |
59 } | |
60 return false; | |
61 } | |
62 | |
63 static map<Regexp*, int> *ref_map; | |
64 GLOBAL_MUTEX(ref_mutex); | |
65 | |
66 int Regexp::Ref() { | |
67 if (ref_ < kMaxRef) | |
68 return ref_; | |
69 | |
70 GLOBAL_MUTEX_LOCK(ref_mutex); | |
71 int r = 0; | |
72 if (ref_map != NULL) { | |
73 r = (*ref_map)[this]; | |
74 } | |
75 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
76 return r; | |
77 } | |
78 | |
79 // Increments reference count, returns object as convenience. | |
80 Regexp* Regexp::Incref() { | |
81 if (ref_ >= kMaxRef-1) { | |
82 // Store ref count in overflow map. | |
83 GLOBAL_MUTEX_LOCK(ref_mutex); | |
84 if (ref_map == NULL) { | |
85 ref_map = new map<Regexp*, int>; | |
86 } | |
87 if (ref_ == kMaxRef) { | |
88 // already overflowed | |
89 (*ref_map)[this]++; | |
90 } else { | |
91 // overflowing now | |
92 (*ref_map)[this] = kMaxRef; | |
93 ref_ = kMaxRef; | |
94 } | |
95 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
96 return this; | |
97 } | |
98 | |
99 ref_++; | |
100 return this; | |
101 } | |
102 | |
103 // Decrements reference count and deletes this object if count reaches 0. | |
104 void Regexp::Decref() { | |
105 if (ref_ == kMaxRef) { | |
106 // Ref count is stored in overflow map. | |
107 GLOBAL_MUTEX_LOCK(ref_mutex); | |
108 int r = (*ref_map)[this] - 1; | |
109 if (r < kMaxRef) { | |
110 ref_ = static_cast<uint16>(r); | |
111 ref_map->erase(this); | |
112 } else { | |
113 (*ref_map)[this] = r; | |
114 } | |
115 GLOBAL_MUTEX_UNLOCK(ref_mutex); | |
116 return; | |
117 } | |
118 ref_--; | |
119 if (ref_ == 0) | |
120 Destroy(); | |
121 } | |
122 | |
123 // Deletes this object; ref count has count reached 0. | |
124 void Regexp::Destroy() { | |
125 if (QuickDestroy()) | |
126 return; | |
127 | |
128 // Handle recursive Destroy with explicit stack | |
129 // to avoid arbitrarily deep recursion on process stack [sigh]. | |
130 down_ = NULL; | |
131 Regexp* stack = this; | |
132 while (stack != NULL) { | |
133 Regexp* re = stack; | |
134 stack = re->down_; | |
135 if (re->ref_ != 0) | |
136 LOG(DFATAL) << "Bad reference count " << re->ref_; | |
137 if (re->nsub_ > 0) { | |
138 Regexp** subs = re->sub(); | |
139 for (int i = 0; i < re->nsub_; i++) { | |
140 Regexp* sub = subs[i]; | |
141 if (sub == NULL) | |
142 continue; | |
143 if (sub->ref_ == kMaxRef) | |
144 sub->Decref(); | |
145 else | |
146 --sub->ref_; | |
147 if (sub->ref_ == 0 && !sub->QuickDestroy()) { | |
148 sub->down_ = stack; | |
149 stack = sub; | |
150 } | |
151 } | |
152 if (re->nsub_ > 1) | |
153 delete[] subs; | |
154 re->nsub_ = 0; | |
155 } | |
156 delete re; | |
157 } | |
158 } | |
159 | |
160 void Regexp::AddRuneToString(Rune r) { | |
161 DCHECK(op_ == kRegexpLiteralString); | |
162 if (nrunes_ == 0) { | |
163 // start with 8 | |
164 runes_ = new Rune[8]; | |
165 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { | |
166 // double on powers of two | |
167 Rune *old = runes_; | |
168 runes_ = new Rune[nrunes_ * 2]; | |
169 for (int i = 0; i < nrunes_; i++) | |
170 runes_[i] = old[i]; | |
171 delete[] old; | |
172 } | |
173 | |
174 runes_[nrunes_++] = r; | |
175 } | |
176 | |
177 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { | |
178 Regexp* re = new Regexp(kRegexpHaveMatch, flags); | |
179 re->match_id_ = match_id; | |
180 return re; | |
181 } | |
182 | |
183 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { | |
184 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags) | |
185 return sub; | |
186 Regexp* re = new Regexp(kRegexpPlus, flags); | |
187 re->AllocSub(1); | |
188 re->sub()[0] = sub; | |
189 return re; | |
190 } | |
191 | |
192 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { | |
193 if (sub->op() == kRegexpStar && sub->parse_flags() == flags) | |
194 return sub; | |
195 Regexp* re = new Regexp(kRegexpStar, flags); | |
196 re->AllocSub(1); | |
197 re->sub()[0] = sub; | |
198 return re; | |
199 } | |
200 | |
201 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { | |
202 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags) | |
203 return sub; | |
204 Regexp* re = new Regexp(kRegexpQuest, flags); | |
205 re->AllocSub(1); | |
206 re->sub()[0] = sub; | |
207 return re; | |
208 } | |
209 | |
210 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, | |
211 ParseFlags flags, bool can_factor) { | |
212 if (nsub == 1) | |
213 return sub[0]; | |
214 | |
215 if (nsub == 0) { | |
216 if (op == kRegexpAlternate) | |
217 return new Regexp(kRegexpNoMatch, flags); | |
218 else | |
219 return new Regexp(kRegexpEmptyMatch, flags); | |
220 } | |
221 | |
222 Regexp** subcopy = NULL; | |
223 if (op == kRegexpAlternate && can_factor) { | |
224 // Going to edit sub; make a copy so we don't step on caller. | |
225 subcopy = new Regexp*[nsub]; | |
226 memmove(subcopy, sub, nsub * sizeof sub[0]); | |
227 sub = subcopy; | |
228 nsub = FactorAlternation(sub, nsub, flags); | |
229 if (nsub == 1) { | |
230 Regexp* re = sub[0]; | |
231 delete[] subcopy; | |
232 return re; | |
233 } | |
234 } | |
235 | |
236 if (nsub > kMaxNsub) { | |
237 // Too many subexpressions to fit in a single Regexp. | |
238 // Make a two-level tree. Two levels gets us to 65535^2. | |
239 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; | |
240 Regexp* re = new Regexp(op, flags); | |
241 re->AllocSub(nbigsub); | |
242 Regexp** subs = re->sub(); | |
243 for (int i = 0; i < nbigsub - 1; i++) | |
244 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); | |
245 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, | |
246 nsub - (nbigsub-1)*kMaxNsub, flags, | |
247 false); | |
248 delete[] subcopy; | |
249 return re; | |
250 } | |
251 | |
252 Regexp* re = new Regexp(op, flags); | |
253 re->AllocSub(nsub); | |
254 Regexp** subs = re->sub(); | |
255 for (int i = 0; i < nsub; i++) | |
256 subs[i] = sub[i]; | |
257 | |
258 delete[] subcopy; | |
259 return re; | |
260 } | |
261 | |
262 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { | |
263 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); | |
264 } | |
265 | |
266 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { | |
267 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); | |
268 } | |
269 | |
270 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { | |
271 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); | |
272 } | |
273 | |
274 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { | |
275 Regexp* re = new Regexp(kRegexpCapture, flags); | |
276 re->AllocSub(1); | |
277 re->sub()[0] = sub; | |
278 re->cap_ = cap; | |
279 return re; | |
280 } | |
281 | |
282 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { | |
283 Regexp* re = new Regexp(kRegexpRepeat, flags); | |
284 re->AllocSub(1); | |
285 re->sub()[0] = sub; | |
286 re->min_ = min; | |
287 re->max_ = max; | |
288 return re; | |
289 } | |
290 | |
291 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { | |
292 Regexp* re = new Regexp(kRegexpLiteral, flags); | |
293 re->rune_ = rune; | |
294 return re; | |
295 } | |
296 | |
297 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { | |
298 if (nrunes <= 0) | |
299 return new Regexp(kRegexpEmptyMatch, flags); | |
300 if (nrunes == 1) | |
301 return NewLiteral(runes[0], flags); | |
302 Regexp* re = new Regexp(kRegexpLiteralString, flags); | |
303 for (int i = 0; i < nrunes; i++) | |
304 re->AddRuneToString(runes[i]); | |
305 return re; | |
306 } | |
307 | |
308 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { | |
309 Regexp* re = new Regexp(kRegexpCharClass, flags); | |
310 re->cc_ = cc; | |
311 return re; | |
312 } | |
313 | |
314 // Swaps this and that in place. | |
315 void Regexp::Swap(Regexp* that) { | |
316 // Can use memmove because Regexp is just a struct (no vtable). | |
317 char tmp[sizeof *this]; | |
318 memmove(tmp, this, sizeof tmp); | |
319 memmove(this, that, sizeof tmp); | |
320 memmove(that, tmp, sizeof tmp); | |
321 } | |
322 | |
323 // Tests equality of all top-level structure but not subregexps. | |
324 static bool TopEqual(Regexp* a, Regexp* b) { | |
325 if (a->op() != b->op()) | |
326 return false; | |
327 | |
328 switch (a->op()) { | |
329 case kRegexpNoMatch: | |
330 case kRegexpEmptyMatch: | |
331 case kRegexpAnyChar: | |
332 case kRegexpAnyByte: | |
333 case kRegexpBeginLine: | |
334 case kRegexpEndLine: | |
335 case kRegexpWordBoundary: | |
336 case kRegexpNoWordBoundary: | |
337 case kRegexpBeginText: | |
338 return true; | |
339 | |
340 case kRegexpEndText: | |
341 // The parse flags remember whether it's \z or (?-m:$), | |
342 // which matters when testing against PCRE. | |
343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; | |
344 | |
345 case kRegexpLiteral: | |
346 return a->rune() == b->rune() && | |
347 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; | |
348 | |
349 case kRegexpLiteralString: | |
350 return a->nrunes() == b->nrunes() && | |
351 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && | |
352 memcmp(a->runes(), b->runes(), | |
353 a->nrunes() * sizeof a->runes()[0]) == 0; | |
354 | |
355 case kRegexpAlternate: | |
356 case kRegexpConcat: | |
357 return a->nsub() == b->nsub(); | |
358 | |
359 case kRegexpStar: | |
360 case kRegexpPlus: | |
361 case kRegexpQuest: | |
362 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; | |
363 | |
364 case kRegexpRepeat: | |
365 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && | |
366 a->min() == b->min() && | |
367 a->max() == b->max(); | |
368 | |
369 case kRegexpCapture: | |
370 return a->cap() == b->cap() && a->name() == b->name(); | |
371 | |
372 case kRegexpHaveMatch: | |
373 return a->match_id() == b->match_id(); | |
374 | |
375 case kRegexpCharClass: { | |
376 CharClass* acc = a->cc(); | |
377 CharClass* bcc = b->cc(); | |
378 return acc->size() == bcc->size() && | |
379 acc->end() - acc->begin() == bcc->end() - bcc->begin() && | |
380 memcmp(acc->begin(), bcc->begin(), | |
381 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; | |
382 } | |
383 } | |
384 | |
385 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); | |
386 return 0; | |
387 } | |
388 | |
389 bool Regexp::Equal(Regexp* a, Regexp* b) { | |
390 if (a == NULL || b == NULL) | |
391 return a == b; | |
392 | |
393 if (!TopEqual(a, b)) | |
394 return false; | |
395 | |
396 // Fast path: | |
397 // return without allocating vector if there are no subregexps. | |
398 switch (a->op()) { | |
399 case kRegexpAlternate: | |
400 case kRegexpConcat: | |
401 case kRegexpStar: | |
402 case kRegexpPlus: | |
403 case kRegexpQuest: | |
404 case kRegexpRepeat: | |
405 case kRegexpCapture: | |
406 break; | |
407 | |
408 default: | |
409 return true; | |
410 } | |
411 | |
412 // Committed to doing real work. | |
413 // The stack (vector) has pairs of regexps waiting to | |
414 // be compared. The regexps are only equal if | |
415 // all the pairs end up being equal. | |
416 vector<Regexp*> stk; | |
417 | |
418 for (;;) { | |
419 // Invariant: TopEqual(a, b) == true. | |
420 Regexp* a2; | |
421 Regexp* b2; | |
422 switch (a->op()) { | |
423 default: | |
424 break; | |
425 case kRegexpAlternate: | |
426 case kRegexpConcat: | |
427 for (int i = 0; i < a->nsub(); i++) { | |
428 a2 = a->sub()[i]; | |
429 b2 = b->sub()[i]; | |
430 if (!TopEqual(a2, b2)) | |
431 return false; | |
432 stk.push_back(a2); | |
433 stk.push_back(b2); | |
434 } | |
435 break; | |
436 | |
437 case kRegexpStar: | |
438 case kRegexpPlus: | |
439 case kRegexpQuest: | |
440 case kRegexpRepeat: | |
441 case kRegexpCapture: | |
442 a2 = a->sub()[0]; | |
443 b2 = b->sub()[0]; | |
444 if (!TopEqual(a2, b2)) | |
445 return false; | |
446 // Really: | |
447 // stk.push_back(a2); | |
448 // stk.push_back(b2); | |
449 // break; | |
450 // but faster to assign directly and loop. | |
451 a = a2; | |
452 b = b2; | |
453 continue; | |
454 } | |
455 | |
456 size_t n = stk.size(); | |
457 if (n == 0) | |
458 break; | |
459 | |
460 DCHECK_GE(n, 2); | |
461 a = stk[n-2]; | |
462 b = stk[n-1]; | |
463 stk.resize(n-2); | |
464 } | |
465 | |
466 return true; | |
467 } | |
468 | |
469 // Keep in sync with enum RegexpStatusCode in regexp.h | |
470 static const char *kErrorStrings[] = { | |
471 "no error", | |
472 "unexpected error", | |
473 "invalid escape sequence", | |
474 "invalid character class", | |
475 "invalid character class range", | |
476 "missing ]", | |
477 "missing )", | |
478 "trailing \\", | |
479 "no argument for repetition operator", | |
480 "invalid repetition size", | |
481 "bad repetition operator", | |
482 "invalid perl operator", | |
483 "invalid UTF-8", | |
484 "invalid named capture group", | |
485 }; | |
486 | |
487 string RegexpStatus::CodeText(enum RegexpStatusCode code) { | |
488 if (code < 0 || code >= arraysize(kErrorStrings)) | |
489 code = kRegexpInternalError; | |
490 return kErrorStrings[code]; | |
491 } | |
492 | |
493 string RegexpStatus::Text() const { | |
494 if (error_arg_.empty()) | |
495 return CodeText(code_); | |
496 string s; | |
497 s.append(CodeText(code_)); | |
498 s.append(": "); | |
499 s.append(error_arg_.data(), error_arg_.size()); | |
500 return s; | |
501 } | |
502 | |
503 void RegexpStatus::Copy(const RegexpStatus& status) { | |
504 code_ = status.code_; | |
505 error_arg_ = status.error_arg_; | |
506 } | |
507 | |
508 typedef int Ignored; // Walker<void> doesn't exist | |
509 | |
510 // Walker subclass to count capturing parens in regexp. | |
511 class NumCapturesWalker : public Regexp::Walker<Ignored> { | |
512 public: | |
513 NumCapturesWalker() : ncapture_(0) {} | |
514 int ncapture() { return ncapture_; } | |
515 | |
516 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
517 if (re->op() == kRegexpCapture) | |
518 ncapture_++; | |
519 return ignored; | |
520 } | |
521 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
522 // Should never be called: we use Walk not WalkExponential. | |
523 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; | |
524 return ignored; | |
525 } | |
526 | |
527 private: | |
528 int ncapture_; | |
529 DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker); | |
530 }; | |
531 | |
532 int Regexp::NumCaptures() { | |
533 NumCapturesWalker w; | |
534 w.Walk(this, 0); | |
535 return w.ncapture(); | |
536 } | |
537 | |
538 // Walker class to build map of named capture groups and their indices. | |
539 class NamedCapturesWalker : public Regexp::Walker<Ignored> { | |
540 public: | |
541 NamedCapturesWalker() : map_(NULL) {} | |
542 ~NamedCapturesWalker() { delete map_; } | |
543 | |
544 map<string, int>* TakeMap() { | |
545 map<string, int>* m = map_; | |
546 map_ = NULL; | |
547 return m; | |
548 } | |
549 | |
550 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
551 if (re->op() == kRegexpCapture && re->name() != NULL) { | |
552 // Allocate map once we find a name. | |
553 if (map_ == NULL) | |
554 map_ = new map<string, int>; | |
555 | |
556 // Record first occurrence of each name. | |
557 // (The rule is that if you have the same name | |
558 // multiple times, only the leftmost one counts.) | |
559 if (map_->find(*re->name()) == map_->end()) | |
560 (*map_)[*re->name()] = re->cap(); | |
561 } | |
562 return ignored; | |
563 } | |
564 | |
565 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
566 // Should never be called: we use Walk not WalkExponential. | |
567 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; | |
568 return ignored; | |
569 } | |
570 | |
571 private: | |
572 map<string, int>* map_; | |
573 DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker); | |
574 }; | |
575 | |
576 map<string, int>* Regexp::NamedCaptures() { | |
577 NamedCapturesWalker w; | |
578 w.Walk(this, 0); | |
579 return w.TakeMap(); | |
580 } | |
581 | |
582 // Walker class to build map from capture group indices to their names. | |
583 class CaptureNamesWalker : public Regexp::Walker<Ignored> { | |
584 public: | |
585 CaptureNamesWalker() : map_(NULL) {} | |
586 ~CaptureNamesWalker() { delete map_; } | |
587 | |
588 map<int, string>* TakeMap() { | |
589 map<int, string>* m = map_; | |
590 map_ = NULL; | |
591 return m; | |
592 } | |
593 | |
594 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { | |
595 if (re->op() == kRegexpCapture && re->name() != NULL) { | |
596 // Allocate map once we find a name. | |
597 if (map_ == NULL) | |
598 map_ = new map<int, string>; | |
599 | |
600 (*map_)[re->cap()] = *re->name(); | |
601 } | |
602 return ignored; | |
603 } | |
604 | |
605 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { | |
606 // Should never be called: we use Walk not WalkExponential. | |
607 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; | |
608 return ignored; | |
609 } | |
610 | |
611 private: | |
612 map<int, string>* map_; | |
613 DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker); | |
614 }; | |
615 | |
616 map<int, string>* Regexp::CaptureNames() { | |
617 CaptureNamesWalker w; | |
618 w.Walk(this, 0); | |
619 return w.TakeMap(); | |
620 } | |
621 | |
622 // Determines whether regexp matches must be anchored | |
623 // with a fixed string prefix. If so, returns the prefix and | |
624 // the regexp that remains after the prefix. The prefix might | |
625 // be ASCII case-insensitive. | |
626 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) { | |
627 // No need for a walker: the regexp must be of the form | |
628 // 1. some number of ^ anchors | |
629 // 2. a literal char or string | |
630 // 3. the rest | |
631 prefix->clear(); | |
632 *foldcase = false; | |
633 *suffix = NULL; | |
634 if (op_ != kRegexpConcat) | |
635 return false; | |
636 | |
637 // Some number of anchors, then a literal or concatenation. | |
638 int i = 0; | |
639 Regexp** sub = this->sub(); | |
640 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText) | |
641 i++; | |
642 if (i == 0 || i >= nsub_) | |
643 return false; | |
644 | |
645 Regexp* re = sub[i]; | |
646 switch (re->op_) { | |
647 default: | |
648 return false; | |
649 | |
650 case kRegexpLiteralString: | |
651 // Convert to string in proper encoding. | |
652 if (re->parse_flags() & Latin1) { | |
653 prefix->resize(re->nrunes_); | |
654 for (int j = 0; j < re->nrunes_; j++) | |
655 (*prefix)[j] = static_cast<char>(re->runes_[j]); | |
656 } else { | |
657 // Convert to UTF-8 in place. | |
658 // Assume worst-case space and then trim. | |
659 prefix->resize(re->nrunes_ * UTFmax); | |
660 char *p = &(*prefix)[0]; | |
661 for (int j = 0; j < re->nrunes_; j++) { | |
662 Rune r = re->runes_[j]; | |
663 if (r < Runeself) | |
664 *p++ = static_cast<char>(r); | |
665 else | |
666 p += runetochar(p, &r); | |
667 } | |
668 prefix->resize(p - &(*prefix)[0]); | |
669 } | |
670 break; | |
671 | |
672 case kRegexpLiteral: | |
673 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) { | |
674 prefix->append(1, static_cast<char>(re->rune_)); | |
675 } else { | |
676 char buf[UTFmax]; | |
677 prefix->append(buf, runetochar(buf, &re->rune_)); | |
678 } | |
679 break; | |
680 } | |
681 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0; | |
682 i++; | |
683 | |
684 // The rest. | |
685 if (i < nsub_) { | |
686 for (int j = i; j < nsub_; j++) | |
687 sub[j]->Incref(); | |
688 re = Concat(sub + i, nsub_ - i, parse_flags()); | |
689 } else { | |
690 re = new Regexp(kRegexpEmptyMatch, parse_flags()); | |
691 } | |
692 *suffix = re; | |
693 return true; | |
694 } | |
695 | |
696 // Character class builder is a balanced binary tree (STL set) | |
697 // containing non-overlapping, non-abutting RuneRanges. | |
698 // The less-than operator used in the tree treats two | |
699 // ranges as equal if they overlap at all, so that | |
700 // lookups for a particular Rune are possible. | |
701 | |
702 CharClassBuilder::CharClassBuilder() { | |
703 nrunes_ = 0; | |
704 upper_ = 0; | |
705 lower_ = 0; | |
706 } | |
707 | |
708 // Add lo-hi to the class; return whether class got bigger. | |
709 bool CharClassBuilder::AddRange(Rune lo, Rune hi) { | |
710 if (hi < lo) | |
711 return false; | |
712 | |
713 if (lo <= 'z' && hi >= 'A') { | |
714 // Overlaps some alpha, maybe not all. | |
715 // Update bitmaps telling which ASCII letters are in the set. | |
716 Rune lo1 = max<Rune>(lo, 'A'); | |
717 Rune hi1 = min<Rune>(hi, 'Z'); | |
718 if (lo1 <= hi1) | |
719 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); | |
720 | |
721 lo1 = max<Rune>(lo, 'a'); | |
722 hi1 = min<Rune>(hi, 'z'); | |
723 if (lo1 <= hi1) | |
724 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); | |
725 } | |
726 | |
727 { // Check whether lo, hi is already in the class. | |
728 iterator it = ranges_.find(RuneRange(lo, lo)); | |
729 if (it != end() && it->lo <= lo && hi <= it->hi) | |
730 return false; | |
731 } | |
732 | |
733 // Look for a range abutting lo on the left. | |
734 // If it exists, take it out and increase our range. | |
735 if (lo > 0) { | |
736 iterator it = ranges_.find(RuneRange(lo-1, lo-1)); | |
737 if (it != end()) { | |
738 lo = it->lo; | |
739 if (it->hi > hi) | |
740 hi = it->hi; | |
741 nrunes_ -= it->hi - it->lo + 1; | |
742 ranges_.erase(it); | |
743 } | |
744 } | |
745 | |
746 // Look for a range abutting hi on the right. | |
747 // If it exists, take it out and increase our range. | |
748 if (hi < Runemax) { | |
749 iterator it = ranges_.find(RuneRange(hi+1, hi+1)); | |
750 if (it != end()) { | |
751 hi = it->hi; | |
752 nrunes_ -= it->hi - it->lo + 1; | |
753 ranges_.erase(it); | |
754 } | |
755 } | |
756 | |
757 // Look for ranges between lo and hi. Take them out. | |
758 // This is only safe because the set has no overlapping ranges. | |
759 // We've already removed any ranges abutting lo and hi, so | |
760 // any that overlap [lo, hi] must be contained within it. | |
761 for (;;) { | |
762 iterator it = ranges_.find(RuneRange(lo, hi)); | |
763 if (it == end()) | |
764 break; | |
765 nrunes_ -= it->hi - it->lo + 1; | |
766 ranges_.erase(it); | |
767 } | |
768 | |
769 // Finally, add [lo, hi]. | |
770 nrunes_ += hi - lo + 1; | |
771 ranges_.insert(RuneRange(lo, hi)); | |
772 return true; | |
773 } | |
774 | |
775 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { | |
776 for (iterator it = cc->begin(); it != cc->end(); ++it) | |
777 AddRange(it->lo, it->hi); | |
778 } | |
779 | |
780 bool CharClassBuilder::Contains(Rune r) { | |
781 return ranges_.find(RuneRange(r, r)) != end(); | |
782 } | |
783 | |
784 // Does the character class behave the same on A-Z as on a-z? | |
785 bool CharClassBuilder::FoldsASCII() { | |
786 return ((upper_ ^ lower_) & AlphaMask) == 0; | |
787 } | |
788 | |
789 CharClassBuilder* CharClassBuilder::Copy() { | |
790 CharClassBuilder* cc = new CharClassBuilder; | |
791 for (iterator it = begin(); it != end(); ++it) | |
792 cc->ranges_.insert(RuneRange(it->lo, it->hi)); | |
793 cc->upper_ = upper_; | |
794 cc->lower_ = lower_; | |
795 cc->nrunes_ = nrunes_; | |
796 return cc; | |
797 } | |
798 | |
799 | |
800 | |
801 void CharClassBuilder::RemoveAbove(Rune r) { | |
802 if (r >= Runemax) | |
803 return; | |
804 | |
805 if (r < 'z') { | |
806 if (r < 'a') | |
807 lower_ = 0; | |
808 else | |
809 lower_ &= AlphaMask >> ('z' - r); | |
810 } | |
811 | |
812 if (r < 'Z') { | |
813 if (r < 'A') | |
814 upper_ = 0; | |
815 else | |
816 upper_ &= AlphaMask >> ('Z' - r); | |
817 } | |
818 | |
819 for (;;) { | |
820 | |
821 iterator it = ranges_.find(RuneRange(r + 1, Runemax)); | |
822 if (it == end()) | |
823 break; | |
824 RuneRange rr = *it; | |
825 ranges_.erase(it); | |
826 nrunes_ -= rr.hi - rr.lo + 1; | |
827 if (rr.lo <= r) { | |
828 rr.hi = r; | |
829 ranges_.insert(rr); | |
830 nrunes_ += rr.hi - rr.lo + 1; | |
831 } | |
832 } | |
833 } | |
834 | |
835 void CharClassBuilder::Negate() { | |
836 // Build up negation and then copy in. | |
837 // Could edit ranges in place, but C++ won't let me. | |
838 vector<RuneRange> v; | |
839 v.reserve(ranges_.size() + 1); | |
840 | |
841 // In negation, first range begins at 0, unless | |
842 // the current class begins at 0. | |
843 iterator it = begin(); | |
844 if (it == end()) { | |
845 v.push_back(RuneRange(0, Runemax)); | |
846 } else { | |
847 int nextlo = 0; | |
848 if (it->lo == 0) { | |
849 nextlo = it->hi + 1; | |
850 ++it; | |
851 } | |
852 for (; it != end(); ++it) { | |
853 v.push_back(RuneRange(nextlo, it->lo - 1)); | |
854 nextlo = it->hi + 1; | |
855 } | |
856 if (nextlo <= Runemax) | |
857 v.push_back(RuneRange(nextlo, Runemax)); | |
858 } | |
859 | |
860 ranges_.clear(); | |
861 for (size_t i = 0; i < v.size(); i++) | |
862 ranges_.insert(v[i]); | |
863 | |
864 upper_ = AlphaMask & ~upper_; | |
865 lower_ = AlphaMask & ~lower_; | |
866 nrunes_ = Runemax+1 - nrunes_; | |
867 } | |
868 | |
869 // Character class is a sorted list of ranges. | |
870 // The ranges are allocated in the same block as the header, | |
871 // necessitating a special allocator and Delete method. | |
872 | |
873 CharClass* CharClass::New(int maxranges) { | |
874 CharClass* cc; | |
875 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; | |
876 cc = reinterpret_cast<CharClass*>(data); | |
877 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); | |
878 cc->nranges_ = 0; | |
879 cc->folds_ascii_ = false; | |
880 cc->nrunes_ = 0; | |
881 return cc; | |
882 } | |
883 | |
884 void CharClass::Delete() { | |
885 uint8 *data = reinterpret_cast<uint8*>(this); | |
886 delete[] data; | |
887 } | |
888 | |
889 CharClass* CharClass::Negate() { | |
890 CharClass* cc = CharClass::New(nranges_+1); | |
891 cc->folds_ascii_ = folds_ascii_; | |
892 cc->nrunes_ = Runemax + 1 - nrunes_; | |
893 int n = 0; | |
894 int nextlo = 0; | |
895 for (CharClass::iterator it = begin(); it != end(); ++it) { | |
896 if (it->lo == nextlo) { | |
897 nextlo = it->hi + 1; | |
898 } else { | |
899 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); | |
900 nextlo = it->hi + 1; | |
901 } | |
902 } | |
903 if (nextlo <= Runemax) | |
904 cc->ranges_[n++] = RuneRange(nextlo, Runemax); | |
905 cc->nranges_ = n; | |
906 return cc; | |
907 } | |
908 | |
909 bool CharClass::Contains(Rune r) { | |
910 RuneRange* rr = ranges_; | |
911 int n = nranges_; | |
912 while (n > 0) { | |
913 int m = n/2; | |
914 if (rr[m].hi < r) { | |
915 rr += m+1; | |
916 n -= m+1; | |
917 } else if (r < rr[m].lo) { | |
918 n = m; | |
919 } else { // rr[m].lo <= r && r <= rr[m].hi | |
920 return true; | |
921 } | |
922 } | |
923 return false; | |
924 } | |
925 | |
926 CharClass* CharClassBuilder::GetCharClass() { | |
927 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size())); | |
928 int n = 0; | |
929 for (iterator it = begin(); it != end(); ++it) | |
930 cc->ranges_[n++] = *it; | |
931 cc->nranges_ = n; | |
932 DCHECK_LE(n, static_cast<int>(ranges_.size())); | |
933 cc->nrunes_ = nrunes_; | |
934 cc->folds_ascii_ = FoldsASCII(); | |
935 return cc; | |
936 } | |
937 | |
938 } // namespace re2 | |
OLD | NEW |