Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(360)

Side by Side Diff: third_party/re2/re2/regexp.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master
Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « third_party/re2/re2/regexp.h ('k') | third_party/re2/re2/set.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Regular expression representation.
6 // Tested by parse_test.cc
7
8 #include "util/util.h"
9 #include "re2/regexp.h"
10 #include "re2/stringpiece.h"
11 #include "re2/walker-inl.h"
12
13 namespace re2 {
14
15 // Constructor. Allocates vectors as appropriate for operator.
16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)
17 : op_(static_cast<uint8>(op)),
18 simple_(false),
19 parse_flags_(static_cast<uint16>(parse_flags)),
20 ref_(1),
21 nsub_(0),
22 down_(NULL) {
23 subone_ = NULL;
24 memset(the_union_, 0, sizeof the_union_);
25 }
26
27 // Destructor. Assumes already cleaned up children.
28 // Private: use Decref() instead of delete to destroy Regexps.
29 // Can't call Decref on the sub-Regexps here because
30 // that could cause arbitrarily deep recursion, so
31 // required Decref() to have handled them for us.
32 Regexp::~Regexp() {
33 if (nsub_ > 0)
34 LOG(DFATAL) << "Regexp not destroyed.";
35
36 switch (op_) {
37 default:
38 break;
39 case kRegexpCapture:
40 delete name_;
41 break;
42 case kRegexpLiteralString:
43 delete[] runes_;
44 break;
45 case kRegexpCharClass:
46 if (cc_)
47 cc_->Delete();
48 delete ccb_;
49 break;
50 }
51 }
52
53 // If it's possible to destroy this regexp without recurring,
54 // do so and return true. Else return false.
55 bool Regexp::QuickDestroy() {
56 if (nsub_ == 0) {
57 delete this;
58 return true;
59 }
60 return false;
61 }
62
63 static map<Regexp*, int> *ref_map;
64 GLOBAL_MUTEX(ref_mutex);
65
66 int Regexp::Ref() {
67 if (ref_ < kMaxRef)
68 return ref_;
69
70 GLOBAL_MUTEX_LOCK(ref_mutex);
71 int r = 0;
72 if (ref_map != NULL) {
73 r = (*ref_map)[this];
74 }
75 GLOBAL_MUTEX_UNLOCK(ref_mutex);
76 return r;
77 }
78
79 // Increments reference count, returns object as convenience.
80 Regexp* Regexp::Incref() {
81 if (ref_ >= kMaxRef-1) {
82 // Store ref count in overflow map.
83 GLOBAL_MUTEX_LOCK(ref_mutex);
84 if (ref_map == NULL) {
85 ref_map = new map<Regexp*, int>;
86 }
87 if (ref_ == kMaxRef) {
88 // already overflowed
89 (*ref_map)[this]++;
90 } else {
91 // overflowing now
92 (*ref_map)[this] = kMaxRef;
93 ref_ = kMaxRef;
94 }
95 GLOBAL_MUTEX_UNLOCK(ref_mutex);
96 return this;
97 }
98
99 ref_++;
100 return this;
101 }
102
103 // Decrements reference count and deletes this object if count reaches 0.
104 void Regexp::Decref() {
105 if (ref_ == kMaxRef) {
106 // Ref count is stored in overflow map.
107 GLOBAL_MUTEX_LOCK(ref_mutex);
108 int r = (*ref_map)[this] - 1;
109 if (r < kMaxRef) {
110 ref_ = static_cast<uint16>(r);
111 ref_map->erase(this);
112 } else {
113 (*ref_map)[this] = r;
114 }
115 GLOBAL_MUTEX_UNLOCK(ref_mutex);
116 return;
117 }
118 ref_--;
119 if (ref_ == 0)
120 Destroy();
121 }
122
123 // Deletes this object; ref count has count reached 0.
124 void Regexp::Destroy() {
125 if (QuickDestroy())
126 return;
127
128 // Handle recursive Destroy with explicit stack
129 // to avoid arbitrarily deep recursion on process stack [sigh].
130 down_ = NULL;
131 Regexp* stack = this;
132 while (stack != NULL) {
133 Regexp* re = stack;
134 stack = re->down_;
135 if (re->ref_ != 0)
136 LOG(DFATAL) << "Bad reference count " << re->ref_;
137 if (re->nsub_ > 0) {
138 Regexp** subs = re->sub();
139 for (int i = 0; i < re->nsub_; i++) {
140 Regexp* sub = subs[i];
141 if (sub == NULL)
142 continue;
143 if (sub->ref_ == kMaxRef)
144 sub->Decref();
145 else
146 --sub->ref_;
147 if (sub->ref_ == 0 && !sub->QuickDestroy()) {
148 sub->down_ = stack;
149 stack = sub;
150 }
151 }
152 if (re->nsub_ > 1)
153 delete[] subs;
154 re->nsub_ = 0;
155 }
156 delete re;
157 }
158 }
159
160 void Regexp::AddRuneToString(Rune r) {
161 DCHECK(op_ == kRegexpLiteralString);
162 if (nrunes_ == 0) {
163 // start with 8
164 runes_ = new Rune[8];
165 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {
166 // double on powers of two
167 Rune *old = runes_;
168 runes_ = new Rune[nrunes_ * 2];
169 for (int i = 0; i < nrunes_; i++)
170 runes_[i] = old[i];
171 delete[] old;
172 }
173
174 runes_[nrunes_++] = r;
175 }
176
177 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {
178 Regexp* re = new Regexp(kRegexpHaveMatch, flags);
179 re->match_id_ = match_id;
180 return re;
181 }
182
183 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {
184 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)
185 return sub;
186 Regexp* re = new Regexp(kRegexpPlus, flags);
187 re->AllocSub(1);
188 re->sub()[0] = sub;
189 return re;
190 }
191
192 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {
193 if (sub->op() == kRegexpStar && sub->parse_flags() == flags)
194 return sub;
195 Regexp* re = new Regexp(kRegexpStar, flags);
196 re->AllocSub(1);
197 re->sub()[0] = sub;
198 return re;
199 }
200
201 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {
202 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)
203 return sub;
204 Regexp* re = new Regexp(kRegexpQuest, flags);
205 re->AllocSub(1);
206 re->sub()[0] = sub;
207 return re;
208 }
209
210 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,
211 ParseFlags flags, bool can_factor) {
212 if (nsub == 1)
213 return sub[0];
214
215 if (nsub == 0) {
216 if (op == kRegexpAlternate)
217 return new Regexp(kRegexpNoMatch, flags);
218 else
219 return new Regexp(kRegexpEmptyMatch, flags);
220 }
221
222 Regexp** subcopy = NULL;
223 if (op == kRegexpAlternate && can_factor) {
224 // Going to edit sub; make a copy so we don't step on caller.
225 subcopy = new Regexp*[nsub];
226 memmove(subcopy, sub, nsub * sizeof sub[0]);
227 sub = subcopy;
228 nsub = FactorAlternation(sub, nsub, flags);
229 if (nsub == 1) {
230 Regexp* re = sub[0];
231 delete[] subcopy;
232 return re;
233 }
234 }
235
236 if (nsub > kMaxNsub) {
237 // Too many subexpressions to fit in a single Regexp.
238 // Make a two-level tree. Two levels gets us to 65535^2.
239 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;
240 Regexp* re = new Regexp(op, flags);
241 re->AllocSub(nbigsub);
242 Regexp** subs = re->sub();
243 for (int i = 0; i < nbigsub - 1; i++)
244 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);
245 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,
246 nsub - (nbigsub-1)*kMaxNsub, flags,
247 false);
248 delete[] subcopy;
249 return re;
250 }
251
252 Regexp* re = new Regexp(op, flags);
253 re->AllocSub(nsub);
254 Regexp** subs = re->sub();
255 for (int i = 0; i < nsub; i++)
256 subs[i] = sub[i];
257
258 delete[] subcopy;
259 return re;
260 }
261
262 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {
263 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);
264 }
265
266 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {
267 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);
268 }
269
270 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {
271 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);
272 }
273
274 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {
275 Regexp* re = new Regexp(kRegexpCapture, flags);
276 re->AllocSub(1);
277 re->sub()[0] = sub;
278 re->cap_ = cap;
279 return re;
280 }
281
282 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {
283 Regexp* re = new Regexp(kRegexpRepeat, flags);
284 re->AllocSub(1);
285 re->sub()[0] = sub;
286 re->min_ = min;
287 re->max_ = max;
288 return re;
289 }
290
291 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {
292 Regexp* re = new Regexp(kRegexpLiteral, flags);
293 re->rune_ = rune;
294 return re;
295 }
296
297 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {
298 if (nrunes <= 0)
299 return new Regexp(kRegexpEmptyMatch, flags);
300 if (nrunes == 1)
301 return NewLiteral(runes[0], flags);
302 Regexp* re = new Regexp(kRegexpLiteralString, flags);
303 for (int i = 0; i < nrunes; i++)
304 re->AddRuneToString(runes[i]);
305 return re;
306 }
307
308 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {
309 Regexp* re = new Regexp(kRegexpCharClass, flags);
310 re->cc_ = cc;
311 return re;
312 }
313
314 // Swaps this and that in place.
315 void Regexp::Swap(Regexp* that) {
316 // Can use memmove because Regexp is just a struct (no vtable).
317 char tmp[sizeof *this];
318 memmove(tmp, this, sizeof tmp);
319 memmove(this, that, sizeof tmp);
320 memmove(that, tmp, sizeof tmp);
321 }
322
323 // Tests equality of all top-level structure but not subregexps.
324 static bool TopEqual(Regexp* a, Regexp* b) {
325 if (a->op() != b->op())
326 return false;
327
328 switch (a->op()) {
329 case kRegexpNoMatch:
330 case kRegexpEmptyMatch:
331 case kRegexpAnyChar:
332 case kRegexpAnyByte:
333 case kRegexpBeginLine:
334 case kRegexpEndLine:
335 case kRegexpWordBoundary:
336 case kRegexpNoWordBoundary:
337 case kRegexpBeginText:
338 return true;
339
340 case kRegexpEndText:
341 // The parse flags remember whether it's \z or (?-m:$),
342 // which matters when testing against PCRE.
343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;
344
345 case kRegexpLiteral:
346 return a->rune() == b->rune() &&
347 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;
348
349 case kRegexpLiteralString:
350 return a->nrunes() == b->nrunes() &&
351 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&
352 memcmp(a->runes(), b->runes(),
353 a->nrunes() * sizeof a->runes()[0]) == 0;
354
355 case kRegexpAlternate:
356 case kRegexpConcat:
357 return a->nsub() == b->nsub();
358
359 case kRegexpStar:
360 case kRegexpPlus:
361 case kRegexpQuest:
362 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;
363
364 case kRegexpRepeat:
365 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&
366 a->min() == b->min() &&
367 a->max() == b->max();
368
369 case kRegexpCapture:
370 return a->cap() == b->cap() && a->name() == b->name();
371
372 case kRegexpHaveMatch:
373 return a->match_id() == b->match_id();
374
375 case kRegexpCharClass: {
376 CharClass* acc = a->cc();
377 CharClass* bcc = b->cc();
378 return acc->size() == bcc->size() &&
379 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&
380 memcmp(acc->begin(), bcc->begin(),
381 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;
382 }
383 }
384
385 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();
386 return 0;
387 }
388
389 bool Regexp::Equal(Regexp* a, Regexp* b) {
390 if (a == NULL || b == NULL)
391 return a == b;
392
393 if (!TopEqual(a, b))
394 return false;
395
396 // Fast path:
397 // return without allocating vector if there are no subregexps.
398 switch (a->op()) {
399 case kRegexpAlternate:
400 case kRegexpConcat:
401 case kRegexpStar:
402 case kRegexpPlus:
403 case kRegexpQuest:
404 case kRegexpRepeat:
405 case kRegexpCapture:
406 break;
407
408 default:
409 return true;
410 }
411
412 // Committed to doing real work.
413 // The stack (vector) has pairs of regexps waiting to
414 // be compared. The regexps are only equal if
415 // all the pairs end up being equal.
416 vector<Regexp*> stk;
417
418 for (;;) {
419 // Invariant: TopEqual(a, b) == true.
420 Regexp* a2;
421 Regexp* b2;
422 switch (a->op()) {
423 default:
424 break;
425 case kRegexpAlternate:
426 case kRegexpConcat:
427 for (int i = 0; i < a->nsub(); i++) {
428 a2 = a->sub()[i];
429 b2 = b->sub()[i];
430 if (!TopEqual(a2, b2))
431 return false;
432 stk.push_back(a2);
433 stk.push_back(b2);
434 }
435 break;
436
437 case kRegexpStar:
438 case kRegexpPlus:
439 case kRegexpQuest:
440 case kRegexpRepeat:
441 case kRegexpCapture:
442 a2 = a->sub()[0];
443 b2 = b->sub()[0];
444 if (!TopEqual(a2, b2))
445 return false;
446 // Really:
447 // stk.push_back(a2);
448 // stk.push_back(b2);
449 // break;
450 // but faster to assign directly and loop.
451 a = a2;
452 b = b2;
453 continue;
454 }
455
456 size_t n = stk.size();
457 if (n == 0)
458 break;
459
460 DCHECK_GE(n, 2);
461 a = stk[n-2];
462 b = stk[n-1];
463 stk.resize(n-2);
464 }
465
466 return true;
467 }
468
469 // Keep in sync with enum RegexpStatusCode in regexp.h
470 static const char *kErrorStrings[] = {
471 "no error",
472 "unexpected error",
473 "invalid escape sequence",
474 "invalid character class",
475 "invalid character class range",
476 "missing ]",
477 "missing )",
478 "trailing \\",
479 "no argument for repetition operator",
480 "invalid repetition size",
481 "bad repetition operator",
482 "invalid perl operator",
483 "invalid UTF-8",
484 "invalid named capture group",
485 };
486
487 string RegexpStatus::CodeText(enum RegexpStatusCode code) {
488 if (code < 0 || code >= arraysize(kErrorStrings))
489 code = kRegexpInternalError;
490 return kErrorStrings[code];
491 }
492
493 string RegexpStatus::Text() const {
494 if (error_arg_.empty())
495 return CodeText(code_);
496 string s;
497 s.append(CodeText(code_));
498 s.append(": ");
499 s.append(error_arg_.data(), error_arg_.size());
500 return s;
501 }
502
503 void RegexpStatus::Copy(const RegexpStatus& status) {
504 code_ = status.code_;
505 error_arg_ = status.error_arg_;
506 }
507
508 typedef int Ignored; // Walker<void> doesn't exist
509
510 // Walker subclass to count capturing parens in regexp.
511 class NumCapturesWalker : public Regexp::Walker<Ignored> {
512 public:
513 NumCapturesWalker() : ncapture_(0) {}
514 int ncapture() { return ncapture_; }
515
516 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
517 if (re->op() == kRegexpCapture)
518 ncapture_++;
519 return ignored;
520 }
521 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
522 // Should never be called: we use Walk not WalkExponential.
523 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";
524 return ignored;
525 }
526
527 private:
528 int ncapture_;
529 DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker);
530 };
531
532 int Regexp::NumCaptures() {
533 NumCapturesWalker w;
534 w.Walk(this, 0);
535 return w.ncapture();
536 }
537
538 // Walker class to build map of named capture groups and their indices.
539 class NamedCapturesWalker : public Regexp::Walker<Ignored> {
540 public:
541 NamedCapturesWalker() : map_(NULL) {}
542 ~NamedCapturesWalker() { delete map_; }
543
544 map<string, int>* TakeMap() {
545 map<string, int>* m = map_;
546 map_ = NULL;
547 return m;
548 }
549
550 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
551 if (re->op() == kRegexpCapture && re->name() != NULL) {
552 // Allocate map once we find a name.
553 if (map_ == NULL)
554 map_ = new map<string, int>;
555
556 // Record first occurrence of each name.
557 // (The rule is that if you have the same name
558 // multiple times, only the leftmost one counts.)
559 if (map_->find(*re->name()) == map_->end())
560 (*map_)[*re->name()] = re->cap();
561 }
562 return ignored;
563 }
564
565 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
566 // Should never be called: we use Walk not WalkExponential.
567 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";
568 return ignored;
569 }
570
571 private:
572 map<string, int>* map_;
573 DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker);
574 };
575
576 map<string, int>* Regexp::NamedCaptures() {
577 NamedCapturesWalker w;
578 w.Walk(this, 0);
579 return w.TakeMap();
580 }
581
582 // Walker class to build map from capture group indices to their names.
583 class CaptureNamesWalker : public Regexp::Walker<Ignored> {
584 public:
585 CaptureNamesWalker() : map_(NULL) {}
586 ~CaptureNamesWalker() { delete map_; }
587
588 map<int, string>* TakeMap() {
589 map<int, string>* m = map_;
590 map_ = NULL;
591 return m;
592 }
593
594 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {
595 if (re->op() == kRegexpCapture && re->name() != NULL) {
596 // Allocate map once we find a name.
597 if (map_ == NULL)
598 map_ = new map<int, string>;
599
600 (*map_)[re->cap()] = *re->name();
601 }
602 return ignored;
603 }
604
605 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {
606 // Should never be called: we use Walk not WalkExponential.
607 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";
608 return ignored;
609 }
610
611 private:
612 map<int, string>* map_;
613 DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker);
614 };
615
616 map<int, string>* Regexp::CaptureNames() {
617 CaptureNamesWalker w;
618 w.Walk(this, 0);
619 return w.TakeMap();
620 }
621
622 // Determines whether regexp matches must be anchored
623 // with a fixed string prefix. If so, returns the prefix and
624 // the regexp that remains after the prefix. The prefix might
625 // be ASCII case-insensitive.
626 bool Regexp::RequiredPrefix(string *prefix, bool *foldcase, Regexp** suffix) {
627 // No need for a walker: the regexp must be of the form
628 // 1. some number of ^ anchors
629 // 2. a literal char or string
630 // 3. the rest
631 prefix->clear();
632 *foldcase = false;
633 *suffix = NULL;
634 if (op_ != kRegexpConcat)
635 return false;
636
637 // Some number of anchors, then a literal or concatenation.
638 int i = 0;
639 Regexp** sub = this->sub();
640 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)
641 i++;
642 if (i == 0 || i >= nsub_)
643 return false;
644
645 Regexp* re = sub[i];
646 switch (re->op_) {
647 default:
648 return false;
649
650 case kRegexpLiteralString:
651 // Convert to string in proper encoding.
652 if (re->parse_flags() & Latin1) {
653 prefix->resize(re->nrunes_);
654 for (int j = 0; j < re->nrunes_; j++)
655 (*prefix)[j] = static_cast<char>(re->runes_[j]);
656 } else {
657 // Convert to UTF-8 in place.
658 // Assume worst-case space and then trim.
659 prefix->resize(re->nrunes_ * UTFmax);
660 char *p = &(*prefix)[0];
661 for (int j = 0; j < re->nrunes_; j++) {
662 Rune r = re->runes_[j];
663 if (r < Runeself)
664 *p++ = static_cast<char>(r);
665 else
666 p += runetochar(p, &r);
667 }
668 prefix->resize(p - &(*prefix)[0]);
669 }
670 break;
671
672 case kRegexpLiteral:
673 if ((re->parse_flags() & Latin1) || re->rune_ < Runeself) {
674 prefix->append(1, static_cast<char>(re->rune_));
675 } else {
676 char buf[UTFmax];
677 prefix->append(buf, runetochar(buf, &re->rune_));
678 }
679 break;
680 }
681 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;
682 i++;
683
684 // The rest.
685 if (i < nsub_) {
686 for (int j = i; j < nsub_; j++)
687 sub[j]->Incref();
688 re = Concat(sub + i, nsub_ - i, parse_flags());
689 } else {
690 re = new Regexp(kRegexpEmptyMatch, parse_flags());
691 }
692 *suffix = re;
693 return true;
694 }
695
696 // Character class builder is a balanced binary tree (STL set)
697 // containing non-overlapping, non-abutting RuneRanges.
698 // The less-than operator used in the tree treats two
699 // ranges as equal if they overlap at all, so that
700 // lookups for a particular Rune are possible.
701
702 CharClassBuilder::CharClassBuilder() {
703 nrunes_ = 0;
704 upper_ = 0;
705 lower_ = 0;
706 }
707
708 // Add lo-hi to the class; return whether class got bigger.
709 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {
710 if (hi < lo)
711 return false;
712
713 if (lo <= 'z' && hi >= 'A') {
714 // Overlaps some alpha, maybe not all.
715 // Update bitmaps telling which ASCII letters are in the set.
716 Rune lo1 = max<Rune>(lo, 'A');
717 Rune hi1 = min<Rune>(hi, 'Z');
718 if (lo1 <= hi1)
719 upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');
720
721 lo1 = max<Rune>(lo, 'a');
722 hi1 = min<Rune>(hi, 'z');
723 if (lo1 <= hi1)
724 lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');
725 }
726
727 { // Check whether lo, hi is already in the class.
728 iterator it = ranges_.find(RuneRange(lo, lo));
729 if (it != end() && it->lo <= lo && hi <= it->hi)
730 return false;
731 }
732
733 // Look for a range abutting lo on the left.
734 // If it exists, take it out and increase our range.
735 if (lo > 0) {
736 iterator it = ranges_.find(RuneRange(lo-1, lo-1));
737 if (it != end()) {
738 lo = it->lo;
739 if (it->hi > hi)
740 hi = it->hi;
741 nrunes_ -= it->hi - it->lo + 1;
742 ranges_.erase(it);
743 }
744 }
745
746 // Look for a range abutting hi on the right.
747 // If it exists, take it out and increase our range.
748 if (hi < Runemax) {
749 iterator it = ranges_.find(RuneRange(hi+1, hi+1));
750 if (it != end()) {
751 hi = it->hi;
752 nrunes_ -= it->hi - it->lo + 1;
753 ranges_.erase(it);
754 }
755 }
756
757 // Look for ranges between lo and hi. Take them out.
758 // This is only safe because the set has no overlapping ranges.
759 // We've already removed any ranges abutting lo and hi, so
760 // any that overlap [lo, hi] must be contained within it.
761 for (;;) {
762 iterator it = ranges_.find(RuneRange(lo, hi));
763 if (it == end())
764 break;
765 nrunes_ -= it->hi - it->lo + 1;
766 ranges_.erase(it);
767 }
768
769 // Finally, add [lo, hi].
770 nrunes_ += hi - lo + 1;
771 ranges_.insert(RuneRange(lo, hi));
772 return true;
773 }
774
775 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {
776 for (iterator it = cc->begin(); it != cc->end(); ++it)
777 AddRange(it->lo, it->hi);
778 }
779
780 bool CharClassBuilder::Contains(Rune r) {
781 return ranges_.find(RuneRange(r, r)) != end();
782 }
783
784 // Does the character class behave the same on A-Z as on a-z?
785 bool CharClassBuilder::FoldsASCII() {
786 return ((upper_ ^ lower_) & AlphaMask) == 0;
787 }
788
789 CharClassBuilder* CharClassBuilder::Copy() {
790 CharClassBuilder* cc = new CharClassBuilder;
791 for (iterator it = begin(); it != end(); ++it)
792 cc->ranges_.insert(RuneRange(it->lo, it->hi));
793 cc->upper_ = upper_;
794 cc->lower_ = lower_;
795 cc->nrunes_ = nrunes_;
796 return cc;
797 }
798
799
800
801 void CharClassBuilder::RemoveAbove(Rune r) {
802 if (r >= Runemax)
803 return;
804
805 if (r < 'z') {
806 if (r < 'a')
807 lower_ = 0;
808 else
809 lower_ &= AlphaMask >> ('z' - r);
810 }
811
812 if (r < 'Z') {
813 if (r < 'A')
814 upper_ = 0;
815 else
816 upper_ &= AlphaMask >> ('Z' - r);
817 }
818
819 for (;;) {
820
821 iterator it = ranges_.find(RuneRange(r + 1, Runemax));
822 if (it == end())
823 break;
824 RuneRange rr = *it;
825 ranges_.erase(it);
826 nrunes_ -= rr.hi - rr.lo + 1;
827 if (rr.lo <= r) {
828 rr.hi = r;
829 ranges_.insert(rr);
830 nrunes_ += rr.hi - rr.lo + 1;
831 }
832 }
833 }
834
835 void CharClassBuilder::Negate() {
836 // Build up negation and then copy in.
837 // Could edit ranges in place, but C++ won't let me.
838 vector<RuneRange> v;
839 v.reserve(ranges_.size() + 1);
840
841 // In negation, first range begins at 0, unless
842 // the current class begins at 0.
843 iterator it = begin();
844 if (it == end()) {
845 v.push_back(RuneRange(0, Runemax));
846 } else {
847 int nextlo = 0;
848 if (it->lo == 0) {
849 nextlo = it->hi + 1;
850 ++it;
851 }
852 for (; it != end(); ++it) {
853 v.push_back(RuneRange(nextlo, it->lo - 1));
854 nextlo = it->hi + 1;
855 }
856 if (nextlo <= Runemax)
857 v.push_back(RuneRange(nextlo, Runemax));
858 }
859
860 ranges_.clear();
861 for (size_t i = 0; i < v.size(); i++)
862 ranges_.insert(v[i]);
863
864 upper_ = AlphaMask & ~upper_;
865 lower_ = AlphaMask & ~lower_;
866 nrunes_ = Runemax+1 - nrunes_;
867 }
868
869 // Character class is a sorted list of ranges.
870 // The ranges are allocated in the same block as the header,
871 // necessitating a special allocator and Delete method.
872
873 CharClass* CharClass::New(int maxranges) {
874 CharClass* cc;
875 uint8* data = new uint8[sizeof *cc + maxranges*sizeof cc->ranges_[0]];
876 cc = reinterpret_cast<CharClass*>(data);
877 cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc);
878 cc->nranges_ = 0;
879 cc->folds_ascii_ = false;
880 cc->nrunes_ = 0;
881 return cc;
882 }
883
884 void CharClass::Delete() {
885 uint8 *data = reinterpret_cast<uint8*>(this);
886 delete[] data;
887 }
888
889 CharClass* CharClass::Negate() {
890 CharClass* cc = CharClass::New(nranges_+1);
891 cc->folds_ascii_ = folds_ascii_;
892 cc->nrunes_ = Runemax + 1 - nrunes_;
893 int n = 0;
894 int nextlo = 0;
895 for (CharClass::iterator it = begin(); it != end(); ++it) {
896 if (it->lo == nextlo) {
897 nextlo = it->hi + 1;
898 } else {
899 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);
900 nextlo = it->hi + 1;
901 }
902 }
903 if (nextlo <= Runemax)
904 cc->ranges_[n++] = RuneRange(nextlo, Runemax);
905 cc->nranges_ = n;
906 return cc;
907 }
908
909 bool CharClass::Contains(Rune r) {
910 RuneRange* rr = ranges_;
911 int n = nranges_;
912 while (n > 0) {
913 int m = n/2;
914 if (rr[m].hi < r) {
915 rr += m+1;
916 n -= m+1;
917 } else if (r < rr[m].lo) {
918 n = m;
919 } else { // rr[m].lo <= r && r <= rr[m].hi
920 return true;
921 }
922 }
923 return false;
924 }
925
926 CharClass* CharClassBuilder::GetCharClass() {
927 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));
928 int n = 0;
929 for (iterator it = begin(); it != end(); ++it)
930 cc->ranges_[n++] = *it;
931 cc->nranges_ = n;
932 DCHECK_LE(n, static_cast<int>(ranges_.size()));
933 cc->nrunes_ = nrunes_;
934 cc->folds_ascii_ = FoldsASCII();
935 return cc;
936 }
937
938 } // namespace re2
OLDNEW
« no previous file with comments | « third_party/re2/re2/regexp.h ('k') | third_party/re2/re2/set.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698