third_party/re2/re2/regexp.cc - Issue 10575037: Include RE2 library

Side by Side Diff: third_party/re2/re2/regexp.cc

Issue 10575037: Include RE2 library (Closed) Base URL: svn://svn.chromium.org/chrome/trunk/src

Patch Set: Removed valgrind specific code that is in Chromium already Created 8 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

	2 // Use of this source code is governed by a BSD-style

	3 // license that can be found in the LICENSE file.

	4

	5 // Regular expression representation.

	6 // Tested by parse_test.cc

	7

	8 #include "util/util.h"

	9 #include "re2/regexp.h"

	10 #include "re2/stringpiece.h"

	11 #include "re2/walker-inl.h"

	12

	13 namespace re2 {

	14

	15 // Constructor. Allocates vectors as appropriate for operator.

	16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)

	17 : op_(op),

	18 simple_(false),

	19 parse_flags_(static_cast<uint16>(parse_flags)),

	20 ref_(1),

	21 nsub_(0),

	22 down_(NULL) {

	23 subone_ = NULL;

	24 memset(the_union_, 0, sizeof the_union_);

	25 }

	26

	27 // Destructor. Assumes already cleaned up children.

	28 // Private: use Decref() instead of delete to destroy Regexps.

	29 // Can't call Decref on the sub-Regexps here because

	30 // that could cause arbitrarily deep recursion, so

	31 // required Decref() to have handled them for us.

	32 Regexp::~Regexp() {

	33 if (nsub_ > 0)

	34 LOG(DFATAL) << "Regexp not destroyed.";

	35

	36 switch (op_) {

	37 default:

	38 break;

	39 case kRegexpCapture:

	40 delete name_;

	41 break;

	42 case kRegexpLiteralString:

	43 delete[] runes_;

	44 break;

	45 case kRegexpCharClass:

	46 cc_->Delete();

	47 delete ccb_;

	48 break;

	49 }

	50 }

	51

	52 // If it's possible to destroy this regexp without recurring,

	53 // do so and return true. Else return false.

	54 bool Regexp::QuickDestroy() {

	55 if (nsub_ == 0) {

	56 delete this;

	57 return true;

	58 }

	59 return false;

	60 }

	61

	62 static map<Regexp*, int> ref_map;

	63 static Mutex ref_mutex;

	64

	65 int Regexp::Ref() {

	66 if (ref_ < kMaxRef)

	67 return ref_;

	68

	69 MutexLock l(&ref_mutex);

	70 return ref_map[this];

	71 }

	72

	73 // Increments reference count, returns object as convenience.

	74 Regexp* Regexp::Incref() {

	75 if (ref_ >= kMaxRef-1) {

	76 // Store ref count in overflow map.

	77 MutexLock l(&ref_mutex);

	78 if (ref_ == kMaxRef) { // already overflowed

	79 ref_map[this]++;

	80 return this;

	81 }

	82 // overflowing now

	83 ref_map[this] = kMaxRef;

	84 ref_ = kMaxRef;

	85 return this;

	86 }

	87

	88 ref_++;

	89 return this;

	90 }

	91

	92 // Decrements reference count and deletes this object if count reaches 0.

	93 void Regexp::Decref() {

	94 if (ref_ == kMaxRef) {

	95 // Ref count is stored in overflow map.

	96 MutexLock l(&ref_mutex);

	97 int r = ref_map[this] - 1;

	98 if (r < kMaxRef) {

	99 ref_ = r;

	100 ref_map.erase(this);

	101 } else {

	102 ref_map[this] = r;

	103 }

	104 return;

	105 }

	106 ref_--;

	107 if (ref_ == 0)

	108 Destroy();

	109 }

	110

	111 // Deletes this object; ref count has count reached 0.

	112 void Regexp::Destroy() {

	113 if (QuickDestroy())

	114 return;

	115

	116 // Handle recursive Destroy with explicit stack

	117 // to avoid arbitrarily deep recursion on process stack [sigh].

	118 down_ = NULL;

	119 Regexp* stack = this;

	120 while (stack != NULL) {

	121 Regexp* re = stack;

	122 stack = re->down_;

	123 if (re->ref_ != 0)

	124 LOG(DFATAL) << "Bad reference count " << re->ref_;

	125 if (re->nsub_ > 0) {

	126 Regexp** subs = re->sub();

	127 for (int i = 0; i < re->nsub_; i++) {

	128 Regexp* sub = subs[i];

	129 if (sub == NULL)

	130 continue;

	131 if (sub->ref_ == kMaxRef)

	132 sub->Decref();

	133 else

	134 --sub->ref_;

	135 if (sub->ref_ == 0 && !sub->QuickDestroy()) {

	136 sub->down_ = stack;

	137 stack = sub;

	138 }

	139 }

	140 if (re->nsub_ > 1)

	141 delete[] subs;

	142 re->nsub_ = 0;

	143 }

	144 delete re;

	145 }

	146 }

	147

	148 void Regexp::AddRuneToString(Rune r) {

	149 DCHECK(op_ == kRegexpLiteralString);

	150 if (nrunes_ == 0) {

	151 // start with 8

	152 runes_ = new Rune[8];

	153 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {

	154 // double on powers of two

	155 Rune *old = runes_;

	156 runes_ = new Rune[nrunes_ * 2];

	157 for (int i = 0; i < nrunes_; i++)

	158 runes_[i] = old[i];

	159 delete[] old;

	160 }

	161

	162 runes_[nrunes_++] = r;

	163 }

	164

	165 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {

	166 Regexp* re = new Regexp(kRegexpHaveMatch, flags);

	167 re->match_id_ = match_id;

	168 return re;

	169 }

	170

	171 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {

	172 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)

	173 return sub;

	174 Regexp* re = new Regexp(kRegexpPlus, flags);

	175 re->AllocSub(1);

	176 re->sub()[0] = sub;

	177 return re;

	178 }

	179

	180 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {

	181 if (sub->op() == kRegexpStar && sub->parse_flags() == flags)

	182 return sub;

	183 Regexp* re = new Regexp(kRegexpStar, flags);

	184 re->AllocSub(1);

	185 re->sub()[0] = sub;

	186 return re;

	187 }

	188

	189 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {

	190 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)

	191 return sub;

	192 Regexp* re = new Regexp(kRegexpQuest, flags);

	193 re->AllocSub(1);

	194 re->sub()[0] = sub;

	195 return re;

	196 }

	197

	198 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,

	199 ParseFlags flags, bool can_factor) {

	200 if (nsub == 1)

	201 return sub[0];

	202

	203 Regexp** subcopy = NULL;

	204 if (op == kRegexpAlternate && can_factor) {

	205 // Going to edit sub; make a copy so we don't step on caller.

	206 subcopy = new Regexp*[nsub];

	207 memmove(subcopy, sub, nsub * sizeof sub[0]);

	208 sub = subcopy;

	209 nsub = FactorAlternation(sub, nsub, flags);

	210 if (nsub == 1) {

	211 Regexp* re = sub[0];

	212 delete[] subcopy;

	213 return re;

	214 }

	215 }

	216

	217 if (nsub > kMaxNsub) {

	218 // Too many subexpressions to fit in a single Regexp.

	219 // Make a two-level tree. Two levels gets us to 65535^2.

	220 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;

	221 Regexp* re = new Regexp(op, flags);

	222 re->AllocSub(nbigsub);

	223 Regexp** subs = re->sub();

	224 for (int i = 0; i < nbigsub - 1; i++)

	225 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);

	226 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,

	227 nsub - (nbigsub-1)*kMaxNsub, flags,

	228 false);

	229 delete[] subcopy;

	230 return re;

	231 }

	232

	233 Regexp* re = new Regexp(op, flags);

	234 re->AllocSub(nsub);

	235 Regexp** subs = re->sub();

	236 for (int i = 0; i < nsub; i++)

	237 subs[i] = sub[i];

	238

	239 delete[] subcopy;

	240 return re;

	241 }

	242

	243 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {

	244 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);

	245 }

	246

	247 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {

	248 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);

	249 }

	250

	251 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {

	252 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);

	253 }

	254

	255 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {

	256 Regexp* re = new Regexp(kRegexpCapture, flags);

	257 re->AllocSub(1);

	258 re->sub()[0] = sub;

	259 re->cap_ = cap;

	260 return re;

	261 }

	262

	263 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {

	264 Regexp* re = new Regexp(kRegexpRepeat, flags);

	265 re->AllocSub(1);

	266 re->sub()[0] = sub;

	267 re->min_ = min;

	268 re->max_ = max;

	269 return re;

	270 }

	271

	272 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {

	273 Regexp* re = new Regexp(kRegexpLiteral, flags);

	274 re->rune_ = rune;

	275 return re;

	276 }

	277

	278 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {

	279 if (nrunes <= 0)

	280 return new Regexp(kRegexpEmptyMatch, flags);

	281 if (nrunes == 1)

	282 return NewLiteral(runes[0], flags);

	283 Regexp* re = new Regexp(kRegexpLiteralString, flags);

	284 for (int i = 0; i < nrunes; i++)

	285 re->AddRuneToString(runes[i]);

	286 return re;

	287 }

	288

	289 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {

	290 Regexp* re = new Regexp(kRegexpCharClass, flags);

	291 re->cc_ = cc;

	292 return re;

	293 }

	294

	295 // Swaps this and that in place.

	296 void Regexp::Swap(Regexp* that) {

	297 // Can use memmove because Regexp is just a struct (no vtable).

	298 char tmp[sizeof *this];

	299 memmove(tmp, this, sizeof tmp);

	300 memmove(this, that, sizeof tmp);

	301 memmove(that, tmp, sizeof tmp);

	302 }

	303

	304 // Tests equality of all top-level structure but not subregexps.

	305 static bool TopEqual(Regexp* a, Regexp* b) {

	306 if (a->op() != b->op())

	307 return false;

	308

	309 switch (a->op()) {

	310 case kRegexpNoMatch:

	311 case kRegexpEmptyMatch:

	312 case kRegexpAnyChar:

	313 case kRegexpAnyByte:

	314 case kRegexpBeginLine:

	315 case kRegexpEndLine:

	316 case kRegexpWordBoundary:

	317 case kRegexpNoWordBoundary:

	318 case kRegexpBeginText:

	319 return true;

	320

	321 case kRegexpEndText:

	322 // The parse flags remember whether it's \z or (?-m:$),

	323 // which matters when testing against PCRE.

	324 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

	325

	326 case kRegexpLiteral:

	327 return a->rune() == b->rune() &&

	328 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

	329

	330 case kRegexpLiteralString:

	331 return a->nrunes() == b->nrunes() &&

	332 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&

	333 memcmp(a->runes(), b->runes(),

	334 a->nrunes() * sizeof a->runes()[0]) == 0;

	335

	336 case kRegexpAlternate:

	337 case kRegexpConcat:

	338 return a->nsub() == b->nsub();

	339

	340 case kRegexpStar:

	341 case kRegexpPlus:

	342 case kRegexpQuest:

	343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

	344

	345 case kRegexpRepeat:

	346 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&

	347 a->min() == b->min() &&

	348 a->max() == b->max();

	349

	350 case kRegexpCapture:

	351 return a->cap() == b->cap() && a->name() == b->name();

	352

	353 case kRegexpHaveMatch:

	354 return a->match_id() == b->match_id();

	355

	356 case kRegexpCharClass: {

	357 CharClass* acc = a->cc();

	358 CharClass* bcc = b->cc();

	359 return acc->size() == bcc->size() &&

	360 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&

	361 memcmp(acc->begin(), bcc->begin(),

	362 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;

	363 }

	364 }

	365

	366 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();

	367 return 0;

	368 }

	369

	370 bool Regexp::Equal(Regexp* a, Regexp* b) {

	371 if (a == NULL \|\| b == NULL)

	372 return a == b;

	373

	374 if (!TopEqual(a, b))

	375 return false;

	376

	377 // Fast path:

	378 // return without allocating vector if there are no subregexps.

	379 switch (a->op()) {

	380 case kRegexpAlternate:

	381 case kRegexpConcat:

	382 case kRegexpStar:

	383 case kRegexpPlus:

	384 case kRegexpQuest:

	385 case kRegexpRepeat:

	386 case kRegexpCapture:

	387 break;

	388

	389 default:

	390 return true;

	391 }

	392

	393 // Committed to doing real work.

	394 // The stack (vector) has pairs of regexps waiting to

	395 // be compared. The regexps are only equal if

	396 // all the pairs end up being equal.

	397 vector<Regexp*> stk;

	398

	399 for (;;) {

	400 // Invariant: TopEqual(a, b) == true.

	401 Regexp* a2;

	402 Regexp* b2;

	403 switch (a->op()) {

	404 default:

	405 break;

	406 case kRegexpAlternate:

	407 case kRegexpConcat:

	408 for (int i = 0; i < a->nsub(); i++) {

	409 a2 = a->sub()[i];

	410 b2 = b->sub()[i];

	411 if (!TopEqual(a2, b2))

	412 return false;

	413 stk.push_back(a2);

	414 stk.push_back(b2);

	415 }

	416 break;

	417

	418 case kRegexpStar:

	419 case kRegexpPlus:

	420 case kRegexpQuest:

	421 case kRegexpRepeat:

	422 case kRegexpCapture:

	423 a2 = a->sub()[0];

	424 b2 = b->sub()[0];

	425 if (!TopEqual(a2, b2))

	426 return false;

	427 // Really:

	428 // stk.push_back(a2);

	429 // stk.push_back(b2);

	430 // break;

	431 // but faster to assign directly and loop.

	432 a = a2;

	433 b = b2;

	434 continue;

	435 }

	436

	437 int n = stk.size();

	438 if (n == 0)

	439 break;

	440

	441 a = stk[n-2];

	442 b = stk[n-1];

	443 stk.resize(n-2);

	444 }

	445

	446 return true;

	447 }

	448

	449 // Keep in sync with enum RegexpStatusCode in regexp.h

	450 static const string kErrorStrings[] = {

	451 "no error",

	452 "unexpected error",

	453 "invalid escape sequence",

	454 "invalid character class",

	455 "invalid character class range",

	456 "missing ]",

	457 "missing )",

	458 "trailing \\",

	459 "no argument for repetition operator",

	460 "invalid repetition size",

	461 "bad repetition operator",

	462 "invalid perl operator",

	463 "invalid UTF-8",

	464 "invalid named capture group",

	465 };

	466

	467 const string& RegexpStatus::CodeText(enum RegexpStatusCode code) {

	468 if (code < 0 \|\| code >= arraysize(kErrorStrings))

	469 code = kRegexpInternalError;

	470 return kErrorStrings[code];

	471 }

	472

	473 string RegexpStatus::Text() const {

	474 if (error_arg_.empty())

	475 return CodeText(code_);

	476 string s;

	477 s.append(CodeText(code_));

	478 s.append(": ");

	479 s.append(error_arg_.data(), error_arg_.size());

	480 return s;

	481 }

	482

	483 void RegexpStatus::Copy(const RegexpStatus& status) {

	484 code_ = status.code_;

	485 error_arg_ = status.error_arg_;

	486 }

	487

	488 typedef int Ignored; // Walker<void> doesn't exist

	489

	490 // Walker subclass to count capturing parens in regexp.

	491 class NumCapturesWalker : public Regexp::Walker<Ignored> {

	492 public:

	493 NumCapturesWalker() : ncapture_(0) {}

	494 int ncapture() { return ncapture_; }

	495

	496 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

	497 if (re->op() == kRegexpCapture)

	498 ncapture_++;

	499 return ignored;

	500 }

	501 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

	502 // Should never be called: we use Walk not WalkExponential.

	503 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";

	504 return ignored;

	505 }

	506

	507 private:

	508 int ncapture_;

	509 DISALLOW_EVIL_CONSTRUCTORS(NumCapturesWalker);

	510 };

	511

	512 int Regexp::NumCaptures() {

	513 NumCapturesWalker w;

	514 w.Walk(this, 0);

	515 return w.ncapture();

	516 }

	517

	518 // Walker class to build map of named capture groups and their indices.

	519 class NamedCapturesWalker : public Regexp::Walker<Ignored> {

	520 public:

	521 NamedCapturesWalker() : map_(NULL) {}

	522 ~NamedCapturesWalker() { delete map_; }

	523

	524 map<string, int>* TakeMap() {

	525 map<string, int>* m = map_;

	526 map_ = NULL;

	527 return m;

	528 }

	529

	530 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

	531 if (re->op() == kRegexpCapture && re->name() != NULL) {

	532 // Allocate map once we find a name.

	533 if (map_ == NULL)

	534 map_ = new map<string, int>;

	535

	536 // Record first occurrence of each name.

	537 // (The rule is that if you have the same name

	538 // multiple times, only the leftmost one counts.)

	539 if (map_->find(*re->name()) == map_->end())

	540 (map_)[re->name()] = re->cap();

	541 }

	542 return ignored;

	543 }

	544

	545 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

	546 // Should never be called: we use Walk not WalkExponential.

	547 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";

	548 return ignored;

	549 }

	550

	551 private:

	552 map<string, int>* map_;

	553 DISALLOW_EVIL_CONSTRUCTORS(NamedCapturesWalker);

	554 };

	555

	556 map<string, int>* Regexp::NamedCaptures() {

	557 NamedCapturesWalker w;

	558 w.Walk(this, 0);

	559 return w.TakeMap();

	560 }

	561

	562 // Walker class to build map from capture group indices to their names.

	563 class CaptureNamesWalker : public Regexp::Walker<Ignored> {

	564 public:

	565 CaptureNamesWalker() : map_(NULL) {}

	566 ~CaptureNamesWalker() { delete map_; }

	567

	568 map<int, string>* TakeMap() {

	569 map<int, string>* m = map_;

	570 map_ = NULL;

	571 return m;

	572 }

	573

	574 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

	575 if (re->op() == kRegexpCapture && re->name() != NULL) {

	576 // Allocate map once we find a name.

	577 if (map_ == NULL)

	578 map_ = new map<int, string>;

	579

	580 (map_)[re->cap()] = re->name();

	581 }

	582 return ignored;

	583 }

	584

	585 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

	586 // Should never be called: we use Walk not WalkExponential.

	587 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";

	588 return ignored;

	589 }

	590

	591 private:

	592 map<int, string>* map_;

	593 DISALLOW_EVIL_CONSTRUCTORS(CaptureNamesWalker);

	594 };

	595

	596 map<int, string>* Regexp::CaptureNames() {

	597 CaptureNamesWalker w;

	598 w.Walk(this, 0);

	599 return w.TakeMap();

	600 }

	601

	602 // Determines whether regexp matches must be anchored

	603 // with a fixed string prefix. If so, returns the prefix and

	604 // the regexp that remains after the prefix. The prefix might

	605 // be ASCII case-insensitive.

	606 bool Regexp::RequiredPrefix(string prefix, bool foldcase, Regexp** suffix) {

	607 // No need for a walker: the regexp must be of the form

	608 // 1. some number of ^ anchors

	609 // 2. a literal char or string

	610 // 3. the rest

	611 prefix->clear();

	612 *foldcase = false;

	613 *suffix = NULL;

	614 if (op_ != kRegexpConcat)

	615 return false;

	616

	617 // Some number of anchors, then a literal or concatenation.

	618 int i = 0;

	619 Regexp** sub = this->sub();

	620 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)

	621 i++;

	622 if (i == 0 \|\| i >= nsub_)

	623 return false;

	624

	625 Regexp* re = sub[i];

	626 switch (re->op_) {

	627 default:

	628 return false;

	629

	630 case kRegexpLiteralString:

	631 // Convert to string in proper encoding.

	632 if (re->parse_flags() & Latin1) {

	633 prefix->resize(re->nrunes_);

	634 for (int j = 0; j < re->nrunes_; j++)

	635 (*prefix)[j] = re->runes_[j];

	636 } else {

	637 // Convert to UTF-8 in place.

	638 // Assume worst-case space and then trim.

	639 prefix->resize(re->nrunes_ * UTFmax);

	640 char p = &(prefix)[0];

	641 for (int j = 0; j < re->nrunes_; j++) {

	642 Rune r = re->runes_[j];

	643 if (r < Runeself)

	644 *p++ = r;

	645 else

	646 p += runetochar(p, &r);

	647 }

	648 prefix->resize(p - &(*prefix)[0]);

	649 }

	650 break;

	651

	652 case kRegexpLiteral:

	653 if ((re->parse_flags() & Latin1) \|\| re->rune_ < Runeself) {

	654 prefix->append(1, re->rune_);

	655 } else {

	656 char buf[UTFmax];

	657 prefix->append(buf, runetochar(buf, &re->rune_));

	658 }

	659 break;

	660 }

	661 *foldcase = (sub[i]->parse_flags() & FoldCase);

	662 i++;

	663

	664 // The rest.

	665 if (i < nsub_) {

	666 for (int j = i; j < nsub_; j++)

	667 sub[j]->Incref();

	668 re = Concat(sub + i, nsub_ - i, parse_flags());

	669 } else {

	670 re = new Regexp(kRegexpEmptyMatch, parse_flags());

	671 }

	672 *suffix = re;

	673 return true;

	674 }

	675

	676 // Character class builder is a balanced binary tree (STL set)

	677 // containing non-overlapping, non-abutting RuneRanges.

	678 // The less-than operator used in the tree treats two

	679 // ranges as equal if they overlap at all, so that

	680 // lookups for a particular Rune are possible.

	681

	682 CharClassBuilder::CharClassBuilder() {

	683 nrunes_ = 0;

	684 upper_ = 0;

	685 lower_ = 0;

	686 }

	687

	688 // Add lo-hi to the class; return whether class got bigger.

	689 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {

	690 if (hi < lo)

	691 return false;

	692

	693 if (lo <= 'z' && hi >= 'A') {

	694 // Overlaps some alpha, maybe not all.

	695 // Update bitmaps telling which ASCII letters are in the set.

	696 Rune lo1 = max<Rune>(lo, 'A');

	697 Rune hi1 = min<Rune>(hi, 'Z');

	698 if (lo1 <= hi1)

	699 upper_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');

	700

	701 lo1 = max<Rune>(lo, 'a');

	702 hi1 = min<Rune>(hi, 'z');

	703 if (lo1 <= hi1)

	704 lower_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');

	705 }

	706

	707 { // Check whether lo, hi is already in the class.

	708 iterator it = ranges_.find(RuneRange(lo, lo));

	709 if (it != end() && it->lo <= lo && hi <= it->hi)

	710 return false;

	711 }

	712

	713 // Look for a range abutting lo on the left.

	714 // If it exists, take it out and increase our range.

	715 if (lo > 0) {

	716 iterator it = ranges_.find(RuneRange(lo-1, lo-1));

	717 if (it != end()) {

	718 lo = it->lo;

	719 if (it->hi > hi)

	720 hi = it->hi;

	721 nrunes_ -= it->hi - it->lo + 1;

	722 ranges_.erase(it);

	723 }

	724 }

	725

	726 // Look for a range abutting hi on the right.

	727 // If it exists, take it out and increase our range.

	728 if (hi < Runemax) {

	729 iterator it = ranges_.find(RuneRange(hi+1, hi+1));

	730 if (it != end()) {

	731 hi = it->hi;

	732 nrunes_ -= it->hi - it->lo + 1;

	733 ranges_.erase(it);

	734 }

	735 }

	736

	737 // Look for ranges between lo and hi. Take them out.

	738 // This is only safe because the set has no overlapping ranges.

	739 // We've already removed any ranges abutting lo and hi, so

	740 // any that overlap [lo, hi] must be contained within it.

	741 for (;;) {

	742 iterator it = ranges_.find(RuneRange(lo, hi));

	743 if (it == end())

	744 break;

	745 nrunes_ -= it->hi - it->lo + 1;

	746 ranges_.erase(it);

	747 }

	748

	749 // Finally, add [lo, hi].

	750 nrunes_ += hi - lo + 1;

	751 ranges_.insert(RuneRange(lo, hi));

	752 return true;

	753 }

	754

	755 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {

	756 for (iterator it = cc->begin(); it != cc->end(); ++it)

	757 AddRange(it->lo, it->hi);

	758 }

	759

	760 bool CharClassBuilder::Contains(Rune r) {

	761 return ranges_.find(RuneRange(r, r)) != end();

	762 }

	763

	764 // Does the character class behave the same on A-Z as on a-z?

	765 bool CharClassBuilder::FoldsASCII() {

	766 return ((upper_ ^ lower_) & AlphaMask) == 0;

	767 }

	768

	769 CharClassBuilder* CharClassBuilder::Copy() {

	770 CharClassBuilder* cc = new CharClassBuilder;

	771 for (iterator it = begin(); it != end(); ++it)

	772 cc->ranges_.insert(RuneRange(it->lo, it->hi));

	773 cc->upper_ = upper_;

	774 cc->lower_ = lower_;

	775 cc->nrunes_ = nrunes_;

	776 return cc;

	777 }

	778

	779

	780

	781 void CharClassBuilder::RemoveAbove(Rune r) {

	782 if (r >= Runemax)

	783 return;

	784

	785 if (r < 'z') {

	786 if (r < 'a')

	787 lower_ = 0;

	788 else

	789 lower_ &= AlphaMask >> ('z' - r);

	790 }

	791

	792 if (r < 'Z') {

	793 if (r < 'A')

	794 upper_ = 0;

	795 else

	796 upper_ &= AlphaMask >> ('Z' - r);

	797 }

	798

	799 for (;;) {

	800

	801 iterator it = ranges_.find(RuneRange(r + 1, Runemax));

	802 if (it == end())

	803 break;

	804 RuneRange rr = *it;

	805 ranges_.erase(it);

	806 nrunes_ -= rr.hi - rr.lo + 1;

	807 if (rr.lo <= r) {

	808 rr.hi = r;

	809 ranges_.insert(rr);

	810 nrunes_ += rr.hi - rr.lo + 1;

	811 }

	812 }

	813 }

	814

	815 void CharClassBuilder::Negate() {

	816 // Build up negation and then copy in.

	817 // Could edit ranges in place, but C++ won't let me.

	818 vector<RuneRange> v;

	819 v.reserve(ranges_.size() + 1);

	820

	821 // In negation, first range begins at 0, unless

	822 // the current class begins at 0.

	823 iterator it = begin();

	824 if (it == end()) {

	825 v.push_back(RuneRange(0, Runemax));

	826 } else {

	827 int nextlo = 0;

	828 if (it->lo == 0) {

	829 nextlo = it->hi + 1;

	830 ++it;

	831 }

	832 for (; it != end(); ++it) {

	833 v.push_back(RuneRange(nextlo, it->lo - 1));

	834 nextlo = it->hi + 1;

	835 }

	836 if (nextlo <= Runemax)

	837 v.push_back(RuneRange(nextlo, Runemax));

	838 }

	839

	840 ranges_.clear();

	841 for (int i = 0; i < v.size(); i++)

	842 ranges_.insert(v[i]);

	843

	844 upper_ = AlphaMask & ~upper_;

	845 lower_ = AlphaMask & ~lower_;

	846 nrunes_ = Runemax+1 - nrunes_;

	847 }

	848

	849 // Character class is a sorted list of ranges.

	850 // The ranges are allocated in the same block as the header,

	851 // necessitating a special allocator and Delete method.

	852

	853 CharClass* CharClass::New(int maxranges) {

	854 CharClass* cc;

	855 uint8* data = new uint8[sizeof cc + maxrangessizeof cc->ranges_[0]];

	856 cc = reinterpret_cast<CharClass*>(data);

	857 cc->ranges_ = reinterpret_cast<RuneRange>(data + sizeof cc);

	858 cc->nranges_ = 0;

	859 cc->folds_ascii_ = false;

	860 cc->nrunes_ = 0;

	861 return cc;

	862 }

	863

	864 void CharClass::Delete() {

	865 if (this == NULL)

	866 return;

	867 uint8 data = reinterpret_cast<uint8>(this);

	868 delete[] data;

	869 }

	870

	871 CharClass* CharClass::Negate() {

	872 CharClass* cc = CharClass::New(nranges_+1);

	873 cc->folds_ascii_ = folds_ascii_;

	874 cc->nrunes_ = Runemax + 1 - nrunes_;

	875 int n = 0;

	876 int nextlo = 0;

	877 for (CharClass::iterator it = begin(); it != end(); ++it) {

	878 if (it->lo == nextlo) {

	879 nextlo = it->hi + 1;

	880 } else {

	881 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);

	882 nextlo = it->hi + 1;

	883 }

	884 }

	885 if (nextlo <= Runemax)

	886 cc->ranges_[n++] = RuneRange(nextlo, Runemax);

	887 cc->nranges_ = n;

	888 return cc;

	889 }

	890

	891 bool CharClass::Contains(Rune r) {

	892 RuneRange* rr = ranges_;

	893 int n = nranges_;

	894 while (n > 0) {

	895 int m = n/2;

	896 if (rr[m].hi < r) {

	897 rr += m+1;

	898 n -= m+1;

	899 } else if (r < rr[m].lo) {

	900 n = m;

	901 } else { // rr[m].lo <= r && r <= rr[m].hi

	902 return true;

	903 }

	904 }

	905 return false;

	906 }

	907

	908 CharClass* CharClassBuilder::GetCharClass() {

	909 CharClass* cc = CharClass::New(ranges_.size());

	910 int n = 0;

	911 for (iterator it = begin(); it != end(); ++it)

	912 cc->ranges_[n++] = *it;

	913 cc->nranges_ = n;

	914 DCHECK_LE(n, ranges_.size());

	915 cc->nrunes_ = nrunes_;

	916 cc->folds_ascii_ = FoldsASCII();

	917 return cc;

	918 }

	919

	920 } // namespace re2

OLD	NEW

« third_party/re2/README.chromium ('K') | « third_party/re2/re2/regexp.h ('k') | third_party/re2/re2/set.h » ('j') | no next file with comments »