third_party/re2/re2/regexp.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/regexp.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 5 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Regular expression representation.

6 // Tested by parse_test.cc

7

8 #include "util/util.h"

9 #include "re2/regexp.h"

10 #include "re2/stringpiece.h"

11 #include "re2/walker-inl.h"

12

13 namespace re2 {

14

15 // Constructor. Allocates vectors as appropriate for operator.

16 Regexp::Regexp(RegexpOp op, ParseFlags parse_flags)

17 : op_(static_cast<uint8>(op)),

18 simple_(false),

19 parse_flags_(static_cast<uint16>(parse_flags)),

20 ref_(1),

21 nsub_(0),

22 down_(NULL) {

23 subone_ = NULL;

24 memset(the_union_, 0, sizeof the_union_);

25 }

26

27 // Destructor. Assumes already cleaned up children.

28 // Private: use Decref() instead of delete to destroy Regexps.

29 // Can't call Decref on the sub-Regexps here because

30 // that could cause arbitrarily deep recursion, so

31 // required Decref() to have handled them for us.

32 Regexp::~Regexp() {

33 if (nsub_ > 0)

34 LOG(DFATAL) << "Regexp not destroyed.";

35

36 switch (op_) {

37 default:

38 break;

39 case kRegexpCapture:

40 delete name_;

41 break;

42 case kRegexpLiteralString:

43 delete[] runes_;

44 break;

45 case kRegexpCharClass:

46 if (cc_)

47 cc_->Delete();

48 delete ccb_;

49 break;

50 }

51 }

52

53 // If it's possible to destroy this regexp without recurring,

54 // do so and return true. Else return false.

55 bool Regexp::QuickDestroy() {

56 if (nsub_ == 0) {

57 delete this;

58 return true;

59 }

60 return false;

61 }

62

63 static map<Regexp, int> ref_map;

64 GLOBAL_MUTEX(ref_mutex);

65

66 int Regexp::Ref() {

67 if (ref_ < kMaxRef)

68 return ref_;

69

70 GLOBAL_MUTEX_LOCK(ref_mutex);

71 int r = 0;

72 if (ref_map != NULL) {

73 r = (*ref_map)[this];

74 }

75 GLOBAL_MUTEX_UNLOCK(ref_mutex);

76 return r;

77 }

78

79 // Increments reference count, returns object as convenience.

80 Regexp* Regexp::Incref() {

81 if (ref_ >= kMaxRef-1) {

82 // Store ref count in overflow map.

83 GLOBAL_MUTEX_LOCK(ref_mutex);

84 if (ref_map == NULL) {

85 ref_map = new map<Regexp*, int>;

86 }

87 if (ref_ == kMaxRef) {

88 // already overflowed

89 (*ref_map)[this]++;

90 } else {

91 // overflowing now

92 (*ref_map)[this] = kMaxRef;

93 ref_ = kMaxRef;

94 }

95 GLOBAL_MUTEX_UNLOCK(ref_mutex);

96 return this;

97 }

98

99 ref_++;

100 return this;

101 }

102

103 // Decrements reference count and deletes this object if count reaches 0.

104 void Regexp::Decref() {

105 if (ref_ == kMaxRef) {

106 // Ref count is stored in overflow map.

107 GLOBAL_MUTEX_LOCK(ref_mutex);

108 int r = (*ref_map)[this] - 1;

109 if (r < kMaxRef) {

110 ref_ = static_cast<uint16>(r);

111 ref_map->erase(this);

112 } else {

113 (*ref_map)[this] = r;

114 }

115 GLOBAL_MUTEX_UNLOCK(ref_mutex);

116 return;

117 }

118 ref_--;

119 if (ref_ == 0)

120 Destroy();

121 }

122

123 // Deletes this object; ref count has count reached 0.

124 void Regexp::Destroy() {

125 if (QuickDestroy())

126 return;

127

128 // Handle recursive Destroy with explicit stack

129 // to avoid arbitrarily deep recursion on process stack [sigh].

130 down_ = NULL;

131 Regexp* stack = this;

132 while (stack != NULL) {

133 Regexp* re = stack;

134 stack = re->down_;

135 if (re->ref_ != 0)

136 LOG(DFATAL) << "Bad reference count " << re->ref_;

137 if (re->nsub_ > 0) {

138 Regexp** subs = re->sub();

139 for (int i = 0; i < re->nsub_; i++) {

140 Regexp* sub = subs[i];

141 if (sub == NULL)

142 continue;

143 if (sub->ref_ == kMaxRef)

144 sub->Decref();

145 else

146 --sub->ref_;

147 if (sub->ref_ == 0 && !sub->QuickDestroy()) {

148 sub->down_ = stack;

149 stack = sub;

150 }

151 }

152 if (re->nsub_ > 1)

153 delete[] subs;

154 re->nsub_ = 0;

155 }

156 delete re;

157 }

158 }

159

160 void Regexp::AddRuneToString(Rune r) {

161 DCHECK(op_ == kRegexpLiteralString);

162 if (nrunes_ == 0) {

163 // start with 8

164 runes_ = new Rune[8];

165 } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) {

166 // double on powers of two

167 Rune *old = runes_;

168 runes_ = new Rune[nrunes_ * 2];

169 for (int i = 0; i < nrunes_; i++)

170 runes_[i] = old[i];

171 delete[] old;

172 }

173

174 runes_[nrunes_++] = r;

175 }

176

177 Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) {

178 Regexp* re = new Regexp(kRegexpHaveMatch, flags);

179 re->match_id_ = match_id;

180 return re;

181 }

182

183 Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) {

184 if (sub->op() == kRegexpPlus && sub->parse_flags() == flags)

185 return sub;

186 Regexp* re = new Regexp(kRegexpPlus, flags);

187 re->AllocSub(1);

188 re->sub()[0] = sub;

189 return re;

190 }

191

192 Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) {

193 if (sub->op() == kRegexpStar && sub->parse_flags() == flags)

194 return sub;

195 Regexp* re = new Regexp(kRegexpStar, flags);

196 re->AllocSub(1);

197 re->sub()[0] = sub;

198 return re;

199 }

200

201 Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) {

202 if (sub->op() == kRegexpQuest && sub->parse_flags() == flags)

203 return sub;

204 Regexp* re = new Regexp(kRegexpQuest, flags);

205 re->AllocSub(1);

206 re->sub()[0] = sub;

207 return re;

208 }

209

210 Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub,

211 ParseFlags flags, bool can_factor) {

212 if (nsub == 1)

213 return sub[0];

214

215 if (nsub == 0) {

216 if (op == kRegexpAlternate)

217 return new Regexp(kRegexpNoMatch, flags);

218 else

219 return new Regexp(kRegexpEmptyMatch, flags);

220 }

221

222 Regexp** subcopy = NULL;

223 if (op == kRegexpAlternate && can_factor) {

224 // Going to edit sub; make a copy so we don't step on caller.

225 subcopy = new Regexp*[nsub];

226 memmove(subcopy, sub, nsub * sizeof sub[0]);

227 sub = subcopy;

228 nsub = FactorAlternation(sub, nsub, flags);

229 if (nsub == 1) {

230 Regexp* re = sub[0];

231 delete[] subcopy;

232 return re;

233 }

234 }

235

236 if (nsub > kMaxNsub) {

237 // Too many subexpressions to fit in a single Regexp.

238 // Make a two-level tree. Two levels gets us to 65535^2.

239 int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub;

240 Regexp* re = new Regexp(op, flags);

241 re->AllocSub(nbigsub);

242 Regexp** subs = re->sub();

243 for (int i = 0; i < nbigsub - 1; i++)

244 subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false);

245 subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub,

246 nsub - (nbigsub-1)*kMaxNsub, flags,

247 false);

248 delete[] subcopy;

249 return re;

250 }

251

252 Regexp* re = new Regexp(op, flags);

253 re->AllocSub(nsub);

254 Regexp** subs = re->sub();

255 for (int i = 0; i < nsub; i++)

256 subs[i] = sub[i];

257

258 delete[] subcopy;

259 return re;

260 }

261

262 Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) {

263 return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false);

264 }

265

266 Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) {

267 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true);

268 }

269

270 Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) {

271 return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false);

272 }

273

274 Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) {

275 Regexp* re = new Regexp(kRegexpCapture, flags);

276 re->AllocSub(1);

277 re->sub()[0] = sub;

278 re->cap_ = cap;

279 return re;

280 }

281

282 Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) {

283 Regexp* re = new Regexp(kRegexpRepeat, flags);

284 re->AllocSub(1);

285 re->sub()[0] = sub;

286 re->min_ = min;

287 re->max_ = max;

288 return re;

289 }

290

291 Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) {

292 Regexp* re = new Regexp(kRegexpLiteral, flags);

293 re->rune_ = rune;

294 return re;

295 }

296

297 Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) {

298 if (nrunes <= 0)

299 return new Regexp(kRegexpEmptyMatch, flags);

300 if (nrunes == 1)

301 return NewLiteral(runes[0], flags);

302 Regexp* re = new Regexp(kRegexpLiteralString, flags);

303 for (int i = 0; i < nrunes; i++)

304 re->AddRuneToString(runes[i]);

305 return re;

306 }

307

308 Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) {

309 Regexp* re = new Regexp(kRegexpCharClass, flags);

310 re->cc_ = cc;

311 return re;

312 }

313

314 // Swaps this and that in place.

315 void Regexp::Swap(Regexp* that) {

316 // Can use memmove because Regexp is just a struct (no vtable).

317 char tmp[sizeof *this];

318 memmove(tmp, this, sizeof tmp);

319 memmove(this, that, sizeof tmp);

320 memmove(that, tmp, sizeof tmp);

321 }

322

323 // Tests equality of all top-level structure but not subregexps.

324 static bool TopEqual(Regexp* a, Regexp* b) {

325 if (a->op() != b->op())

326 return false;

327

328 switch (a->op()) {

329 case kRegexpNoMatch:

330 case kRegexpEmptyMatch:

331 case kRegexpAnyChar:

332 case kRegexpAnyByte:

333 case kRegexpBeginLine:

334 case kRegexpEndLine:

335 case kRegexpWordBoundary:

336 case kRegexpNoWordBoundary:

337 case kRegexpBeginText:

338 return true;

339

340 case kRegexpEndText:

341 // The parse flags remember whether it's \z or (?-m:$),

342 // which matters when testing against PCRE.

343 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0;

344

345 case kRegexpLiteral:

346 return a->rune() == b->rune() &&

347 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0;

348

349 case kRegexpLiteralString:

350 return a->nrunes() == b->nrunes() &&

351 ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 &&

352 memcmp(a->runes(), b->runes(),

353 a->nrunes() * sizeof a->runes()[0]) == 0;

354

355 case kRegexpAlternate:

356 case kRegexpConcat:

357 return a->nsub() == b->nsub();

358

359 case kRegexpStar:

360 case kRegexpPlus:

361 case kRegexpQuest:

362 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0;

363

364 case kRegexpRepeat:

365 return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 &&

366 a->min() == b->min() &&

367 a->max() == b->max();

368

369 case kRegexpCapture:

370 return a->cap() == b->cap() && a->name() == b->name();

371

372 case kRegexpHaveMatch:

373 return a->match_id() == b->match_id();

374

375 case kRegexpCharClass: {

376 CharClass* acc = a->cc();

377 CharClass* bcc = b->cc();

378 return acc->size() == bcc->size() &&

379 acc->end() - acc->begin() == bcc->end() - bcc->begin() &&

380 memcmp(acc->begin(), bcc->begin(),

381 (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0;

382 }

383 }

384

385 LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op();

386 return 0;

387 }

388

389 bool Regexp::Equal(Regexp* a, Regexp* b) {

390 if (a == NULL \|\| b == NULL)

391 return a == b;

392

393 if (!TopEqual(a, b))

394 return false;

395

396 // Fast path:

397 // return without allocating vector if there are no subregexps.

398 switch (a->op()) {

399 case kRegexpAlternate:

400 case kRegexpConcat:

401 case kRegexpStar:

402 case kRegexpPlus:

403 case kRegexpQuest:

404 case kRegexpRepeat:

405 case kRegexpCapture:

406 break;

407

408 default:

409 return true;

410 }

411

412 // Committed to doing real work.

413 // The stack (vector) has pairs of regexps waiting to

414 // be compared. The regexps are only equal if

415 // all the pairs end up being equal.

416 vector<Regexp*> stk;

417

418 for (;;) {

419 // Invariant: TopEqual(a, b) == true.

420 Regexp* a2;

421 Regexp* b2;

422 switch (a->op()) {

423 default:

424 break;

425 case kRegexpAlternate:

426 case kRegexpConcat:

427 for (int i = 0; i < a->nsub(); i++) {

428 a2 = a->sub()[i];

429 b2 = b->sub()[i];

430 if (!TopEqual(a2, b2))

431 return false;

432 stk.push_back(a2);

433 stk.push_back(b2);

434 }

435 break;

436

437 case kRegexpStar:

438 case kRegexpPlus:

439 case kRegexpQuest:

440 case kRegexpRepeat:

441 case kRegexpCapture:

442 a2 = a->sub()[0];

443 b2 = b->sub()[0];

444 if (!TopEqual(a2, b2))

445 return false;

446 // Really:

447 // stk.push_back(a2);

448 // stk.push_back(b2);

449 // break;

450 // but faster to assign directly and loop.

451 a = a2;

452 b = b2;

453 continue;

454 }

455

456 size_t n = stk.size();

457 if (n == 0)

458 break;

459

460 DCHECK_GE(n, 2);

461 a = stk[n-2];

462 b = stk[n-1];

463 stk.resize(n-2);

464 }

465

466 return true;

467 }

468

469 // Keep in sync with enum RegexpStatusCode in regexp.h

470 static const char *kErrorStrings[] = {

471 "no error",

472 "unexpected error",

473 "invalid escape sequence",

474 "invalid character class",

475 "invalid character class range",

476 "missing ]",

477 "missing )",

478 "trailing \\",

479 "no argument for repetition operator",

480 "invalid repetition size",

481 "bad repetition operator",

482 "invalid perl operator",

483 "invalid UTF-8",

484 "invalid named capture group",

485 };

486

487 string RegexpStatus::CodeText(enum RegexpStatusCode code) {

488 if (code < 0 \|\| code >= arraysize(kErrorStrings))

489 code = kRegexpInternalError;

490 return kErrorStrings[code];

491 }

492

493 string RegexpStatus::Text() const {

494 if (error_arg_.empty())

495 return CodeText(code_);

496 string s;

497 s.append(CodeText(code_));

498 s.append(": ");

499 s.append(error_arg_.data(), error_arg_.size());

500 return s;

501 }

502

503 void RegexpStatus::Copy(const RegexpStatus& status) {

504 code_ = status.code_;

505 error_arg_ = status.error_arg_;

506 }

507

508 typedef int Ignored; // Walker<void> doesn't exist

509

510 // Walker subclass to count capturing parens in regexp.

511 class NumCapturesWalker : public Regexp::Walker<Ignored> {

512 public:

513 NumCapturesWalker() : ncapture_(0) {}

514 int ncapture() { return ncapture_; }

515

516 virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

517 if (re->op() == kRegexpCapture)

518 ncapture_++;

519 return ignored;

520 }

521 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

522 // Should never be called: we use Walk not WalkExponential.

523 LOG(DFATAL) << "NumCapturesWalker::ShortVisit called";

524 return ignored;

525 }

526

527 private:

528 int ncapture_;

529 DISALLOW_COPY_AND_ASSIGN(NumCapturesWalker);

530 };

531

532 int Regexp::NumCaptures() {

533 NumCapturesWalker w;

534 w.Walk(this, 0);

535 return w.ncapture();

536 }

537

538 // Walker class to build map of named capture groups and their indices.

539 class NamedCapturesWalker : public Regexp::Walker<Ignored> {

540 public:

541 NamedCapturesWalker() : map_(NULL) {}

542 ~NamedCapturesWalker() { delete map_; }

543

544 map<string, int>* TakeMap() {

545 map<string, int>* m = map_;

546 map_ = NULL;

547 return m;

548 }

549

550 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

551 if (re->op() == kRegexpCapture && re->name() != NULL) {

552 // Allocate map once we find a name.

553 if (map_ == NULL)

554 map_ = new map<string, int>;

555

556 // Record first occurrence of each name.

557 // (The rule is that if you have the same name

558 // multiple times, only the leftmost one counts.)

559 if (map_->find(*re->name()) == map_->end())

560 (map_)[re->name()] = re->cap();

561 }

562 return ignored;

563 }

564

565 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

566 // Should never be called: we use Walk not WalkExponential.

567 LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called";

568 return ignored;

569 }

570

571 private:

572 map<string, int>* map_;

573 DISALLOW_COPY_AND_ASSIGN(NamedCapturesWalker);

574 };

575

576 map<string, int>* Regexp::NamedCaptures() {

577 NamedCapturesWalker w;

578 w.Walk(this, 0);

579 return w.TakeMap();

580 }

581

582 // Walker class to build map from capture group indices to their names.

583 class CaptureNamesWalker : public Regexp::Walker<Ignored> {

584 public:

585 CaptureNamesWalker() : map_(NULL) {}

586 ~CaptureNamesWalker() { delete map_; }

587

588 map<int, string>* TakeMap() {

589 map<int, string>* m = map_;

590 map_ = NULL;

591 return m;

592 }

593

594 Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) {

595 if (re->op() == kRegexpCapture && re->name() != NULL) {

596 // Allocate map once we find a name.

597 if (map_ == NULL)

598 map_ = new map<int, string>;

599

600 (map_)[re->cap()] = re->name();

601 }

602 return ignored;

603 }

604

605 virtual Ignored ShortVisit(Regexp* re, Ignored ignored) {

606 // Should never be called: we use Walk not WalkExponential.

607 LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called";

608 return ignored;

609 }

610

611 private:

612 map<int, string>* map_;

613 DISALLOW_COPY_AND_ASSIGN(CaptureNamesWalker);

614 };

615

616 map<int, string>* Regexp::CaptureNames() {

617 CaptureNamesWalker w;

618 w.Walk(this, 0);

619 return w.TakeMap();

620 }

621

622 // Determines whether regexp matches must be anchored

623 // with a fixed string prefix. If so, returns the prefix and

624 // the regexp that remains after the prefix. The prefix might

625 // be ASCII case-insensitive.

626 bool Regexp::RequiredPrefix(string prefix, bool foldcase, Regexp** suffix) {

627 // No need for a walker: the regexp must be of the form

628 // 1. some number of ^ anchors

629 // 2. a literal char or string

630 // 3. the rest

631 prefix->clear();

632 *foldcase = false;

633 *suffix = NULL;

634 if (op_ != kRegexpConcat)

635 return false;

636

637 // Some number of anchors, then a literal or concatenation.

638 int i = 0;

639 Regexp** sub = this->sub();

640 while (i < nsub_ && sub[i]->op_ == kRegexpBeginText)

641 i++;

642 if (i == 0 \|\| i >= nsub_)

643 return false;

644

645 Regexp* re = sub[i];

646 switch (re->op_) {

647 default:

648 return false;

649

650 case kRegexpLiteralString:

651 // Convert to string in proper encoding.

652 if (re->parse_flags() & Latin1) {

653 prefix->resize(re->nrunes_);

654 for (int j = 0; j < re->nrunes_; j++)

655 (*prefix)[j] = static_cast<char>(re->runes_[j]);

656 } else {

657 // Convert to UTF-8 in place.

658 // Assume worst-case space and then trim.

659 prefix->resize(re->nrunes_ * UTFmax);

660 char p = &(prefix)[0];

661 for (int j = 0; j < re->nrunes_; j++) {

662 Rune r = re->runes_[j];

663 if (r < Runeself)

664 *p++ = static_cast<char>(r);

665 else

666 p += runetochar(p, &r);

667 }

668 prefix->resize(p - &(*prefix)[0]);

669 }

670 break;

671

672 case kRegexpLiteral:

673 if ((re->parse_flags() & Latin1) \|\| re->rune_ < Runeself) {

674 prefix->append(1, static_cast<char>(re->rune_));

675 } else {

676 char buf[UTFmax];

677 prefix->append(buf, runetochar(buf, &re->rune_));

678 }

679 break;

680 }

681 *foldcase = (sub[i]->parse_flags() & FoldCase) != 0;

682 i++;

683

684 // The rest.

685 if (i < nsub_) {

686 for (int j = i; j < nsub_; j++)

687 sub[j]->Incref();

688 re = Concat(sub + i, nsub_ - i, parse_flags());

689 } else {

690 re = new Regexp(kRegexpEmptyMatch, parse_flags());

691 }

692 *suffix = re;

693 return true;

694 }

695

696 // Character class builder is a balanced binary tree (STL set)

697 // containing non-overlapping, non-abutting RuneRanges.

698 // The less-than operator used in the tree treats two

699 // ranges as equal if they overlap at all, so that

700 // lookups for a particular Rune are possible.

701

702 CharClassBuilder::CharClassBuilder() {

703 nrunes_ = 0;

704 upper_ = 0;

705 lower_ = 0;

706 }

707

708 // Add lo-hi to the class; return whether class got bigger.

709 bool CharClassBuilder::AddRange(Rune lo, Rune hi) {

710 if (hi < lo)

711 return false;

712

713 if (lo <= 'z' && hi >= 'A') {

714 // Overlaps some alpha, maybe not all.

715 // Update bitmaps telling which ASCII letters are in the set.

716 Rune lo1 = max<Rune>(lo, 'A');

717 Rune hi1 = min<Rune>(hi, 'Z');

718 if (lo1 <= hi1)

719 upper_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A');

720

721 lo1 = max<Rune>(lo, 'a');

722 hi1 = min<Rune>(hi, 'z');

723 if (lo1 <= hi1)

724 lower_ \|= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a');

725 }

726

727 { // Check whether lo, hi is already in the class.

728 iterator it = ranges_.find(RuneRange(lo, lo));

729 if (it != end() && it->lo <= lo && hi <= it->hi)

730 return false;

731 }

732

733 // Look for a range abutting lo on the left.

734 // If it exists, take it out and increase our range.

735 if (lo > 0) {

736 iterator it = ranges_.find(RuneRange(lo-1, lo-1));

737 if (it != end()) {

738 lo = it->lo;

739 if (it->hi > hi)

740 hi = it->hi;

741 nrunes_ -= it->hi - it->lo + 1;

742 ranges_.erase(it);

743 }

744 }

745

746 // Look for a range abutting hi on the right.

747 // If it exists, take it out and increase our range.

748 if (hi < Runemax) {

749 iterator it = ranges_.find(RuneRange(hi+1, hi+1));

750 if (it != end()) {

751 hi = it->hi;

752 nrunes_ -= it->hi - it->lo + 1;

753 ranges_.erase(it);

754 }

755 }

756

757 // Look for ranges between lo and hi. Take them out.

758 // This is only safe because the set has no overlapping ranges.

759 // We've already removed any ranges abutting lo and hi, so

760 // any that overlap [lo, hi] must be contained within it.

761 for (;;) {

762 iterator it = ranges_.find(RuneRange(lo, hi));

763 if (it == end())

764 break;

765 nrunes_ -= it->hi - it->lo + 1;

766 ranges_.erase(it);

767 }

768

769 // Finally, add [lo, hi].

770 nrunes_ += hi - lo + 1;

771 ranges_.insert(RuneRange(lo, hi));

772 return true;

773 }

774

775 void CharClassBuilder::AddCharClass(CharClassBuilder *cc) {

776 for (iterator it = cc->begin(); it != cc->end(); ++it)

777 AddRange(it->lo, it->hi);

778 }

779

780 bool CharClassBuilder::Contains(Rune r) {

781 return ranges_.find(RuneRange(r, r)) != end();

782 }

783

784 // Does the character class behave the same on A-Z as on a-z?

785 bool CharClassBuilder::FoldsASCII() {

786 return ((upper_ ^ lower_) & AlphaMask) == 0;

787 }

788

789 CharClassBuilder* CharClassBuilder::Copy() {

790 CharClassBuilder* cc = new CharClassBuilder;

791 for (iterator it = begin(); it != end(); ++it)

792 cc->ranges_.insert(RuneRange(it->lo, it->hi));

793 cc->upper_ = upper_;

794 cc->lower_ = lower_;

795 cc->nrunes_ = nrunes_;

796 return cc;

797 }

798

799

800

801 void CharClassBuilder::RemoveAbove(Rune r) {

802 if (r >= Runemax)

803 return;

804

805 if (r < 'z') {

806 if (r < 'a')

807 lower_ = 0;

808 else

809 lower_ &= AlphaMask >> ('z' - r);

810 }

811

812 if (r < 'Z') {

813 if (r < 'A')

814 upper_ = 0;

815 else

816 upper_ &= AlphaMask >> ('Z' - r);

817 }

818

819 for (;;) {

820

821 iterator it = ranges_.find(RuneRange(r + 1, Runemax));

822 if (it == end())

823 break;

824 RuneRange rr = *it;

825 ranges_.erase(it);

826 nrunes_ -= rr.hi - rr.lo + 1;

827 if (rr.lo <= r) {

828 rr.hi = r;

829 ranges_.insert(rr);

830 nrunes_ += rr.hi - rr.lo + 1;

831 }

832 }

833 }

834

835 void CharClassBuilder::Negate() {

836 // Build up negation and then copy in.

837 // Could edit ranges in place, but C++ won't let me.

838 vector<RuneRange> v;

839 v.reserve(ranges_.size() + 1);

840

841 // In negation, first range begins at 0, unless

842 // the current class begins at 0.

843 iterator it = begin();

844 if (it == end()) {

845 v.push_back(RuneRange(0, Runemax));

846 } else {

847 int nextlo = 0;

848 if (it->lo == 0) {

849 nextlo = it->hi + 1;

850 ++it;

851 }

852 for (; it != end(); ++it) {

853 v.push_back(RuneRange(nextlo, it->lo - 1));

854 nextlo = it->hi + 1;

855 }

856 if (nextlo <= Runemax)

857 v.push_back(RuneRange(nextlo, Runemax));

858 }

859

860 ranges_.clear();

861 for (size_t i = 0; i < v.size(); i++)

862 ranges_.insert(v[i]);

863

864 upper_ = AlphaMask & ~upper_;

865 lower_ = AlphaMask & ~lower_;

866 nrunes_ = Runemax+1 - nrunes_;

867 }

868

869 // Character class is a sorted list of ranges.

870 // The ranges are allocated in the same block as the header,

871 // necessitating a special allocator and Delete method.

872

873 CharClass* CharClass::New(int maxranges) {

874 CharClass* cc;

875 uint8* data = new uint8[sizeof cc + maxrangessizeof cc->ranges_[0]];

876 cc = reinterpret_cast<CharClass*>(data);

877 cc->ranges_ = reinterpret_cast<RuneRange>(data + sizeof cc);

878 cc->nranges_ = 0;

879 cc->folds_ascii_ = false;

880 cc->nrunes_ = 0;

881 return cc;

882 }

883

884 void CharClass::Delete() {

885 uint8 data = reinterpret_cast<uint8>(this);

886 delete[] data;

887 }

888

889 CharClass* CharClass::Negate() {

890 CharClass* cc = CharClass::New(nranges_+1);

891 cc->folds_ascii_ = folds_ascii_;

892 cc->nrunes_ = Runemax + 1 - nrunes_;

893 int n = 0;

894 int nextlo = 0;

895 for (CharClass::iterator it = begin(); it != end(); ++it) {

896 if (it->lo == nextlo) {

897 nextlo = it->hi + 1;

898 } else {

899 cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1);

900 nextlo = it->hi + 1;

901 }

902 }

903 if (nextlo <= Runemax)

904 cc->ranges_[n++] = RuneRange(nextlo, Runemax);

905 cc->nranges_ = n;

906 return cc;

907 }

908

909 bool CharClass::Contains(Rune r) {

910 RuneRange* rr = ranges_;

911 int n = nranges_;

912 while (n > 0) {

913 int m = n/2;

914 if (rr[m].hi < r) {

915 rr += m+1;

916 n -= m+1;

917 } else if (r < rr[m].lo) {

918 n = m;

919 } else { // rr[m].lo <= r && r <= rr[m].hi

920 return true;

921 }

922 }

923 return false;

924 }

925

926 CharClass* CharClassBuilder::GetCharClass() {

927 CharClass* cc = CharClass::New(static_cast<int>(ranges_.size()));

928 int n = 0;

929 for (iterator it = begin(); it != end(); ++it)

930 cc->ranges_[n++] = *it;

931 cc->nranges_ = n;

932 DCHECK_LE(n, static_cast<int>(ranges_.size()));

933 cc->nrunes_ = nrunes_;

934 cc->folds_ascii_ = FoldsASCII();

935 return cc;

936 }

937

938 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/regexp.h ('k') | third_party/re2/re2/set.h » ('j') | no next file with comments »