third_party/re2/re2/parse.cc - Issue 1544433002: Replace RE2 import with a dependency

Side by Side Diff: third_party/re2/re2/parse.cc

Issue 1544433002: Replace RE2 import with a dependency (Closed) Base URL: https://chromium.googlesource.com/chromium/src.git@master

Patch Set: Re-Added LICENSE and OWNERS file Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
	(Empty)
1 // Copyright 2006 The RE2 Authors. All Rights Reserved.

2 // Use of this source code is governed by a BSD-style

3 // license that can be found in the LICENSE file.

4

5 // Regular expression parser.

6

7 // The parser is a simple precedence-based parser with a

8 // manual stack. The parsing work is done by the methods

9 // of the ParseState class. The Regexp::Parse function is

10 // essentially just a lexer that calls the ParseState method

11 // for each token.

12

13 // The parser recognizes POSIX extended regular expressions

14 // excluding backreferences, collating elements, and collating

15 // classes. It also allows the empty string as a regular expression

16 // and recognizes the Perl escape sequences \d, \s, \w, \D, \S, and \W.

17 // See regexp.h for rationale.

18

19 #include "util/util.h"

20 #include "re2/regexp.h"

21 #include "re2/stringpiece.h"

22 #include "re2/unicode_casefold.h"

23 #include "re2/unicode_groups.h"

24 #include "re2/walker-inl.h"

25

26 namespace re2 {

27

28 // Regular expression parse state.

29 // The list of parsed regexps so far is maintained as a vector of

30 // Regexp pointers called the stack. Left parenthesis and vertical

31 // bar markers are also placed on the stack, as Regexps with

32 // non-standard opcodes.

33 // Scanning a left parenthesis causes the parser to push a left parenthesis

34 // marker on the stack.

35 // Scanning a vertical bar causes the parser to pop the stack until it finds a

36 // vertical bar or left parenthesis marker (not popping the marker),

37 // concatenate all the popped results, and push them back on

38 // the stack (DoConcatenation).

39 // Scanning a right parenthesis causes the parser to act as though it

40 // has seen a vertical bar, which then leaves the top of the stack in the

41 // form LeftParen regexp VerticalBar regexp VerticalBar ... regexp VerticalBar.

42 // The parser pops all this off the stack and creates an alternation of the

43 // regexps (DoAlternation).

44

45 class Regexp::ParseState {

46 public:

47 ParseState(ParseFlags flags, const StringPiece& whole_regexp,

48 RegexpStatus* status);

49 ~ParseState();

50

51 ParseFlags flags() { return flags_; }

52 int rune_max() { return rune_max_; }

53

54 // Parse methods. All public methods return a bool saying

55 // whether parsing should continue. If a method returns

56 // false, it has set fields in *status_, and the parser

57 // should return NULL.

58

59 // Pushes the given regular expression onto the stack.

60 // Could check for too much memory used here.

61 bool PushRegexp(Regexp* re);

62

63 // Pushes the literal rune r onto the stack.

64 bool PushLiteral(Rune r);

65

66 // Pushes a regexp with the given op (and no args) onto the stack.

67 bool PushSimpleOp(RegexpOp op);

68

69 // Pushes a ^ onto the stack.

70 bool PushCarat();

71

72 // Pushes a \b (word == true) or \B (word == false) onto the stack.

73 bool PushWordBoundary(bool word);

74

75 // Pushes a $ onto the stack.

76 bool PushDollar();

77

78 // Pushes a . onto the stack

79 bool PushDot();

80

81 // Pushes a repeat operator regexp onto the stack.

82 // A valid argument for the operator must already be on the stack.

83 // s is the name of the operator, for use in error messages.

84 bool PushRepeatOp(RegexpOp op, const StringPiece& s, bool nongreedy);

85

86 // Pushes a repetition regexp onto the stack.

87 // A valid argument for the operator must already be on the stack.

88 bool PushRepetition(int min, int max, const StringPiece& s, bool nongreedy);

89

90 // Checks whether a particular regexp op is a marker.

91 bool IsMarker(RegexpOp op);

92

93 // Processes a left parenthesis in the input.

94 // Pushes a marker onto the stack.

95 bool DoLeftParen(const StringPiece& name);

96 bool DoLeftParenNoCapture();

97

98 // Processes a vertical bar in the input.

99 bool DoVerticalBar();

100

101 // Processes a right parenthesis in the input.

102 bool DoRightParen();

103

104 // Processes the end of input, returning the final regexp.

105 Regexp* DoFinish();

106

107 // Finishes the regexp if necessary, preparing it for use

108 // in a more complicated expression.

109 // If it is a CharClassBuilder, converts into a CharClass.

110 Regexp* FinishRegexp(Regexp*);

111

112 // These routines don't manipulate the parse stack

113 // directly, but they do need to look at flags_.

114 // ParseCharClass also manipulates the internals of Regexp

115 // while creating *out_re.

116

117 // Parse a character class into *out_re.

118 // Removes parsed text from s.

119 bool ParseCharClass(StringPiece* s, Regexp** out_re,

120 RegexpStatus* status);

121

122 // Parse a character class character into *rp.

123 // Removes parsed text from s.

124 bool ParseCCCharacter(StringPiece* s, Rune *rp,

125 const StringPiece& whole_class,

126 RegexpStatus* status);

127

128 // Parse a character class range into rr.

129 // Removes parsed text from s.

130 bool ParseCCRange(StringPiece* s, RuneRange* rr,

131 const StringPiece& whole_class,

132 RegexpStatus* status);

133

134 // Parse a Perl flag set or non-capturing group from s.

135 bool ParsePerlFlags(StringPiece* s);

136

137

138 // Finishes the current concatenation,

139 // collapsing it into a single regexp on the stack.

140 void DoConcatenation();

141

142 // Finishes the current alternation,

143 // collapsing it to a single regexp on the stack.

144 void DoAlternation();

145

146 // Generalized DoAlternation/DoConcatenation.

147 void DoCollapse(RegexpOp op);

148

149 // Maybe concatenate Literals into LiteralString.

150 bool MaybeConcatString(int r, ParseFlags flags);

151

152 private:

153 ParseFlags flags_;

154 StringPiece whole_regexp_;

155 RegexpStatus* status_;

156 Regexp* stacktop_;

157 int ncap_; // number of capturing parens seen

158 int rune_max_; // maximum char value for this encoding

159

160 DISALLOW_COPY_AND_ASSIGN(ParseState);

161 };

162

163 // Pseudo-operators - only on parse stack.

164 const RegexpOp kLeftParen = static_cast<RegexpOp>(kMaxRegexpOp+1);

165 const RegexpOp kVerticalBar = static_cast<RegexpOp>(kMaxRegexpOp+2);

166

167 Regexp::ParseState::ParseState(ParseFlags flags,

168 const StringPiece& whole_regexp,

169 RegexpStatus* status)

170 : flags_(flags), whole_regexp_(whole_regexp),

171 status_(status), stacktop_(NULL), ncap_(0) {

172 if (flags_ & Latin1)

173 rune_max_ = 0xFF;

174 else

175 rune_max_ = Runemax;

176 }

177

178 // Cleans up by freeing all the regexps on the stack.

179 Regexp::ParseState::~ParseState() {

180 Regexp* next;

181 for (Regexp* re = stacktop_; re != NULL; re = next) {

182 next = re->down_;

183 re->down_ = NULL;

184 if (re->op() == kLeftParen)

185 delete re->name_;

186 re->Decref();

187 }

188 }

189

190 // Finishes the regexp if necessary, preparing it for use in

191 // a more complex expression.

192 // If it is a CharClassBuilder, converts into a CharClass.

193 Regexp* Regexp::ParseState::FinishRegexp(Regexp* re) {

194 if (re == NULL)

195 return NULL;

196 re->down_ = NULL;

197

198 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {

199 CharClassBuilder* ccb = re->ccb_;

200 re->ccb_ = NULL;

201 re->cc_ = ccb->GetCharClass();

202 delete ccb;

203 }

204

205 return re;

206 }

207

208 // Pushes the given regular expression onto the stack.

209 // Could check for too much memory used here.

210 bool Regexp::ParseState::PushRegexp(Regexp* re) {

211 MaybeConcatString(-1, NoParseFlags);

212

213 // Special case: a character class of one character is just

214 // a literal. This is a common idiom for escaping

215 // single characters (e.g., [.] instead of \.), and some

216 // analysis does better with fewer character classes.

217 // Similarly, [Aa] can be rewritten as a literal A with ASCII case folding.

218 if (re->op_ == kRegexpCharClass && re->ccb_ != NULL) {

219 re->ccb_->RemoveAbove(rune_max_);

220 if (re->ccb_->size() == 1) {

221 Rune r = re->ccb_->begin()->lo;

222 re->Decref();

223 re = new Regexp(kRegexpLiteral, flags_);

224 re->rune_ = r;

225 } else if (re->ccb_->size() == 2) {

226 Rune r = re->ccb_->begin()->lo;

227 if ('A' <= r && r <= 'Z' && re->ccb_->Contains(r + 'a' - 'A')) {

228 re->Decref();

229 re = new Regexp(kRegexpLiteral, flags_ \| FoldCase);

230 re->rune_ = r + 'a' - 'A';

231 }

232 }

233 }

234

235 if (!IsMarker(re->op()))

236 re->simple_ = re->ComputeSimple();

237 re->down_ = stacktop_;

238 stacktop_ = re;

239 return true;

240 }

241

242 // Searches the case folding tables and returns the CaseFold* that contains r.

243 // If there isn't one, returns the CaseFold* with smallest f->lo bigger than r.

244 // If there isn't one, returns NULL.

245 const CaseFold* LookupCaseFold(const CaseFold *f, int n, Rune r) {

246 const CaseFold* ef = f + n;

247

248 // Binary search for entry containing r.

249 while (n > 0) {

250 int m = n/2;

251 if (f[m].lo <= r && r <= f[m].hi)

252 return &f[m];

253 if (r < f[m].lo) {

254 n = m;

255 } else {

256 f += m+1;

257 n -= m+1;

258 }

259 }

260

261 // There is no entry that contains r, but f points

262 // where it would have been. Unless f points at

263 // the end of the array, it points at the next entry

264 // after r.

265 if (f < ef)

266 return f;

267

268 // No entry contains r; no entry contains runes > r.

269 return NULL;

270 }

271

272 // Returns the result of applying the fold f to the rune r.

273 Rune ApplyFold(const CaseFold *f, Rune r) {

274 switch (f->delta) {

275 default:

276 return r + f->delta;

277

278 case EvenOddSkip: // even <-> odd but only applies to every other

279 if ((r - f->lo) % 2)

280 return r;

281 // fall through

282 case EvenOdd: // even <-> odd

283 if (r%2 == 0)

284 return r + 1;

285 return r - 1;

286

287 case OddEvenSkip: // odd <-> even but only applies to every other

288 if ((r - f->lo) % 2)

289 return r;

290 // fall through

291 case OddEven: // odd <-> even

292 if (r%2 == 1)

293 return r + 1;

294 return r - 1;

295 }

296 }

297

298 // Returns the next Rune in r's folding cycle (see unicode_casefold.h).

299 // Examples:

300 // CycleFoldRune('A') = 'a'

301 // CycleFoldRune('a') = 'A'

302 //

303 // CycleFoldRune('K') = 'k'

304 // CycleFoldRune('k') = 0x212A (Kelvin)

305 // CycleFoldRune(0x212A) = 'K'

306 //

307 // CycleFoldRune('?') = '?'

308 Rune CycleFoldRune(Rune r) {

309 const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, r);

310 if (f == NULL \|\| r < f->lo)

311 return r;

312 return ApplyFold(f, r);

313 }

314

315 // Add lo-hi to the class, along with their fold-equivalent characters.

316 // If lo-hi is already in the class, assume that the fold-equivalent

317 // chars are there too, so there's no work to do.

318 static void AddFoldedRange(CharClassBuilder* cc, Rune lo, Rune hi, int depth) {

319 // AddFoldedRange calls itself recursively for each rune in the fold cycle.

320 // Most folding cycles are small: there aren't any bigger than four in the

321 // current Unicode tables. make_unicode_casefold.py checks that

322 // the cycles are not too long, and we double-check here using depth.

323 if (depth > 10) {

324 LOG(DFATAL) << "AddFoldedRange recurses too much.";

325 return;

326 }

327

328 if (!cc->AddRange(lo, hi)) // lo-hi was already there? we're done

329 return;

330

331 while (lo <= hi) {

332 const CaseFold* f = LookupCaseFold(unicode_casefold, num_unicode_casefold, l o);

333 if (f == NULL) // lo has no fold, nor does anything above lo

334 break;

335 if (lo < f->lo) { // lo has no fold; next rune with a fold is f->lo

336 lo = f->lo;

337 continue;

338 }

339

340 // Add in the result of folding the range lo - f->hi

341 // and that range's fold, recursively.

342 Rune lo1 = lo;

343 Rune hi1 = min<Rune>(hi, f->hi);

344 switch (f->delta) {

345 default:

346 lo1 += f->delta;

347 hi1 += f->delta;

348 break;

349 case EvenOdd:

350 if (lo1%2 == 1)

351 lo1--;

352 if (hi1%2 == 0)

353 hi1++;

354 break;

355 case OddEven:

356 if (lo1%2 == 0)

357 lo1--;

358 if (hi1%2 == 1)

359 hi1++;

360 break;

361 }

362 AddFoldedRange(cc, lo1, hi1, depth+1);

363

364 // Pick up where this fold left off.

365 lo = f->hi + 1;

366 }

367 }

368

369 // Pushes the literal rune r onto the stack.

370 bool Regexp::ParseState::PushLiteral(Rune r) {

371 // Do case folding if needed.

372 if ((flags_ & FoldCase) && CycleFoldRune(r) != r) {

373 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

374 re->ccb_ = new CharClassBuilder;

375 Rune r1 = r;

376 do {

377 if (!(flags_ & NeverNL) \|\| r != '\n') {

378 re->ccb_->AddRange(r, r);

379 }

380 r = CycleFoldRune(r);

381 } while (r != r1);

382 return PushRegexp(re);

383 }

384

385 // Exclude newline if applicable.

386 if ((flags_ & NeverNL) && r == '\n')

387 return PushRegexp(new Regexp(kRegexpNoMatch, flags_));

388

389 // No fancy stuff worked. Ordinary literal.

390 if (MaybeConcatString(r, flags_))

391 return true;

392

393 Regexp* re = new Regexp(kRegexpLiteral, flags_);

394 re->rune_ = r;

395 return PushRegexp(re);

396 }

397

398 // Pushes a ^ onto the stack.

399 bool Regexp::ParseState::PushCarat() {

400 if (flags_ & OneLine) {

401 return PushSimpleOp(kRegexpBeginText);

402 }

403 return PushSimpleOp(kRegexpBeginLine);

404 }

405

406 // Pushes a \b or \B onto the stack.

407 bool Regexp::ParseState::PushWordBoundary(bool word) {

408 if (word)

409 return PushSimpleOp(kRegexpWordBoundary);

410 return PushSimpleOp(kRegexpNoWordBoundary);

411 }

412

413 // Pushes a $ onto the stack.

414 bool Regexp::ParseState::PushDollar() {

415 if (flags_ & OneLine) {

416 // Clumsy marker so that MimicsPCRE() can tell whether

417 // this kRegexpEndText was a $ and not a \z.

418 Regexp::ParseFlags oflags = flags_;

419 flags_ = flags_ \| WasDollar;

420 bool ret = PushSimpleOp(kRegexpEndText);

421 flags_ = oflags;

422 return ret;

423 }

424 return PushSimpleOp(kRegexpEndLine);

425 }

426

427 // Pushes a . onto the stack.

428 bool Regexp::ParseState::PushDot() {

429 if ((flags_ & DotNL) && !(flags_ & NeverNL))

430 return PushSimpleOp(kRegexpAnyChar);

431 // Rewrite . into [^\n]

432 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

433 re->ccb_ = new CharClassBuilder;

434 re->ccb_->AddRange(0, '\n' - 1);

435 re->ccb_->AddRange('\n' + 1, rune_max_);

436 return PushRegexp(re);

437 }

438

439 // Pushes a regexp with the given op (and no args) onto the stack.

440 bool Regexp::ParseState::PushSimpleOp(RegexpOp op) {

441 Regexp* re = new Regexp(op, flags_);

442 return PushRegexp(re);

443 }

444

445 // Pushes a repeat operator regexp onto the stack.

446 // A valid argument for the operator must already be on the stack.

447 // The char c is the name of the operator, for use in error messages.

448 bool Regexp::ParseState::PushRepeatOp(RegexpOp op, const StringPiece& s,

449 bool nongreedy) {

450 if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {

451 status_->set_code(kRegexpRepeatArgument);

452 status_->set_error_arg(s);

453 return false;

454 }

455 Regexp::ParseFlags fl = flags_;

456 if (nongreedy)

457 fl = fl ^ NonGreedy;

458 Regexp* re = new Regexp(op, fl);

459 re->AllocSub(1);

460 re->down_ = stacktop_->down_;

461 re->sub()[0] = FinishRegexp(stacktop_);

462 re->simple_ = re->ComputeSimple();

463 stacktop_ = re;

464 return true;

465 }

466

467 // RepetitionWalker reports whether the repetition regexp is valid.

468 // Valid means that the combination of the top-level repetition

469 // and any inner repetitions does not exceed n copies of the

470 // innermost thing.

471 // This rewalks the regexp tree and is called for every repetition,

472 // so we have to worry about inducing quadratic behavior in the parser.

473 // We avoid this by only using RepetitionWalker when min or max >= 2.

474 // In that case the depth of any >= 2 nesting can only get to 9 without

475 // triggering a parse error, so each subtree can only be rewalked 9 times.

476 class RepetitionWalker : public Regexp::Walker<int> {

477 public:

478 RepetitionWalker() {}

479 virtual int PreVisit(Regexp* re, int parent_arg, bool* stop);

480 virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg,

481 int* child_args, int nchild_args);

482 virtual int ShortVisit(Regexp* re, int parent_arg);

483

484 private:

485 DISALLOW_COPY_AND_ASSIGN(RepetitionWalker);

486 };

487

488 int RepetitionWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) {

489 int arg = parent_arg;

490 if (re->op() == kRegexpRepeat) {

491 int m = re->max();

492 if (m < 0) {

493 m = re->min();

494 }

495 if (m > 0) {

496 arg /= m;

497 }

498 }

499 return arg;

500 }

501

502 int RepetitionWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg,

503 int* child_args, int nchild_args) {

504 int arg = pre_arg;

505 for (int i = 0; i < nchild_args; i++) {

506 if (child_args[i] < arg) {

507 arg = child_args[i];

508 }

509 }

510 return arg;

511 }

512

513 int RepetitionWalker::ShortVisit(Regexp* re, int parent_arg) {

514 // This should never be called, since we use Walk and not

515 // WalkExponential.

516 LOG(DFATAL) << "RepetitionWalker::ShortVisit called";

517 return 0;

518 }

519

520 // Pushes a repetition regexp onto the stack.

521 // A valid argument for the operator must already be on the stack.

522 bool Regexp::ParseState::PushRepetition(int min, int max,

523 const StringPiece& s,

524 bool nongreedy) {

525 if ((max != -1 && max < min) \|\| min > 1000 \|\| max > 1000) {

526 status_->set_code(kRegexpRepeatSize);

527 status_->set_error_arg(s);

528 return false;

529 }

530 if (stacktop_ == NULL \|\| IsMarker(stacktop_->op())) {

531 status_->set_code(kRegexpRepeatArgument);

532 status_->set_error_arg(s);

533 return false;

534 }

535 Regexp::ParseFlags fl = flags_;

536 if (nongreedy)

537 fl = fl ^ NonGreedy;

538 Regexp* re = new Regexp(kRegexpRepeat, fl);

539 re->min_ = min;

540 re->max_ = max;

541 re->AllocSub(1);

542 re->down_ = stacktop_->down_;

543 re->sub()[0] = FinishRegexp(stacktop_);

544 re->simple_ = re->ComputeSimple();

545 stacktop_ = re;

546 if (min >= 2 \|\| max >= 2) {

547 RepetitionWalker w;

548 if (w.Walk(stacktop_, 1000) == 0) {

549 status_->set_code(kRegexpRepeatSize);

550 status_->set_error_arg(s);

551 return false;

552 }

553 }

554 return true;

555 }

556

557 // Checks whether a particular regexp op is a marker.

558 bool Regexp::ParseState::IsMarker(RegexpOp op) {

559 return op >= kLeftParen;

560 }

561

562 // Processes a left parenthesis in the input.

563 // Pushes a marker onto the stack.

564 bool Regexp::ParseState::DoLeftParen(const StringPiece& name) {

565 Regexp* re = new Regexp(kLeftParen, flags_);

566 re->cap_ = ++ncap_;

567 if (name.data() != NULL)

568 re->name_ = new string(name.as_string());

569 return PushRegexp(re);

570 }

571

572 // Pushes a non-capturing marker onto the stack.

573 bool Regexp::ParseState::DoLeftParenNoCapture() {

574 Regexp* re = new Regexp(kLeftParen, flags_);

575 re->cap_ = -1;

576 return PushRegexp(re);

577 }

578

579 // Processes a vertical bar in the input.

580 bool Regexp::ParseState::DoVerticalBar() {

581 MaybeConcatString(-1, NoParseFlags);

582 DoConcatenation();

583

584 // Below the vertical bar is a list to alternate.

585 // Above the vertical bar is a list to concatenate.

586 // We just did the concatenation, so either swap

587 // the result below the vertical bar or push a new

588 // vertical bar on the stack.

589 Regexp* r1;

590 Regexp* r2;

591 if ((r1 = stacktop_) != NULL &&

592 (r2 = r1->down_) != NULL &&

593 r2->op() == kVerticalBar) {

594 Regexp* r3;

595 if ((r3 = r2->down_) != NULL &&

596 (r1->op() == kRegexpAnyChar \|\| r3->op() == kRegexpAnyChar)) {

597 // AnyChar is above or below the vertical bar. Let it subsume

598 // the other when the other is Literal, CharClass or AnyChar.

599 if (r3->op() == kRegexpAnyChar &&

600 (r1->op() == kRegexpLiteral \|\|

601 r1->op() == kRegexpCharClass \|\|

602 r1->op() == kRegexpAnyChar)) {

603 // Discard r1.

604 stacktop_ = r2;

605 r1->Decref();

606 return true;

607 }

608 if (r1->op() == kRegexpAnyChar &&

609 (r3->op() == kRegexpLiteral \|\|

610 r3->op() == kRegexpCharClass \|\|

611 r3->op() == kRegexpAnyChar)) {

612 // Rearrange the stack and discard r3.

613 r1->down_ = r3->down_;

614 r2->down_ = r1;

615 stacktop_ = r2;

616 r3->Decref();

617 return true;

618 }

619 }

620 // Swap r1 below vertical bar (r2).

621 r1->down_ = r2->down_;

622 r2->down_ = r1;

623 stacktop_ = r2;

624 return true;

625 }

626 return PushSimpleOp(kVerticalBar);

627 }

628

629 // Processes a right parenthesis in the input.

630 bool Regexp::ParseState::DoRightParen() {

631 // Finish the current concatenation and alternation.

632 DoAlternation();

633

634 // The stack should be: LeftParen regexp

635 // Remove the LeftParen, leaving the regexp,

636 // parenthesized.

637 Regexp* r1;

638 Regexp* r2;

639 if ((r1 = stacktop_) == NULL \|\|

640 (r2 = r1->down_) == NULL \|\|

641 r2->op() != kLeftParen) {

642 status_->set_code(kRegexpMissingParen);

643 status_->set_error_arg(whole_regexp_);

644 return false;

645 }

646

647 // Pop off r1, r2. Will Decref or reuse below.

648 stacktop_ = r2->down_;

649

650 // Restore flags from when paren opened.

651 Regexp* re = r2;

652 flags_ = re->parse_flags();

653

654 // Rewrite LeftParen as capture if needed.

655 if (re->cap_ > 0) {

656 re->op_ = kRegexpCapture;

657 // re->cap_ is already set

658 re->AllocSub(1);

659 re->sub()[0] = FinishRegexp(r1);

660 re->simple_ = re->ComputeSimple();

661 } else {

662 re->Decref();

663 re = r1;

664 }

665 return PushRegexp(re);

666 }

667

668 // Processes the end of input, returning the final regexp.

669 Regexp* Regexp::ParseState::DoFinish() {

670 DoAlternation();

671 Regexp* re = stacktop_;

672 if (re != NULL && re->down_ != NULL) {

673 status_->set_code(kRegexpMissingParen);

674 status_->set_error_arg(whole_regexp_);

675 return NULL;

676 }

677 stacktop_ = NULL;

678 return FinishRegexp(re);

679 }

680

681 // Returns the leading regexp that re starts with.

682 // The returned Regexp* points into a piece of re,

683 // so it must not be used after the caller calls re->Decref().

684 Regexp* Regexp::LeadingRegexp(Regexp* re) {

685 if (re->op() == kRegexpEmptyMatch)

686 return NULL;

687 if (re->op() == kRegexpConcat && re->nsub() >= 2) {

688 Regexp** sub = re->sub();

689 if (sub[0]->op() == kRegexpEmptyMatch)

690 return NULL;

691 return sub[0];

692 }

693 return re;

694 }

695

696 // Removes LeadingRegexp(re) from re and returns what's left.

697 // Consumes the reference to re and may edit it in place.

698 // If caller wants to hold on to LeadingRegexp(re),

699 // must have already Incref'ed it.

700 Regexp* Regexp::RemoveLeadingRegexp(Regexp* re) {

701 if (re->op() == kRegexpEmptyMatch)

702 return re;

703 if (re->op() == kRegexpConcat && re->nsub() >= 2) {

704 Regexp** sub = re->sub();

705 if (sub[0]->op() == kRegexpEmptyMatch)

706 return re;

707 sub[0]->Decref();

708 sub[0] = NULL;

709 if (re->nsub() == 2) {

710 // Collapse concatenation to single regexp.

711 Regexp* nre = sub[1];

712 sub[1] = NULL;

713 re->Decref();

714 return nre;

715 }

716 // 3 or more -> 2 or more.

717 re->nsub_--;

718 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);

719 return re;

720 }

721 Regexp::ParseFlags pf = re->parse_flags();

722 re->Decref();

723 return new Regexp(kRegexpEmptyMatch, pf);

724 }

725

726 // Returns the leading string that re starts with.

727 // The returned Rune* points into a piece of re,

728 // so it must not be used after the caller calls re->Decref().

729 Rune* Regexp::LeadingString(Regexp* re, int *nrune,

730 Regexp::ParseFlags *flags) {

731 while (re->op() == kRegexpConcat && re->nsub() > 0)

732 re = re->sub()[0];

733

734 *flags = static_cast<Regexp::ParseFlags>(re->parse_flags_ & Regexp::FoldCase);

735

736 if (re->op() == kRegexpLiteral) {

737 *nrune = 1;

738 return &re->rune_;

739 }

740

741 if (re->op() == kRegexpLiteralString) {

742 *nrune = re->nrunes_;

743 return re->runes_;

744 }

745

746 *nrune = 0;

747 return NULL;

748 }

749

750 // Removes the first n leading runes from the beginning of re.

751 // Edits re in place.

752 void Regexp::RemoveLeadingString(Regexp* re, int n) {

753 // Chase down concats to find first string.

754 // For regexps generated by parser, nested concats are

755 // flattened except when doing so would overflow the 16-bit

756 // limit on the size of a concatenation, so we should never

757 // see more than two here.

758 Regexp* stk[4];

759 int d = 0;

760 while (re->op() == kRegexpConcat) {

761 if (d < arraysize(stk))

762 stk[d++] = re;

763 re = re->sub()[0];

764 }

765

766 // Remove leading string from re.

767 if (re->op() == kRegexpLiteral) {

768 re->rune_ = 0;

769 re->op_ = kRegexpEmptyMatch;

770 } else if (re->op() == kRegexpLiteralString) {

771 if (n >= re->nrunes_) {

772 delete[] re->runes_;

773 re->runes_ = NULL;

774 re->nrunes_ = 0;

775 re->op_ = kRegexpEmptyMatch;

776 } else if (n == re->nrunes_ - 1) {

777 Rune rune = re->runes_[re->nrunes_ - 1];

778 delete[] re->runes_;

779 re->runes_ = NULL;

780 re->nrunes_ = 0;

781 re->rune_ = rune;

782 re->op_ = kRegexpLiteral;

783 } else {

784 re->nrunes_ -= n;

785 memmove(re->runes_, re->runes_ + n, re->nrunes_ * sizeof re->runes_[0]);

786 }

787 }

788

789 // If re is now empty, concatenations might simplify too.

790 while (d-- > 0) {

791 re = stk[d];

792 Regexp** sub = re->sub();

793 if (sub[0]->op() == kRegexpEmptyMatch) {

794 sub[0]->Decref();

795 sub[0] = NULL;

796 // Delete first element of concat.

797 switch (re->nsub()) {

798 case 0:

799 case 1:

800 // Impossible.

801 LOG(DFATAL) << "Concat of " << re->nsub();

802 re->submany_ = NULL;

803 re->op_ = kRegexpEmptyMatch;

804 break;

805

806 case 2: {

807 // Replace re with sub[1].

808 Regexp* old = sub[1];

809 sub[1] = NULL;

810 re->Swap(old);

811 old->Decref();

812 break;

813 }

814

815 default:

816 // Slide down.

817 re->nsub_--;

818 memmove(sub, sub + 1, re->nsub_ * sizeof sub[0]);

819 break;

820 }

821 }

822 }

823 }

824

825 // Factors common prefixes from alternation.

826 // For example,

827 // ABC\|ABD\|AEF\|BCX\|BCY

828 // simplifies to

829 // A(B(C\|D)\|EF)\|BC(X\|Y)

830 // which the normal parse state routines will further simplify to

831 // A(B[CD]\|EF)\|BC[XY]

832 //

833 // Rewrites sub to contain simplified list to alternate and returns

834 // the new length of sub. Adjusts reference counts accordingly

835 // (incoming sub[i] decremented, outgoing sub[i] incremented).

836

837 // It's too much of a pain to write this code with an explicit stack,

838 // so instead we let the caller specify a maximum depth and

839 // don't simplify beyond that. There are around 15 words of local

840 // variables and parameters in the frame, so allowing 8 levels

841 // on a 64-bit machine is still less than a kilobyte of stack and

842 // probably enough benefit for practical uses.

843 const int kFactorAlternationMaxDepth = 8;

844

845 int Regexp::FactorAlternation(

846 Regexp** sub, int n,

847 Regexp::ParseFlags altflags) {

848 return FactorAlternationRecursive(sub, n, altflags,

849 kFactorAlternationMaxDepth);

850 }

851

852 int Regexp::FactorAlternationRecursive(

853 Regexp** sub, int n,

854 Regexp::ParseFlags altflags,

855 int maxdepth) {

856

857 if (maxdepth <= 0)

858 return n;

859

860 // Round 1: Factor out common literal prefixes.

861 Rune *rune = NULL;

862 int nrune = 0;

863 Regexp::ParseFlags runeflags = Regexp::NoParseFlags;

864 int start = 0;

865 int out = 0;

866 for (int i = 0; i <= n; i++) {

867 // Invariant: what was in sub[0:start] has been Decref'ed

868 // and that space has been reused for sub[0:out] (out <= start).

869 //

870 // Invariant: sub[start:i] consists of regexps that all begin

871 // with the string rune[0:nrune].

872

873 Rune* rune_i = NULL;

874 int nrune_i = 0;

875 Regexp::ParseFlags runeflags_i = Regexp::NoParseFlags;

876 if (i < n) {

877 rune_i = LeadingString(sub[i], &nrune_i, &runeflags_i);

878 if (runeflags_i == runeflags) {

879 int same = 0;

880 while (same < nrune && same < nrune_i && rune[same] == rune_i[same])

881 same++;

882 if (same > 0) {

883 // Matches at least one rune in current range. Keep going around.

884 nrune = same;

885 continue;

886 }

887 }

888 }

889

890 // Found end of a run with common leading literal string:

891 // sub[start:i] all begin with rune[0:nrune] but sub[i]

892 // does not even begin with rune[0].

893 //

894 // Factor out common string and append factored expression to sub[0:out].

895 if (i == start) {

896 // Nothing to do - first iteration.

897 } else if (i == start+1) {

898 // Just one: don't bother factoring.

899 sub[out++] = sub[start];

900 } else {

901 // Construct factored form: prefix(suffix1\|suffix2\|...)

902 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1\|suffix2\|...

903 x[0] = LiteralString(rune, nrune, runeflags);

904 for (int j = start; j < i; j++)

905 RemoveLeadingString(sub[j], nrune);

906 int nn = FactorAlternationRecursive(sub + start, i - start, altflags,

907 maxdepth - 1);

908 x[1] = AlternateNoFactor(sub + start, nn, altflags);

909 sub[out++] = Concat(x, 2, altflags);

910 }

911

912 // Prepare for next round (if there is one).

913 if (i < n) {

914 start = i;

915 rune = rune_i;

916 nrune = nrune_i;

917 runeflags = runeflags_i;

918 }

919 }

920 n = out;

921

922 // Round 2: Factor out common complex prefixes,

923 // just the first piece of each concatenation,

924 // whatever it is. This is good enough a lot of the time.

925 start = 0;

926 out = 0;

927 Regexp* first = NULL;

928 for (int i = 0; i <= n; i++) {

929 // Invariant: what was in sub[0:start] has been Decref'ed

930 // and that space has been reused for sub[0:out] (out <= start).

931 //

932 // Invariant: sub[start:i] consists of regexps that all begin with first.

933

934 Regexp* first_i = NULL;

935 if (i < n) {

936 first_i = LeadingRegexp(sub[i]);

937 if (first != NULL && Regexp::Equal(first, first_i)) {

938 continue;

939 }

940 }

941

942 // Found end of a run with common leading regexp:

943 // sub[start:i] all begin with first but sub[i] does not.

944 //

945 // Factor out common regexp and append factored expression to sub[0:out].

946 if (i == start) {

947 // Nothing to do - first iteration.

948 } else if (i == start+1) {

949 // Just one: don't bother factoring.

950 sub[out++] = sub[start];

951 } else {

952 // Construct factored form: prefix(suffix1\|suffix2\|...)

953 Regexp* x[2]; // x[0] = prefix, x[1] = suffix1\|suffix2\|...

954 x[0] = first->Incref();

955 for (int j = start; j < i; j++)

956 sub[j] = RemoveLeadingRegexp(sub[j]);

957 int nn = FactorAlternationRecursive(sub + start, i - start, altflags,

958 maxdepth - 1);

959 x[1] = AlternateNoFactor(sub + start, nn, altflags);

960 sub[out++] = Concat(x, 2, altflags);

961 }

962

963 // Prepare for next round (if there is one).

964 if (i < n) {

965 start = i;

966 first = first_i;

967 }

968 }

969 n = out;

970

971 // Round 3: Collapse runs of single literals into character classes.

972 start = 0;

973 out = 0;

974 for (int i = 0; i <= n; i++) {

975 // Invariant: what was in sub[0:start] has been Decref'ed

976 // and that space has been reused for sub[0:out] (out <= start).

977 //

978 // Invariant: sub[start:i] consists of regexps that are either

979 // literal runes or character classes.

980

981 if (i < n &&

982 (sub[i]->op() == kRegexpLiteral \|\|

983 sub[i]->op() == kRegexpCharClass))

984 continue;

985

986 // sub[i] is not a char or char class;

987 // emit char class for sub[start:i]...

988 if (i == start) {

989 // Nothing to do.

990 } else if (i == start+1) {

991 sub[out++] = sub[start];

992 } else {

993 // Make new char class.

994 CharClassBuilder ccb;

995 for (int j = start; j < i; j++) {

996 Regexp* re = sub[j];

997 if (re->op() == kRegexpCharClass) {

998 CharClass* cc = re->cc();

999 for (CharClass::iterator it = cc->begin(); it != cc->end(); ++it)

1000 ccb.AddRange(it->lo, it->hi);

1001 } else if (re->op() == kRegexpLiteral) {

1002 ccb.AddRangeFlags(re->rune(), re->rune(), re->parse_flags());

1003 } else {

1004 LOG(DFATAL) << "RE2: unexpected op: " << re->op() << " "

1005 << re->ToString();

1006 }

1007 re->Decref();

1008 }

1009 sub[out++] = NewCharClass(ccb.GetCharClass(), altflags);

1010 }

1011

1012 // ... and then emit sub[i].

1013 if (i < n)

1014 sub[out++] = sub[i];

1015 start = i+1;

1016 }

1017 n = out;

1018

1019 // Round 4: Collapse runs of empty matches into single empty match.

1020 start = 0;

1021 out = 0;

1022 for (int i = 0; i < n; i++) {

1023 if (i + 1 < n &&

1024 sub[i]->op() == kRegexpEmptyMatch &&

1025 sub[i+1]->op() == kRegexpEmptyMatch) {

1026 sub[i]->Decref();

1027 continue;

1028 }

1029 sub[out++] = sub[i];

1030 }

1031 n = out;

1032

1033 return n;

1034 }

1035

1036 // Collapse the regexps on top of the stack, down to the

1037 // first marker, into a new op node (op == kRegexpAlternate

1038 // or op == kRegexpConcat).

1039 void Regexp::ParseState::DoCollapse(RegexpOp op) {

1040 // Scan backward to marker, counting children of composite.

1041 int n = 0;

1042 Regexp* next = NULL;

1043 Regexp* sub;

1044 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {

1045 next = sub->down_;

1046 if (sub->op_ == op)

1047 n += sub->nsub_;

1048 else

1049 n++;

1050 }

1051

1052 // If there's just one child, leave it alone.

1053 // (Concat of one thing is that one thing; alternate of one thing is same.)

1054 if (stacktop_ != NULL && stacktop_->down_ == next)

1055 return;

1056

1057 // Construct op (alternation or concatenation), flattening op of op.

1058 Regexp** subs = new Regexp*[n];

1059 next = NULL;

1060 int i = n;

1061 for (sub = stacktop_; sub != NULL && !IsMarker(sub->op()); sub = next) {

1062 next = sub->down_;

1063 if (sub->op_ == op) {

1064 Regexp** sub_subs = sub->sub();

1065 for (int k = sub->nsub_ - 1; k >= 0; k--)

1066 subs[--i] = sub_subs[k]->Incref();

1067 sub->Decref();

1068 } else {

1069 subs[--i] = FinishRegexp(sub);

1070 }

1071 }

1072

1073 Regexp* re = ConcatOrAlternate(op, subs, n, flags_, true);

1074 delete[] subs;

1075 re->simple_ = re->ComputeSimple();

1076 re->down_ = next;

1077 stacktop_ = re;

1078 }

1079

1080 // Finishes the current concatenation,

1081 // collapsing it into a single regexp on the stack.

1082 void Regexp::ParseState::DoConcatenation() {

1083 Regexp* r1 = stacktop_;

1084 if (r1 == NULL \|\| IsMarker(r1->op())) {

1085 // empty concatenation is special case

1086 Regexp* re = new Regexp(kRegexpEmptyMatch, flags_);

1087 PushRegexp(re);

1088 }

1089 DoCollapse(kRegexpConcat);

1090 }

1091

1092 // Finishes the current alternation,

1093 // collapsing it to a single regexp on the stack.

1094 void Regexp::ParseState::DoAlternation() {

1095 DoVerticalBar();

1096 // Now stack top is kVerticalBar.

1097 Regexp* r1 = stacktop_;

1098 stacktop_ = r1->down_;

1099 r1->Decref();

1100 DoCollapse(kRegexpAlternate);

1101 }

1102

1103 // Incremental conversion of concatenated literals into strings.

1104 // If top two elements on stack are both literal or string,

1105 // collapse into single string.

1106 // Don't walk down the stack -- the parser calls this frequently

1107 // enough that below the bottom two is known to be collapsed.

1108 // Only called when another regexp is about to be pushed

1109 // on the stack, so that the topmost literal is not being considered.

1110 // (Otherwise ab* would turn into (ab)*.)

1111 // If r >= 0, consider pushing a literal r on the stack.

1112 // Return whether that happened.

1113 bool Regexp::ParseState::MaybeConcatString(int r, ParseFlags flags) {

1114 Regexp* re1;

1115 Regexp* re2;

1116 if ((re1 = stacktop_) == NULL \|\| (re2 = re1->down_) == NULL)

1117 return false;

1118

1119 if (re1->op_ != kRegexpLiteral && re1->op_ != kRegexpLiteralString)

1120 return false;

1121 if (re2->op_ != kRegexpLiteral && re2->op_ != kRegexpLiteralString)

1122 return false;

1123 if ((re1->parse_flags_ & FoldCase) != (re2->parse_flags_ & FoldCase))

1124 return false;

1125

1126 if (re2->op_ == kRegexpLiteral) {

1127 // convert into string

1128 Rune rune = re2->rune_;

1129 re2->op_ = kRegexpLiteralString;

1130 re2->nrunes_ = 0;

1131 re2->runes_ = NULL;

1132 re2->AddRuneToString(rune);

1133 }

1134

1135 // push re1 into re2.

1136 if (re1->op_ == kRegexpLiteral) {

1137 re2->AddRuneToString(re1->rune_);

1138 } else {

1139 for (int i = 0; i < re1->nrunes_; i++)

1140 re2->AddRuneToString(re1->runes_[i]);

1141 re1->nrunes_ = 0;

1142 delete[] re1->runes_;

1143 re1->runes_ = NULL;

1144 }

1145

1146 // reuse re1 if possible

1147 if (r >= 0) {

1148 re1->op_ = kRegexpLiteral;

1149 re1->rune_ = r;

1150 re1->parse_flags_ = static_cast<uint16>(flags);

1151 return true;

1152 }

1153

1154 stacktop_ = re2;

1155 re1->Decref();

1156 return false;

1157 }

1158

1159 // Lexing routines.

1160

1161 // Parses a decimal integer, storing it in *n.

1162 // Sets *s to span the remainder of the string.

1163 // Sets *out_re to the regexp for the class.

1164 static bool ParseInteger(StringPiece* s, int* np) {

1165 if (s->size() == 0 \|\| !isdigit((*s)[0] & 0xFF))

1166 return false;

1167 // Disallow leading zeros.

1168 if (s->size() >= 2 && (s)[0] == '0' && isdigit((s)[1] & 0xFF))

1169 return false;

1170 int n = 0;

1171 int c;

1172 while (s->size() > 0 && isdigit(c = (*s)[0] & 0xFF)) {

1173 // Avoid overflow.

1174 if (n >= 100000000)

1175 return false;

1176 n = n*10 + c - '0';

1177 s->remove_prefix(1); // digit

1178 }

1179 *np = n;

1180 return true;

1181 }

1182

1183 // Parses a repetition suffix like {1,2} or {2} or {2,}.

1184 // Sets *s to span the remainder of the string on success.

1185 // Sets lo and hi to the given range.

1186 // In the case of {2,}, the high number is unbounded;

1187 // sets *hi to -1 to signify this.

1188 // {,2} is NOT a valid suffix.

1189 // The Maybe in the name signifies that the regexp parse

1190 // doesn't fail even if ParseRepetition does, so the StringPiece

1191 // s must NOT be edited unless MaybeParseRepetition returns true.

1192 static bool MaybeParseRepetition(StringPiece* sp, int* lo, int* hi) {

1193 StringPiece s = *sp;

1194 if (s.size() == 0 \|\| s[0] != '{')

1195 return false;

1196 s.remove_prefix(1); // '{'

1197 if (!ParseInteger(&s, lo))

1198 return false;

1199 if (s.size() == 0)

1200 return false;

1201 if (s[0] == ',') {

1202 s.remove_prefix(1); // ','

1203 if (s.size() == 0)

1204 return false;

1205 if (s[0] == '}') {

1206 // {2,} means at least 2

1207 *hi = -1;

1208 } else {

1209 // {2,4} means 2, 3, or 4.

1210 if (!ParseInteger(&s, hi))

1211 return false;

1212 }

1213 } else {

1214 // {2} means exactly two

1215 hi = lo;

1216 }

1217 if (s.size() == 0 \|\| s[0] != '}')

1218 return false;

1219 s.remove_prefix(1); // '}'

1220 *sp = s;

1221 return true;

1222 }

1223

1224 // Removes the next Rune from the StringPiece and stores it in *r.

1225 // Returns number of bytes removed from sp.

1226 // Behaves as though there is a terminating NUL at the end of sp.

1227 // Argument order is backwards from usual Google style

1228 // but consistent with chartorune.

1229 static int StringPieceToRune(Rune r, StringPiece sp, RegexpStatus* status) {

1230 int n;

1231 if (fullrune(sp->data(), sp->size())) {

1232 n = chartorune(r, sp->data());

1233 // Some copies of chartorune have a bug that accepts

1234 // encodings of values in (10FFFF, 1FFFFF] as valid.

1235 // Those values break the character class algorithm,

1236 // which assumes Runemax is the largest rune.

1237 if (*r > Runemax) {

1238 n = 1;

1239 *r = Runeerror;

1240 }

1241 if (!(n == 1 && *r == Runeerror)) { // no decoding error

1242 sp->remove_prefix(n);

1243 return n;

1244 }

1245 }

1246

1247 status->set_code(kRegexpBadUTF8);

1248 status->set_error_arg(NULL);

1249 return -1;

1250 }

1251

1252 // Return whether name is valid UTF-8.

1253 // If not, set status to kRegexpBadUTF8.

1254 static bool IsValidUTF8(const StringPiece& s, RegexpStatus* status) {

1255 StringPiece t = s;

1256 Rune r;

1257 while (t.size() > 0) {

1258 if (StringPieceToRune(&r, &t, status) < 0)

1259 return false;

1260 }

1261 return true;

1262 }

1263

1264 // Is c a hex digit?

1265 static int IsHex(int c) {

1266 return ('0' <= c && c <= '9') \|\|

1267 ('A' <= c && c <= 'F') \|\|

1268 ('a' <= c && c <= 'f');

1269 }

1270

1271 // Convert hex digit to value.

1272 static int UnHex(int c) {

1273 if ('0' <= c && c <= '9')

1274 return c - '0';

1275 if ('A' <= c && c <= 'F')

1276 return c - 'A' + 10;

1277 if ('a' <= c && c <= 'f')

1278 return c - 'a' + 10;

1279 LOG(DFATAL) << "Bad hex digit " << c;

1280 return 0;

1281 }

1282

1283 // Parse an escape sequence (e.g., \n, \{).

1284 // Sets *s to span the remainder of the string.

1285 // Sets *rp to the named character.

1286 static bool ParseEscape(StringPiece* s, Rune* rp,

1287 RegexpStatus* status, int rune_max) {

1288 const char* begin = s->begin();

1289 if (s->size() < 1 \|\| (*s)[0] != '\\') {

1290 // Should not happen - caller always checks.

1291 status->set_code(kRegexpInternalError);

1292 status->set_error_arg(NULL);

1293 return false;

1294 }

1295 if (s->size() < 2) {

1296 status->set_code(kRegexpTrailingBackslash);

1297 status->set_error_arg(NULL);

1298 return false;

1299 }

1300 Rune c, c1;

1301 s->remove_prefix(1); // backslash

1302 if (StringPieceToRune(&c, s, status) < 0)

1303 return false;

1304 int code;

1305 switch (c) {

1306 default:

1307 if (c < Runeself && !isalpha(c) && !isdigit(c)) {

1308 // Escaped non-word characters are always themselves.

1309 // PCRE is not quite so rigorous: it accepts things like

1310 // \q, but we don't. We once rejected \_, but too many

1311 // programs and people insist on using it, so allow \_.

1312 *rp = c;

1313 return true;

1314 }

1315 goto BadEscape;

1316

1317 // Octal escapes.

1318 case '1':

1319 case '2':

1320 case '3':

1321 case '4':

1322 case '5':

1323 case '6':

1324 case '7':

1325 // Single non-zero octal digit is a backreference; not supported.

1326 if (s->size() == 0 \|\| (s)[0] < '0' \|\| (s)[0] > '7')

1327 goto BadEscape;

1328 // fall through

1329 case '0':

1330 // consume up to three octal digits; already have one.

1331 code = c - '0';

1332 if (s->size() > 0 && '0' <= (c = (*s)[0]) && c <= '7') {

1333 code = code * 8 + c - '0';

1334 s->remove_prefix(1); // digit

1335 if (s->size() > 0) {

1336 c = (*s)[0];

1337 if ('0' <= c && c <= '7') {

1338 code = code * 8 + c - '0';

1339 s->remove_prefix(1); // digit

1340 }

1341 }

1342 }

1343 if (code > rune_max)

1344 goto BadEscape;

1345 *rp = code;

1346 return true;

1347

1348 // Hexadecimal escapes

1349 case 'x':

1350 if (s->size() == 0)

1351 goto BadEscape;

1352 if (StringPieceToRune(&c, s, status) < 0)

1353 return false;

1354 if (c == '{') {

1355 // Any number of digits in braces.

1356 // Update n as we consume the string, so that

1357 // the whole thing gets shown in the error message.

1358 // Perl accepts any text at all; it ignores all text

1359 // after the first non-hex digit. We require only hex digits,

1360 // and at least one.

1361 if (StringPieceToRune(&c, s, status) < 0)

1362 return false;

1363 int nhex = 0;

1364 code = 0;

1365 while (IsHex(c)) {

1366 nhex++;

1367 code = code * 16 + UnHex(c);

1368 if (code > rune_max)

1369 goto BadEscape;

1370 if (s->size() == 0)

1371 goto BadEscape;

1372 if (StringPieceToRune(&c, s, status) < 0)

1373 return false;

1374 }

1375 if (c != '}' \|\| nhex == 0)

1376 goto BadEscape;

1377 *rp = code;

1378 return true;

1379 }

1380 // Easy case: two hex digits.

1381 if (s->size() == 0)

1382 goto BadEscape;

1383 if (StringPieceToRune(&c1, s, status) < 0)

1384 return false;

1385 if (!IsHex(c) \|\| !IsHex(c1))

1386 goto BadEscape;

1387 rp = UnHex(c) 16 + UnHex(c1);

1388 return true;

1389

1390 // C escapes.

1391 case 'n':

1392 *rp = '\n';

1393 return true;

1394 case 'r':

1395 *rp = '\r';

1396 return true;

1397 case 't':

1398 *rp = '\t';

1399 return true;

1400

1401 // Less common C escapes.

1402 case 'a':

1403 *rp = '\a';

1404 return true;

1405 case 'f':

1406 *rp = '\f';

1407 return true;

1408 case 'v':

1409 *rp = '\v';

1410 return true;

1411

1412 // This code is disabled to avoid misparsing

1413 // the Perl word-boundary \b as a backspace

1414 // when in POSIX regexp mode. Surprisingly,

1415 // in Perl, \b means word-boundary but [\b]

1416 // means backspace. We don't support that:

1417 // if you want a backspace embed a literal

1418 // backspace character or use \x08.

1419 //

1420 // case 'b':

1421 // *rp = '\b';

1422 // return true;

1423 }

1424

1425 LOG(DFATAL) << "Not reached in ParseEscape.";

1426

1427 BadEscape:

1428 // Unrecognized escape sequence.

1429 status->set_code(kRegexpBadEscape);

1430 status->set_error_arg(

1431 StringPiece(begin, static_cast<int>(s->data() - begin)));

1432 return false;

1433 }

1434

1435 // Add a range to the character class, but exclude newline if asked.

1436 // Also handle case folding.

1437 void CharClassBuilder::AddRangeFlags(

1438 Rune lo, Rune hi, Regexp::ParseFlags parse_flags) {

1439

1440 // Take out \n if the flags say so.

1441 bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|

1442 (parse_flags & Regexp::NeverNL);

1443 if (cutnl && lo <= '\n' && '\n' <= hi) {

1444 if (lo < '\n')

1445 AddRangeFlags(lo, '\n' - 1, parse_flags);

1446 if (hi > '\n')

1447 AddRangeFlags('\n' + 1, hi, parse_flags);

1448 return;

1449 }

1450

1451 // If folding case, add fold-equivalent characters too.

1452 if (parse_flags & Regexp::FoldCase)

1453 AddFoldedRange(this, lo, hi, 0);

1454 else

1455 AddRange(lo, hi);

1456 }

1457

1458 // Look for a group with the given name.

1459 static const UGroup* LookupGroup(const StringPiece& name,

1460 const UGroup *groups, int ngroups) {

1461 // Simple name lookup.

1462 for (int i = 0; i < ngroups; i++)

1463 if (StringPiece(groups[i].name) == name)

1464 return &groups[i];

1465 return NULL;

1466 }

1467

1468 // Fake UGroup containing all Runes

1469 static URange16 any16[] = { { 0, 65535 } };

1470 static URange32 any32[] = { { 65536, Runemax } };

1471 static UGroup anygroup = { "Any", +1, any16, 1, any32, 1 };

1472

1473 // Look for a POSIX group with the given name (e.g., "[:^alpha:]")

1474 static const UGroup* LookupPosixGroup(const StringPiece& name) {

1475 return LookupGroup(name, posix_groups, num_posix_groups);

1476 }

1477

1478 static const UGroup* LookupPerlGroup(const StringPiece& name) {

1479 return LookupGroup(name, perl_groups, num_perl_groups);

1480 }

1481

1482 // Look for a Unicode group with the given name (e.g., "Han")

1483 static const UGroup* LookupUnicodeGroup(const StringPiece& name) {

1484 // Special case: "Any" means any.

1485 if (name == StringPiece("Any"))

1486 return &anygroup;

1487 return LookupGroup(name, unicode_groups, num_unicode_groups);

1488 }

1489

1490 // Add a UGroup or its negation to the character class.

1491 static void AddUGroup(CharClassBuilder cc, const UGroup g, int sign,

1492 Regexp::ParseFlags parse_flags) {

1493 if (sign == +1) {

1494 for (int i = 0; i < g->nr16; i++) {

1495 cc->AddRangeFlags(g->r16[i].lo, g->r16[i].hi, parse_flags);

1496 }

1497 for (int i = 0; i < g->nr32; i++) {

1498 cc->AddRangeFlags(g->r32[i].lo, g->r32[i].hi, parse_flags);

1499 }

1500 } else {

1501 if (parse_flags & Regexp::FoldCase) {

1502 // Normally adding a case-folded group means

1503 // adding all the extra fold-equivalent runes too.

1504 // But if we're adding the negation of the group,

1505 // we have to exclude all the runes that are fold-equivalent

1506 // to what's already missing. Too hard, so do in two steps.

1507 CharClassBuilder ccb1;

1508 AddUGroup(&ccb1, g, +1, parse_flags);

1509 // If the flags say to take out \n, put it in, so that negating will take it out.

1510 // Normally AddRangeFlags does this, but we're bypassing AddRangeFlags.

1511 bool cutnl = !(parse_flags & Regexp::ClassNL) \|\|

1512 (parse_flags & Regexp::NeverNL);

1513 if (cutnl) {

1514 ccb1.AddRange('\n', '\n');

1515 }

1516 ccb1.Negate();

1517 cc->AddCharClass(&ccb1);

1518 return;

1519 }

1520 int next = 0;

1521 for (int i = 0; i < g->nr16; i++) {

1522 if (next < g->r16[i].lo)

1523 cc->AddRangeFlags(next, g->r16[i].lo - 1, parse_flags);

1524 next = g->r16[i].hi + 1;

1525 }

1526 for (int i = 0; i < g->nr32; i++) {

1527 if (next < g->r32[i].lo)

1528 cc->AddRangeFlags(next, g->r32[i].lo - 1, parse_flags);

1529 next = g->r32[i].hi + 1;

1530 }

1531 if (next <= Runemax)

1532 cc->AddRangeFlags(next, Runemax, parse_flags);

1533 }

1534 }

1535

1536 // Maybe parse a Perl character class escape sequence.

1537 // Only recognizes the Perl character classes (\d \s \w \D \S \W),

1538 // not the Perl empty-string classes (\b \B \A \Z \z).

1539 // On success, sets *s to span the remainder of the string

1540 // and returns the corresponding UGroup.

1541 // The StringPiece must NOT be edited unless the call succeeds.

1542 const UGroup* MaybeParsePerlCCEscape(StringPiece* s, Regexp::ParseFlags parse_fl ags) {

1543 if (!(parse_flags & Regexp::PerlClasses))

1544 return NULL;

1545 if (s->size() < 2 \|\| (*s)[0] != '\\')

1546 return NULL;

1547 // Could use StringPieceToRune, but there aren't

1548 // any non-ASCII Perl group names.

1549 StringPiece name(s->begin(), 2);

1550 const UGroup *g = LookupPerlGroup(name);

1551 if (g == NULL)

1552 return NULL;

1553 s->remove_prefix(name.size());

1554 return g;

1555 }

1556

1557 enum ParseStatus {

1558 kParseOk, // Did some parsing.

1559 kParseError, // Found an error.

1560 kParseNothing, // Decided not to parse.

1561 };

1562

1563 // Maybe parses a Unicode character group like \p{Han} or \P{Han}

1564 // (the latter is a negated group).

1565 ParseStatus ParseUnicodeGroup(StringPiece* s, Regexp::ParseFlags parse_flags,

1566 CharClassBuilder *cc,

1567 RegexpStatus* status) {

1568 // Decide whether to parse.

1569 if (!(parse_flags & Regexp::UnicodeGroups))

1570 return kParseNothing;

1571 if (s->size() < 2 \|\| (*s)[0] != '\\')

1572 return kParseNothing;

1573 Rune c = (*s)[1];

1574 if (c != 'p' && c != 'P')

1575 return kParseNothing;

1576

1577 // Committed to parse. Results:

1578 int sign = +1; // -1 = negated char class

1579 if (c == 'P')

1580 sign = -1;

1581 StringPiece seq = *s; // \p{Han} or \pL

1582 StringPiece name; // Han or L

1583 s->remove_prefix(2); // '\\', 'p'

1584

1585 if (!StringPieceToRune(&c, s, status))

1586 return kParseError;

1587 if (c != '{') {

1588 // Name is the bit of string we just skipped over for c.

1589 const char* p = seq.begin() + 2;

1590 name = StringPiece(p, static_cast<int>(s->begin() - p));

1591 } else {

1592 // Name is in braces. Look for closing }

1593 size_t end = s->find('}', 0);

1594 if (end == s->npos) {

1595 if (!IsValidUTF8(seq, status))

1596 return kParseError;

1597 status->set_code(kRegexpBadCharRange);

1598 status->set_error_arg(seq);

1599 return kParseError;

1600 }

1601 name = StringPiece(s->begin(), static_cast<int>(end)); // without '}'

1602 s->remove_prefix(static_cast<int>(end) + 1); // with '}'

1603 if (!IsValidUTF8(name, status))

1604 return kParseError;

1605 }

1606

1607 // Chop seq where s now begins.

1608 seq = StringPiece(seq.begin(), static_cast<int>(s->begin() - seq.begin()));

1609

1610 // Look up group

1611 if (name.size() > 0 && name[0] == '^') {

1612 sign = -sign;

1613 name.remove_prefix(1); // '^'

1614 }

1615 const UGroup *g = LookupUnicodeGroup(name);

1616 if (g == NULL) {

1617 status->set_code(kRegexpBadCharRange);

1618 status->set_error_arg(seq);

1619 return kParseError;

1620 }

1621

1622 AddUGroup(cc, g, sign, parse_flags);

1623 return kParseOk;

1624 }

1625

1626 // Parses a character class name like [:alnum:].

1627 // Sets *s to span the remainder of the string.

1628 // Adds the ranges corresponding to the class to ranges.

1629 static ParseStatus ParseCCName(StringPiece* s, Regexp::ParseFlags parse_flags,

1630 CharClassBuilder *cc,

1631 RegexpStatus* status) {

1632 // Check begins with [:

1633 const char* p = s->data();

1634 const char* ep = s->data() + s->size();

1635 if (ep - p < 2 \|\| p[0] != '[' \|\| p[1] != ':')

1636 return kParseNothing;

1637

1638 // Look for closing :].

1639 const char* q;

1640 for (q = p+2; q <= ep-2 && (q != ':' \|\| (q+1) != ']'); q++)

1641 ;

1642

1643 // If no closing :], then ignore.

1644 if (q > ep-2)

1645 return kParseNothing;

1646

1647 // Got it. Check that it's valid.

1648 q += 2;

1649 StringPiece name(p, static_cast<int>(q-p));

1650

1651 const UGroup *g = LookupPosixGroup(name);

1652 if (g == NULL) {

1653 status->set_code(kRegexpBadCharRange);

1654 status->set_error_arg(name);

1655 return kParseError;

1656 }

1657

1658 s->remove_prefix(name.size());

1659 AddUGroup(cc, g, g->sign, parse_flags);

1660 return kParseOk;

1661 }

1662

1663 // Parses a character inside a character class.

1664 // There are fewer special characters here than in the rest of the regexp.

1665 // Sets *s to span the remainder of the string.

1666 // Sets *rp to the character.

1667 bool Regexp::ParseState::ParseCCCharacter(StringPiece* s, Rune *rp,

1668 const StringPiece& whole_class,

1669 RegexpStatus* status) {

1670 if (s->size() == 0) {

1671 status->set_code(kRegexpMissingBracket);

1672 status->set_error_arg(whole_class);

1673 return false;

1674 }

1675

1676 // Allow regular escape sequences even though

1677 // many need not be escaped in this context.

1678 if (s->size() >= 1 && (*s)[0] == '\\')

1679 return ParseEscape(s, rp, status, rune_max_);

1680

1681 // Otherwise take the next rune.

1682 return StringPieceToRune(rp, s, status) >= 0;

1683 }

1684

1685 // Parses a character class character, or, if the character

1686 // is followed by a hyphen, parses a character class range.

1687 // For single characters, rr->lo == rr->hi.

1688 // Sets *s to span the remainder of the string.

1689 // Sets *rp to the character.

1690 bool Regexp::ParseState::ParseCCRange(StringPiece* s, RuneRange* rr,

1691 const StringPiece& whole_class,

1692 RegexpStatus* status) {

1693 StringPiece os = *s;

1694 if (!ParseCCCharacter(s, &rr->lo, whole_class, status))

1695 return false;

1696 // [a-] means (a\|-), so check for final ].

1697 if (s->size() >= 2 && (s)[0] == '-' && (s)[1] != ']') {

1698 s->remove_prefix(1); // '-'

1699 if (!ParseCCCharacter(s, &rr->hi, whole_class, status))

1700 return false;

1701 if (rr->hi < rr->lo) {

1702 status->set_code(kRegexpBadCharRange);

1703 status->set_error_arg(

1704 StringPiece(os.data(), static_cast<int>(s->data() - os.data())));

1705 return false;

1706 }

1707 } else {

1708 rr->hi = rr->lo;

1709 }

1710 return true;

1711 }

1712

1713 // Parses a possibly-negated character class expression like [^abx-z[:digit:]].

1714 // Sets *s to span the remainder of the string.

1715 // Sets *out_re to the regexp for the class.

1716 bool Regexp::ParseState::ParseCharClass(StringPiece* s,

1717 Regexp** out_re,

1718 RegexpStatus* status) {

1719 StringPiece whole_class = *s;

1720 if (s->size() == 0 \|\| (*s)[0] != '[') {

1721 // Caller checked this.

1722 status->set_code(kRegexpInternalError);

1723 status->set_error_arg(NULL);

1724 return false;

1725 }

1726 bool negated = false;

1727 Regexp* re = new Regexp(kRegexpCharClass, flags_ & ~FoldCase);

1728 re->ccb_ = new CharClassBuilder;

1729 s->remove_prefix(1); // '['

1730 if (s->size() > 0 && (*s)[0] == '^') {

1731 s->remove_prefix(1); // '^'

1732 negated = true;

1733 if (!(flags_ & ClassNL) \|\| (flags_ & NeverNL)) {

1734 // If NL can't match implicitly, then pretend

1735 // negated classes include a leading \n.

1736 re->ccb_->AddRange('\n', '\n');

1737 }

1738 }

1739 bool first = true; // ] is okay as first char in class

1740 while (s->size() > 0 && ((*s)[0] != ']' \|\| first)) {

1741 // - is only okay unescaped as first or last in class.

1742 // Except that Perl allows - anywhere.

1743 if ((*s)[0] == '-' && !first && !(flags_&PerlX) &&

1744 (s->size() == 1 \|\| (*s)[1] != ']')) {

1745 StringPiece t = *s;

1746 t.remove_prefix(1); // '-'

1747 Rune r;

1748 int n = StringPieceToRune(&r, &t, status);

1749 if (n < 0) {

1750 re->Decref();

1751 return false;

1752 }

1753 status->set_code(kRegexpBadCharRange);

1754 status->set_error_arg(StringPiece(s->data(), 1+n));

1755 re->Decref();

1756 return false;

1757 }

1758 first = false;

1759

1760 // Look for [:alnum:] etc.

1761 if (s->size() > 2 && (s)[0] == '[' && (s)[1] == ':') {

1762 switch (ParseCCName(s, flags_, re->ccb_, status)) {

1763 case kParseOk:

1764 continue;

1765 case kParseError:

1766 re->Decref();

1767 return false;

1768 case kParseNothing:

1769 break;

1770 }

1771 }

1772

1773 // Look for Unicode character group like \p{Han}

1774 if (s->size() > 2 &&

1775 (*s)[0] == '\\' &&

1776 ((s)[1] == 'p' \|\| (s)[1] == 'P')) {

1777 switch (ParseUnicodeGroup(s, flags_, re->ccb_, status)) {

1778 case kParseOk:

1779 continue;

1780 case kParseError:

1781 re->Decref();

1782 return false;

1783 case kParseNothing:

1784 break;

1785 }

1786 }

1787

1788 // Look for Perl character class symbols (extension).

1789 const UGroup *g = MaybeParsePerlCCEscape(s, flags_);

1790 if (g != NULL) {

1791 AddUGroup(re->ccb_, g, g->sign, flags_);

1792 continue;

1793 }

1794

1795 // Otherwise assume single character or simple range.

1796 RuneRange rr;

1797 if (!ParseCCRange(s, &rr, whole_class, status)) {

1798 re->Decref();

1799 return false;

1800 }

1801 // AddRangeFlags is usually called in response to a class like

1802 // \p{Foo} or [[:foo:]]; for those, it filters \n out unless

1803 // Regexp::ClassNL is set. In an explicit range or singleton

1804 // like we just parsed, we do not filter \n out, so set ClassNL

1805 // in the flags.

1806 re->ccb_->AddRangeFlags(rr.lo, rr.hi, flags_ \| Regexp::ClassNL);

1807 }

1808 if (s->size() == 0) {

1809 status->set_code(kRegexpMissingBracket);

1810 status->set_error_arg(whole_class);

1811 re->Decref();

1812 return false;

1813 }

1814 s->remove_prefix(1); // ']'

1815

1816 if (negated)

1817 re->ccb_->Negate();

1818

1819 *out_re = re;

1820 return true;

1821 }

1822

1823 // Is this a valid capture name? [A-Za-z0-9_]+

1824 // PCRE limits names to 32 bytes.

1825 // Python rejects names starting with digits.

1826 // We don't enforce either of those.

1827 static bool IsValidCaptureName(const StringPiece& name) {

1828 if (name.size() == 0)

1829 return false;

1830 for (int i = 0; i < name.size(); i++) {

1831 int c = name[i];

1832 if (('0' <= c && c <= '9') \|\|

1833 ('a' <= c && c <= 'z') \|\|

1834 ('A' <= c && c <= 'Z') \|\|

1835 c == '_')

1836 continue;

1837 return false;

1838 }

1839 return true;

1840 }

1841

1842 // Parses a Perl flag setting or non-capturing group or both,

1843 // like (?i) or (?: or (?i:. Removes from s, updates parse state.

1844 // The caller must check that s begins with "(?".

1845 // Returns true on success. If the Perl flag is not

1846 // well-formed or not supported, sets status_ and returns false.

1847 bool Regexp::ParseState::ParsePerlFlags(StringPiece* s) {

1848 StringPiece t = *s;

1849

1850 // Caller is supposed to check this.

1851 if (!(flags_ & PerlX) \|\| t.size() < 2 \|\| t[0] != '(' \|\| t[1] != '?') {

1852 LOG(DFATAL) << "Bad call to ParseState::ParsePerlFlags";

1853 status_->set_code(kRegexpInternalError);

1854 return false;

1855 }

1856

1857 t.remove_prefix(2); // "(?"

1858

1859 // Check for named captures, first introduced in Python's regexp library.

1860 // As usual, there are three slightly different syntaxes:

1861 //

1862 // (?P<name>expr) the original, introduced by Python

1863 // (?<name>expr) the .NET alteration, adopted by Perl 5.10

1864 // (?'name'expr) another .NET alteration, adopted by Perl 5.10

1865 //

1866 // Perl 5.10 gave in and implemented the Python version too,

1867 // but they claim that the last two are the preferred forms.

1868 // PCRE and languages based on it (specifically, PHP and Ruby)

1869 // support all three as well. EcmaScript 4 uses only the Python form.

1870 //

1871 // In both the open source world (via Code Search) and the

1872 // Google source tree, (?P<expr>name) is the dominant form,

1873 // so that's the one we implement. One is enough.

1874 if (t.size() > 2 && t[0] == 'P' && t[1] == '<') {

1875 // Pull out name.

1876 size_t end = t.find('>', 2);

1877 if (end == t.npos) {

1878 if (!IsValidUTF8(*s, status_))

1879 return false;

1880 status_->set_code(kRegexpBadNamedCapture);

1881 status_->set_error_arg(*s);

1882 return false;

1883 }

1884

1885 // t is "P<name>...", t[end] == '>'

1886 StringPiece capture(t.begin()-2, static_cast<int>(end)+3); // "(?P<name>"

1887 StringPiece name(t.begin()+2, static_cast<int>(end)-2); // "name"

1888 if (!IsValidUTF8(name, status_))

1889 return false;

1890 if (!IsValidCaptureName(name)) {

1891 status_->set_code(kRegexpBadNamedCapture);

1892 status_->set_error_arg(capture);

1893 return false;

1894 }

1895

1896 if (!DoLeftParen(name)) {

1897 // DoLeftParen's failure set status_.

1898 return false;

1899 }

1900

1901 s->remove_prefix(static_cast<int>(capture.end() - s->begin()));

1902 return true;

1903 }

1904

1905 bool negated = false;

1906 bool sawflags = false;

1907 int nflags = flags_;

1908 Rune c;

1909 for (bool done = false; !done; ) {

1910 if (t.size() == 0)

1911 goto BadPerlOp;

1912 if (StringPieceToRune(&c, &t, status_) < 0)

1913 return false;

1914 switch (c) {

1915 default:

1916 goto BadPerlOp;

1917

1918 // Parse flags.

1919 case 'i':

1920 sawflags = true;

1921 if (negated)

1922 nflags &= ~FoldCase;

1923 else

1924 nflags \|= FoldCase;

1925 break;

1926

1927 case 'm': // opposite of our OneLine

1928 sawflags = true;

1929 if (negated)

1930 nflags \|= OneLine;

1931 else

1932 nflags &= ~OneLine;

1933 break;

1934

1935 case 's':

1936 sawflags = true;

1937 if (negated)

1938 nflags &= ~DotNL;

1939 else

1940 nflags \|= DotNL;

1941 break;

1942

1943 case 'U':

1944 sawflags = true;

1945 if (negated)

1946 nflags &= ~NonGreedy;

1947 else

1948 nflags \|= NonGreedy;

1949 break;

1950

1951 // Negation

1952 case '-':

1953 if (negated)

1954 goto BadPerlOp;

1955 negated = true;

1956 sawflags = false;

1957 break;

1958

1959 // Open new group.

1960 case ':':

1961 if (!DoLeftParenNoCapture()) {

1962 // DoLeftParenNoCapture's failure set status_.

1963 return false;

1964 }

1965 done = true;

1966 break;

1967

1968 // Finish flags.

1969 case ')':

1970 done = true;

1971 break;

1972 }

1973 }

1974

1975 if (negated && !sawflags)

1976 goto BadPerlOp;

1977

1978 flags_ = static_cast<Regexp::ParseFlags>(nflags);

1979 *s = t;

1980 return true;

1981

1982 BadPerlOp:

1983 status_->set_code(kRegexpBadPerlOp);

1984 status_->set_error_arg(

1985 StringPiece(s->begin(), static_cast<int>(t.begin() - s->begin())));

1986 return false;

1987 }

1988

1989 // Converts latin1 (assumed to be encoded as Latin1 bytes)

1990 // into UTF8 encoding in string.

1991 // Can't use EncodingUtils::EncodeLatin1AsUTF8 because it is

1992 // deprecated and because it rejects code points 0x80-0x9F.

1993 void ConvertLatin1ToUTF8(const StringPiece& latin1, string* utf) {

1994 char buf[UTFmax];

1995

1996 utf->clear();

1997 for (int i = 0; i < latin1.size(); i++) {

1998 Rune r = latin1[i] & 0xFF;

1999 int n = runetochar(buf, &r);

2000 utf->append(buf, n);

2001 }

2002 }

2003

2004 // Parses the regular expression given by s,

2005 // returning the corresponding Regexp tree.

2006 // The caller must Decref the return value when done with it.

2007 // Returns NULL on error.

2008 Regexp* Regexp::Parse(const StringPiece& s, ParseFlags global_flags,

2009 RegexpStatus* status) {

2010 // Make status non-NULL (easier on everyone else).

2011 RegexpStatus xstatus;

2012 if (status == NULL)

2013 status = &xstatus;

2014

2015 ParseState ps(global_flags, s, status);

2016 StringPiece t = s;

2017

2018 // Convert regexp to UTF-8 (easier on the rest of the parser).

2019 if (global_flags & Latin1) {

2020 string* tmp = new string;

2021 ConvertLatin1ToUTF8(t, tmp);

2022 status->set_tmp(tmp);

2023 t = *tmp;

2024 }

2025

2026 if (global_flags & Literal) {

2027 // Special parse loop for literal string.

2028 while (t.size() > 0) {

2029 Rune r;

2030 if (StringPieceToRune(&r, &t, status) < 0)

2031 return NULL;

2032 if (!ps.PushLiteral(r))

2033 return NULL;

2034 }

2035 return ps.DoFinish();

2036 }

2037

2038 StringPiece lastunary = NULL;

2039 while (t.size() > 0) {

2040 StringPiece isunary = NULL;

2041 switch (t[0]) {

2042 default: {

2043 Rune r;

2044 if (StringPieceToRune(&r, &t, status) < 0)

2045 return NULL;

2046 if (!ps.PushLiteral(r))

2047 return NULL;

2048 break;

2049 }

2050

2051 case '(':

2052 // "(?" introduces Perl escape.

2053 if ((ps.flags() & PerlX) && (t.size() >= 2 && t[1] == '?')) {

2054 // Flag changes and non-capturing groups.

2055 if (!ps.ParsePerlFlags(&t))

2056 return NULL;

2057 break;

2058 }

2059 if (ps.flags() & NeverCapture) {

2060 if (!ps.DoLeftParenNoCapture())

2061 return NULL;

2062 } else {

2063 if (!ps.DoLeftParen(NULL))

2064 return NULL;

2065 }

2066 t.remove_prefix(1); // '('

2067 break;

2068

2069 case '\|':

2070 if (!ps.DoVerticalBar())

2071 return NULL;

2072 t.remove_prefix(1); // '\|'

2073 break;

2074

2075 case ')':

2076 if (!ps.DoRightParen())

2077 return NULL;

2078 t.remove_prefix(1); // ')'

2079 break;

2080

2081 case '^': // Beginning of line.

2082 if (!ps.PushCarat())

2083 return NULL;

2084 t.remove_prefix(1); // '^'

2085 break;

2086

2087 case '$': // End of line.

2088 if (!ps.PushDollar())

2089 return NULL;

2090 t.remove_prefix(1); // '$'

2091 break;

2092

2093 case '.': // Any character (possibly except newline).

2094 if (!ps.PushDot())

2095 return NULL;

2096 t.remove_prefix(1); // '.'

2097 break;

2098

2099 case '[': { // Character class.

2100 Regexp* re;

2101 if (!ps.ParseCharClass(&t, &re, status))

2102 return NULL;

2103 if (!ps.PushRegexp(re))

2104 return NULL;

2105 break;

2106 }

2107

2108 case '*': { // Zero or more.

2109 RegexpOp op;

2110 op = kRegexpStar;

2111 goto Rep;

2112 case '+': // One or more.

2113 op = kRegexpPlus;

2114 goto Rep;

2115 case '?': // Zero or one.

2116 op = kRegexpQuest;

2117 goto Rep;

2118 Rep:

2119 StringPiece opstr = t;

2120 bool nongreedy = false;

2121 t.remove_prefix(1); // '*' or '+' or '?'

2122 if (ps.flags() & PerlX) {

2123 if (t.size() > 0 && t[0] == '?') {

2124 nongreedy = true;

2125 t.remove_prefix(1); // '?'

2126 }

2127 if (lastunary.size() > 0) {

2128 // In Perl it is not allowed to stack repetition operators:

2129 // a** is a syntax error, not a double-star.

2130 // (and a++ means something else entirely, which we don't support!)

2131 status->set_code(kRegexpRepeatOp);

2132 status->set_error_arg(

2133 StringPiece(lastunary.begin(),

2134 static_cast<int>(t.begin() - lastunary.begin())));

2135 return NULL;

2136 }

2137 }

2138 opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));

2139 if (!ps.PushRepeatOp(op, opstr, nongreedy))

2140 return NULL;

2141 isunary = opstr;

2142 break;

2143 }

2144

2145 case '{': { // Counted repetition.

2146 int lo, hi;

2147 StringPiece opstr = t;

2148 if (!MaybeParseRepetition(&t, &lo, &hi)) {

2149 // Treat like a literal.

2150 if (!ps.PushLiteral('{'))

2151 return NULL;

2152 t.remove_prefix(1); // '{'

2153 break;

2154 }

2155 bool nongreedy = false;

2156 if (ps.flags() & PerlX) {

2157 if (t.size() > 0 && t[0] == '?') {

2158 nongreedy = true;

2159 t.remove_prefix(1); // '?'

2160 }

2161 if (lastunary.size() > 0) {

2162 // Not allowed to stack repetition operators.

2163 status->set_code(kRegexpRepeatOp);

2164 status->set_error_arg(

2165 StringPiece(lastunary.begin(),

2166 static_cast<int>(t.begin() - lastunary.begin())));

2167 return NULL;

2168 }

2169 }

2170 opstr.set(opstr.data(), static_cast<int>(t.data() - opstr.data()));

2171 if (!ps.PushRepetition(lo, hi, opstr, nongreedy))

2172 return NULL;

2173 isunary = opstr;

2174 break;

2175 }

2176

2177 case '\\': { // Escaped character or Perl sequence.

2178 // \b and \B: word boundary or not

2179 if ((ps.flags() & Regexp::PerlB) &&

2180 t.size() >= 2 && (t[1] == 'b' \|\| t[1] == 'B')) {

2181 if (!ps.PushWordBoundary(t[1] == 'b'))

2182 return NULL;

2183 t.remove_prefix(2); // '\\', 'b'

2184 break;

2185 }

2186

2187 if ((ps.flags() & Regexp::PerlX) && t.size() >= 2) {

2188 if (t[1] == 'A') {

2189 if (!ps.PushSimpleOp(kRegexpBeginText))

2190 return NULL;

2191 t.remove_prefix(2); // '\\', 'A'

2192 break;

2193 }

2194 if (t[1] == 'z') {

2195 if (!ps.PushSimpleOp(kRegexpEndText))

2196 return NULL;

2197 t.remove_prefix(2); // '\\', 'z'

2198 break;

2199 }

2200 // Do not recognize \Z, because this library can't

2201 // implement the exact Perl/PCRE semantics.

2202 // (This library treats "(?-m)$" as \z, even though

2203 // in Perl and PCRE it is equivalent to \Z.)

2204

2205 if (t[1] == 'C') { // \C: any byte [sic]

2206 if (!ps.PushSimpleOp(kRegexpAnyByte))

2207 return NULL;

2208 t.remove_prefix(2); // '\\', 'C'

2209 break;

2210 }

2211

2212 if (t[1] == 'Q') { // \Q ... \E: the ... is always literals

2213 t.remove_prefix(2); // '\\', 'Q'

2214 while (t.size() > 0) {

2215 if (t.size() >= 2 && t[0] == '\\' && t[1] == 'E') {

2216 t.remove_prefix(2); // '\\', 'E'

2217 break;

2218 }

2219 Rune r;

2220 if (StringPieceToRune(&r, &t, status) < 0)

2221 return NULL;

2222 if (!ps.PushLiteral(r))

2223 return NULL;

2224 }

2225 break;

2226 }

2227 }

2228

2229 if (t.size() >= 2 && (t[1] == 'p' \|\| t[1] == 'P')) {

2230 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);

2231 re->ccb_ = new CharClassBuilder;

2232 switch (ParseUnicodeGroup(&t, ps.flags(), re->ccb_, status)) {

2233 case kParseOk:

2234 if (!ps.PushRegexp(re))

2235 return NULL;

2236 goto Break2;

2237 case kParseError:

2238 re->Decref();

2239 return NULL;

2240 case kParseNothing:

2241 re->Decref();

2242 break;

2243 }

2244 }

2245

2246 const UGroup *g = MaybeParsePerlCCEscape(&t, ps.flags());

2247 if (g != NULL) {

2248 Regexp* re = new Regexp(kRegexpCharClass, ps.flags() & ~FoldCase);

2249 re->ccb_ = new CharClassBuilder;

2250 AddUGroup(re->ccb_, g, g->sign, ps.flags());

2251 if (!ps.PushRegexp(re))

2252 return NULL;

2253 break;

2254 }

2255

2256 Rune r;

2257 if (!ParseEscape(&t, &r, status, ps.rune_max()))

2258 return NULL;

2259 if (!ps.PushLiteral(r))

2260 return NULL;

2261 break;

2262 }

2263 }

2264 Break2:

2265 lastunary = isunary;

2266 }

2267 return ps.DoFinish();

2268 }

2269

2270 } // namespace re2

OLD	NEW

« no previous file with comments | « third_party/re2/re2/onepass.cc ('k') | third_party/re2/re2/perl_groups.cc » ('j') | no next file with comments »