runtime/vm/regexp_parser.cc - Issue 2481873005: clang-format runtime/vm

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 2481873005: clang-format runtime/vm (Closed)

Patch Set: Merge Created 4 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

5 #include "vm/longjump.h"	5 #include "vm/longjump.h"

6 #include "vm/object_store.h"	6 #include "vm/object_store.h"

7 #include "vm/regexp_parser.h"	7 #include "vm/regexp_parser.h"

8	8

9 namespace dart {	9 namespace dart {

10	10

11 #define Z zone()	11 #define Z zone()

12	12

13 // Enables possessive quantifier syntax for testing.	13 // Enables possessive quantifier syntax for testing.

14 static const bool FLAG_regexp_possessive_quantifier = false;	14 static const bool FLAG_regexp_possessive_quantifier = false;

15	15

16 RegExpBuilder::RegExpBuilder()	16 RegExpBuilder::RegExpBuilder()

17 : zone_(Thread::Current()->zone()),	17 : zone_(Thread::Current()->zone()),

18 pending_empty_(false),	18 pending_empty_(false),

19 characters_(NULL),	19 characters_(NULL),

20 terms_(),	20 terms_(),

21 text_(),	21 text_(),

22 alternatives_()	22 alternatives_()

23 #ifdef DEBUG	23 #ifdef DEBUG

24 , last_added_(ADD_NONE)	24 ,

	25 last_added_(ADD_NONE)

25 #endif	26 #endif

26 {}	27 {

	28 }

27	29

28	30

29 void RegExpBuilder::FlushCharacters() {	31 void RegExpBuilder::FlushCharacters() {

30 pending_empty_ = false;	32 pending_empty_ = false;

31 if (characters_ != NULL) {	33 if (characters_ != NULL) {

32 RegExpTree* atom = new(Z) RegExpAtom(characters_);	34 RegExpTree* atom = new (Z) RegExpAtom(characters_);

33 characters_ = NULL;	35 characters_ = NULL;

34 text_.Add(atom);	36 text_.Add(atom);

35 LAST(ADD_ATOM);	37 LAST(ADD_ATOM);

36 }	38 }

37 }	39 }

38	40

39	41

40 void RegExpBuilder::FlushText() {	42 void RegExpBuilder::FlushText() {

41 FlushCharacters();	43 FlushCharacters();

42 intptr_t num_text = text_.length();	44 intptr_t num_text = text_.length();

43 if (num_text == 0) {	45 if (num_text == 0) {

44 return;	46 return;

45 } else if (num_text == 1) {	47 } else if (num_text == 1) {

46 terms_.Add(text_.Last());	48 terms_.Add(text_.Last());

47 } else {	49 } else {

48 RegExpText* text = new(Z) RegExpText();	50 RegExpText* text = new (Z) RegExpText();

49 for (intptr_t i = 0; i < num_text; i++)	51 for (intptr_t i = 0; i < num_text; i++)

50 text_[i]->AppendToText(text);	52 text_[i]->AppendToText(text);

51 terms_.Add(text);	53 terms_.Add(text);

52 }	54 }

53 text_.Clear();	55 text_.Clear();

54 }	56 }

55	57

56	58

57 void RegExpBuilder::AddCharacter(uint16_t c) {	59 void RegExpBuilder::AddCharacter(uint16_t c) {

58 pending_empty_ = false;	60 pending_empty_ = false;

59 if (characters_ == NULL) {	61 if (characters_ == NULL) {

60 characters_ = new(Z) ZoneGrowableArray<uint16_t>(4);	62 characters_ = new (Z) ZoneGrowableArray<uint16_t>(4);

61 }	63 }

62 characters_->Add(c);	64 characters_->Add(c);

63 LAST(ADD_CHAR);	65 LAST(ADD_CHAR);

64 }	66 }

65	67

66	68

67 void RegExpBuilder::AddEmpty() {	69 void RegExpBuilder::AddEmpty() {

68 pending_empty_ = true;	70 pending_empty_ = true;

69 }	71 }

70	72

(...skipping 29 matching lines...) Expand all Loading...
100 void RegExpBuilder::FlushTerms() {	102 void RegExpBuilder::FlushTerms() {

101 FlushText();	103 FlushText();

102 intptr_t num_terms = terms_.length();	104 intptr_t num_terms = terms_.length();

103 RegExpTree* alternative;	105 RegExpTree* alternative;

104 if (num_terms == 0) {	106 if (num_terms == 0) {

105 alternative = RegExpEmpty::GetInstance();	107 alternative = RegExpEmpty::GetInstance();

106 } else if (num_terms == 1) {	108 } else if (num_terms == 1) {

107 alternative = terms_.Last();	109 alternative = terms_.Last();

108 } else {	110 } else {

109 ZoneGrowableArray<RegExpTree> terms =	111 ZoneGrowableArray<RegExpTree> terms =

110 new(Z) ZoneGrowableArray<RegExpTree*>();	112 new (Z) ZoneGrowableArray<RegExpTree*>();

111 for (intptr_t i = 0; i < terms_.length(); i++) {	113 for (intptr_t i = 0; i < terms_.length(); i++) {

112 terms->Add(terms_[i]);	114 terms->Add(terms_[i]);

113 }	115 }

114 alternative = new(Z) RegExpAlternative(terms);	116 alternative = new (Z) RegExpAlternative(terms);

115 }	117 }

116 alternatives_.Add(alternative);	118 alternatives_.Add(alternative);

117 terms_.Clear();	119 terms_.Clear();

118 LAST(ADD_NONE);	120 LAST(ADD_NONE);

119 }	121 }

120	122

121	123

122 RegExpTree* RegExpBuilder::ToRegExp() {	124 RegExpTree* RegExpBuilder::ToRegExp() {

123 FlushTerms();	125 FlushTerms();

124 intptr_t num_alternatives = alternatives_.length();	126 intptr_t num_alternatives = alternatives_.length();

125 if (num_alternatives == 0) {	127 if (num_alternatives == 0) {

126 return RegExpEmpty::GetInstance();	128 return RegExpEmpty::GetInstance();

127 }	129 }

128 if (num_alternatives == 1) {	130 if (num_alternatives == 1) {

129 return alternatives_.Last();	131 return alternatives_.Last();

130 }	132 }

131 ZoneGrowableArray<RegExpTree> alternatives =	133 ZoneGrowableArray<RegExpTree> alternatives =

132 new(Z) ZoneGrowableArray<RegExpTree*>();	134 new (Z) ZoneGrowableArray<RegExpTree*>();

133 for (intptr_t i = 0; i < alternatives_.length(); i++) {	135 for (intptr_t i = 0; i < alternatives_.length(); i++) {

134 alternatives->Add(alternatives_[i]);	136 alternatives->Add(alternatives_[i]);

135 }	137 }

136 return new(Z) RegExpDisjunction(alternatives);	138 return new (Z) RegExpDisjunction(alternatives);

137 }	139 }

138	140

139	141

140 void RegExpBuilder::AddQuantifierToAtom(	142 void RegExpBuilder::AddQuantifierToAtom(

141 intptr_t min,	143 intptr_t min,

142 intptr_t max,	144 intptr_t max,

143 RegExpQuantifier::QuantifierType quantifier_type) {	145 RegExpQuantifier::QuantifierType quantifier_type) {

144 if (pending_empty_) {	146 if (pending_empty_) {

145 pending_empty_ = false;	147 pending_empty_ = false;

146 return;	148 return;

147 }	149 }

148 RegExpTree* atom;	150 RegExpTree* atom;

149 if (characters_ != NULL) {	151 if (characters_ != NULL) {

150 DEBUG_ASSERT(last_added_ == ADD_CHAR);	152 DEBUG_ASSERT(last_added_ == ADD_CHAR);

151 // Last atom was character.	153 // Last atom was character.

152	154

153 ZoneGrowableArray<uint16_t> *char_vector =	155 ZoneGrowableArray<uint16_t>* char_vector =

154 new(Z) ZoneGrowableArray<uint16_t>();	156 new (Z) ZoneGrowableArray<uint16_t>();

155 char_vector->AddArray(*characters_);	157 char_vector->AddArray(*characters_);

156 intptr_t num_chars = char_vector->length();	158 intptr_t num_chars = char_vector->length();

157 if (num_chars > 1) {	159 if (num_chars > 1) {

158 ZoneGrowableArray<uint16_t> *prefix =	160 ZoneGrowableArray<uint16_t>* prefix =

159 new(Z) ZoneGrowableArray<uint16_t>();	161 new (Z) ZoneGrowableArray<uint16_t>();

160 for (intptr_t i = 0; i < num_chars - 1; i++) {	162 for (intptr_t i = 0; i < num_chars - 1; i++) {

161 prefix->Add(char_vector->At(i));	163 prefix->Add(char_vector->At(i));

162 }	164 }

163 text_.Add(new(Z) RegExpAtom(prefix));	165 text_.Add(new (Z) RegExpAtom(prefix));

164 ZoneGrowableArray<uint16_t> *tail = new(Z) ZoneGrowableArray<uint16_t>();	166 ZoneGrowableArray<uint16_t>* tail = new (Z) ZoneGrowableArray<uint16_t>();

165 tail->Add(char_vector->At(num_chars - 1));	167 tail->Add(char_vector->At(num_chars - 1));

166 char_vector = tail;	168 char_vector = tail;

167 }	169 }

168 characters_ = NULL;	170 characters_ = NULL;

169 atom = new(Z) RegExpAtom(char_vector);	171 atom = new (Z) RegExpAtom(char_vector);

170 FlushText();	172 FlushText();

171 } else if (text_.length() > 0) {	173 } else if (text_.length() > 0) {

172 DEBUG_ASSERT(last_added_ == ADD_ATOM);	174 DEBUG_ASSERT(last_added_ == ADD_ATOM);

173 atom = text_.RemoveLast();	175 atom = text_.RemoveLast();

174 FlushText();	176 FlushText();

175 } else if (terms_.length() > 0) {	177 } else if (terms_.length() > 0) {

176 DEBUG_ASSERT(last_added_ == ADD_ATOM);	178 DEBUG_ASSERT(last_added_ == ADD_ATOM);

177 atom = terms_.RemoveLast();	179 atom = terms_.RemoveLast();

178 if (atom->max_match() == 0) {	180 if (atom->max_match() == 0) {

179 // Guaranteed to only match an empty string.	181 // Guaranteed to only match an empty string.

180 LAST(ADD_TERM);	182 LAST(ADD_TERM);

181 if (min == 0) {	183 if (min == 0) {

182 return;	184 return;

183 }	185 }

184 terms_.Add(atom);	186 terms_.Add(atom);

185 return;	187 return;

186 }	188 }

187 } else {	189 } else {

188 // Only call immediately after adding an atom or character!	190 // Only call immediately after adding an atom or character!

189 UNREACHABLE();	191 UNREACHABLE();

190 return;	192 return;

191 }	193 }

192 terms_.Add(new(Z) RegExpQuantifier(min, max, quantifier_type, atom));	194 terms_.Add(new (Z) RegExpQuantifier(min, max, quantifier_type, atom));

193 LAST(ADD_TERM);	195 LAST(ADD_TERM);

194 }	196 }

195	197

196 // ----------------------------------------------------------------------------	198 // ----------------------------------------------------------------------------

197 // Implementation of Parser	199 // Implementation of Parser

198	200

199 RegExpParser::RegExpParser(const String& in,	201 RegExpParser::RegExpParser(const String& in, String* error, bool multiline)

200 String* error,

201 bool multiline)

202 : zone_(Thread::Current()->zone()),	202 : zone_(Thread::Current()->zone()),

203 error_(error),	203 error_(error),

204 captures_(NULL),	204 captures_(NULL),

205 in_(in),	205 in_(in),

206 current_(kEndMarker),	206 current_(kEndMarker),

207 next_pos_(0),	207 next_pos_(0),

208 capture_count_(0),	208 capture_count_(0),

209 has_more_(true),	209 has_more_(true),

210 multiline_(multiline),	210 multiline_(multiline),

211 simple_(false),	211 simple_(false),

212 contains_anchor_(false),	212 contains_anchor_(false),

213 is_scanned_for_captures_(false),	213 is_scanned_for_captures_(false),

214 failed_(false) {	214 failed_(false) {

215 Advance();	215 Advance();

216 }	216 }

217	217

218	218

219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {	219 bool RegExpParser::ParseFunction(ParsedFunction* parsed_function) {

220 VMTagScope tagScope(parsed_function->thread(),	220 VMTagScope tagScope(parsed_function->thread(),

221 VMTag::kCompileParseRegExpTagId);	221 VMTag::kCompileParseRegExpTagId);

222 Zone* zone = parsed_function->zone();	222 Zone* zone = parsed_function->zone();

223 RegExp& regexp = RegExp::Handle(parsed_function->function().regexp());	223 RegExp& regexp = RegExp::Handle(parsed_function->function().regexp());

224	224

225 const String& pattern = String::Handle(regexp.pattern());	225 const String& pattern = String::Handle(regexp.pattern());

226 const bool multiline = regexp.is_multi_line();	226 const bool multiline = regexp.is_multi_line();

227	227

228 RegExpCompileData* compile_data = new(zone) RegExpCompileData();	228 RegExpCompileData* compile_data = new (zone) RegExpCompileData();

229 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {	229 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {

230 // Parsing failures are handled in the RegExp factory constructor.	230 // Parsing failures are handled in the RegExp factory constructor.

231 UNREACHABLE();	231 UNREACHABLE();

232 }	232 }

233	233

234 regexp.set_num_bracket_expressions(compile_data->capture_count);	234 regexp.set_num_bracket_expressions(compile_data->capture_count);

235 if (compile_data->simple) {	235 if (compile_data->simple) {

236 regexp.set_is_simple();	236 regexp.set_is_simple();

237 } else {	237 } else {

238 regexp.set_is_complex();	238 regexp.set_is_complex();

(...skipping 81 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
320 // Atom	320 // Atom

321 // Atom Quantifier	321 // Atom Quantifier

322 RegExpTree* RegExpParser::ParseDisjunction() {	322 RegExpTree* RegExpParser::ParseDisjunction() {

323 // Used to store current state while parsing subexpressions.	323 // Used to store current state while parsing subexpressions.

324 RegExpParserState initial_state(NULL, INITIAL, 0, Z);	324 RegExpParserState initial_state(NULL, INITIAL, 0, Z);

325 RegExpParserState* stored_state = &initial_state;	325 RegExpParserState* stored_state = &initial_state;

326 // Cache the builder in a local variable for quick access.	326 // Cache the builder in a local variable for quick access.

327 RegExpBuilder* builder = initial_state.builder();	327 RegExpBuilder* builder = initial_state.builder();

328 while (true) {	328 while (true) {

329 switch (current()) {	329 switch (current()) {

330 case kEndMarker:	330 case kEndMarker:

331 if (stored_state->IsSubexpression()) {	331 if (stored_state->IsSubexpression()) {

332 // Inside a parenthesized group when hitting end of input.	332 // Inside a parenthesized group when hitting end of input.

333 ReportError("Unterminated group");	333 ReportError("Unterminated group");

	334 UNREACHABLE();

	335 }

	336 ASSERT(INITIAL == stored_state->group_type());

	337 // Parsing completed successfully.

	338 return builder->ToRegExp();

	339 case ')': {

	340 if (!stored_state->IsSubexpression()) {

	341 ReportError("Unmatched ')'");

	342 UNREACHABLE();

	343 }

	344 ASSERT(INITIAL != stored_state->group_type());

	345

	346 Advance();

	347 // End disjunction parsing and convert builder content to new single

	348 // regexp atom.

	349 RegExpTree* body = builder->ToRegExp();

	350

	351 intptr_t end_capture_index = captures_started();

	352

	353 intptr_t capture_index = stored_state->capture_index();

	354 SubexpressionType group_type = stored_state->group_type();

	355

	356 // Restore previous state.

	357 stored_state = stored_state->previous_state();

	358 builder = stored_state->builder();

	359

	360 // Build result of subexpression.

	361 if (group_type == CAPTURE) {

	362 RegExpCapture* capture = new (Z) RegExpCapture(body, capture_index);

	363 (*captures_)[capture_index - 1] = capture;

	364 body = capture;

	365 } else if (group_type != GROUPING) {

	366 ASSERT(group_type == POSITIVE_LOOKAHEAD \|\|

	367 group_type == NEGATIVE_LOOKAHEAD);

	368 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);

	369 body = new (Z)

	370 RegExpLookahead(body, is_positive,

	371 end_capture_index - capture_index, capture_index);

	372 }

	373 builder->AddAtom(body);

	374 // For compatibility with JSC and ES3, we allow quantifiers after

	375 // lookaheads, and break in all cases.

	376 break;

	377 }

	378 case '\|': {

	379 Advance();

	380 builder->NewAlternative();

	381 continue;

	382 }

	383 case '*':

	384 case '+':

	385 case '?':

	386 ReportError("Nothing to repeat");

334 UNREACHABLE();	387 UNREACHABLE();

335 }	388 case '^': {

336 ASSERT(INITIAL == stored_state->group_type());	389 Advance();

337 // Parsing completed successfully.	390 if (multiline_) {

338 return builder->ToRegExp();	391 builder->AddAssertion(

339 case ')': {	392 new (Z) RegExpAssertion(RegExpAssertion::START_OF_LINE));

340 if (!stored_state->IsSubexpression()) {	393 } else {

341 ReportError("Unmatched ')'");	394 builder->AddAssertion(

342 UNREACHABLE();	395 new (Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

343 }	396 set_contains_anchor();

344 ASSERT(INITIAL != stored_state->group_type());	397 }

345	398 continue;

346 Advance();	399 }

347 // End disjunction parsing and convert builder content to new single	400 case '$': {

348 // regexp atom.	401 Advance();

349 RegExpTree* body = builder->ToRegExp();	402 RegExpAssertion::AssertionType assertion_type =

350	403 multiline_ ? RegExpAssertion::END_OF_LINE

351 intptr_t end_capture_index = captures_started();	404 : RegExpAssertion::END_OF_INPUT;

352	405 builder->AddAssertion(new RegExpAssertion(assertion_type));

353 intptr_t capture_index = stored_state->capture_index();	406 continue;

354 SubexpressionType group_type = stored_state->group_type();	407 }

355	408 case '.': {

356 // Restore previous state.	409 Advance();

357 stored_state = stored_state->previous_state();	410 // everything except \x0a, \x0d, \u2028 and \u2029

358 builder = stored_state->builder();

359

360 // Build result of subexpression.

361 if (group_type == CAPTURE) {

362 RegExpCapture* capture = new(Z) RegExpCapture(body, capture_index);

363 (*captures_)[capture_index - 1] = capture;

364 body = capture;

365 } else if (group_type != GROUPING) {

366 ASSERT(group_type == POSITIVE_LOOKAHEAD \|\|

367 group_type == NEGATIVE_LOOKAHEAD);

368 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);

369 body = new(Z) RegExpLookahead(body,

370 is_positive,

371 end_capture_index - capture_index,

372 capture_index);

373 }

374 builder->AddAtom(body);

375 // For compatibility with JSC and ES3, we allow quantifiers after

376 // lookaheads, and break in all cases.

377 break;

378 }

379 case '\|': {

380 Advance();

381 builder->NewAlternative();

382 continue;

383 }

384 case '*':

385 case '+':

386 case '?':

387 ReportError("Nothing to repeat");

388 UNREACHABLE();

389 case '^': {

390 Advance();

391 if (multiline_) {

392 builder->AddAssertion(

393 new(Z) RegExpAssertion(RegExpAssertion::START_OF_LINE));

394 } else {

395 builder->AddAssertion(

396 new(Z) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

397 set_contains_anchor();

398 }

399 continue;

400 }

401 case '$': {

402 Advance();

403 RegExpAssertion::AssertionType assertion_type =

404 multiline_ ? RegExpAssertion::END_OF_LINE :

405 RegExpAssertion::END_OF_INPUT;

406 builder->AddAssertion(new RegExpAssertion(assertion_type));

407 continue;

408 }

409 case '.': {

410 Advance();

411 // everything except \x0a, \x0d, \u2028 and \u2029

412 ZoneGrowableArray<CharacterRange>* ranges =

413 new ZoneGrowableArray<CharacterRange>(2);

414 CharacterRange::AddClassEscape('.', ranges);

415 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

416 builder->AddAtom(atom);

417 break;

418 }

419 case '(': {

420 SubexpressionType subexpr_type = CAPTURE;

421 Advance();

422 if (current() == '?') {

423 switch (Next()) {

424 case ':':

425 subexpr_type = GROUPING;

426 break;

427 case '=':

428 subexpr_type = POSITIVE_LOOKAHEAD;

429 break;

430 case '!':

431 subexpr_type = NEGATIVE_LOOKAHEAD;

432 break;

433 default:

434 ReportError("Invalid group");

435 UNREACHABLE();

436 }

437 Advance(2);

438 } else {

439 if (captures_ == NULL) {

440 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);

441 }

442 if (captures_started() >= kMaxCaptures) {

443 ReportError("Too many captures");

444 UNREACHABLE();

445 }

446 captures_->Add(NULL);

447 }

448 // Store current state and begin new disjunction parsing.

449 stored_state = new RegExpParserState(stored_state, subexpr_type,

450 captures_started(), Z);

451 builder = stored_state->builder();

452 continue;

453 }

454 case '[': {

455 RegExpTree* atom = ParseCharacterClass();

456 builder->AddAtom(atom);

457 break;

458 }

459 // Atom ::

460 // \ AtomEscape

461 case '\\':

462 switch (Next()) {

463 case kEndMarker:

464 ReportError("\\ at end of pattern");

465 UNREACHABLE();

466 case 'b':

467 Advance(2);

468 builder->AddAssertion(

469 new RegExpAssertion(RegExpAssertion::BOUNDARY));

470 continue;

471 case 'B':

472 Advance(2);

473 builder->AddAssertion(

474 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

475 continue;

476 // AtomEscape ::

477 // CharacterClassEscape

478 //

479 // CharacterClassEscape :: one of

480 // d D s S w W

481 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

482 uint32_t c = Next();

483 Advance(2);

484 ZoneGrowableArray<CharacterRange>* ranges =	411 ZoneGrowableArray<CharacterRange>* ranges =

485 new ZoneGrowableArray<CharacterRange>(2);	412 new ZoneGrowableArray<CharacterRange>(2);

486 CharacterRange::AddClassEscape(c, ranges);	413 CharacterRange::AddClassEscape('.', ranges);

487 RegExpTree* atom = new RegExpCharacterClass(ranges, false);	414 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

488 builder->AddAtom(atom);	415 builder->AddAtom(atom);

489 break;	416 break;

490 }	417 }

491 case '1': case '2': case '3': case '4': case '5': case '6':	418 case '(': {

492 case '7': case '8': case '9': {	419 SubexpressionType subexpr_type = CAPTURE;

493 intptr_t index = 0;	420 Advance();

494 if (ParseBackReferenceIndex(&index)) {	421 if (current() == '?') {

495 RegExpCapture* capture = NULL;	422 switch (Next()) {

496 if (captures_ != NULL && index <= captures_->length()) {	423 case ':':

497 capture = captures_->At(index - 1);	424 subexpr_type = GROUPING;

498 }	425 break;

499 if (capture == NULL) {	426 case '=':

500 builder->AddEmpty();	427 subexpr_type = POSITIVE_LOOKAHEAD;

501 break;	428 break;

502 }	429 case '!':

503 RegExpTree* atom = new RegExpBackReference(capture);	430 subexpr_type = NEGATIVE_LOOKAHEAD;

504 builder->AddAtom(atom);	431 break;

505 break;	432 default:

506 }	433 ReportError("Invalid group");

507 uint32_t first_digit = Next();	434 UNREACHABLE();

508 if (first_digit == '8' \|\| first_digit == '9') {	435 }

509 // Treat as identity escape

510 builder->AddCharacter(first_digit);

511 Advance(2);	436 Advance(2);

512 break;

513 }

514 }

515 // FALLTHROUGH

516 case '0': {

517 Advance();

518 uint32_t octal = ParseOctalLiteral();

519 builder->AddCharacter(octal);

520 break;

521 }

522 // ControlEscape :: one of

523 // f n r t v

524 case 'f':

525 Advance(2);

526 builder->AddCharacter('\f');

527 break;

528 case 'n':

529 Advance(2);

530 builder->AddCharacter('\n');

531 break;

532 case 'r':

533 Advance(2);

534 builder->AddCharacter('\r');

535 break;

536 case 't':

537 Advance(2);

538 builder->AddCharacter('\t');

539 break;

540 case 'v':

541 Advance(2);

542 builder->AddCharacter('\v');

543 break;

544 case 'c': {

545 Advance();

546 uint32_t controlLetter = Next();

547 // Special case if it is an ASCII letter.

548 // Convert lower case letters to uppercase.

549 uint32_t letter = controlLetter & ~('a' ^ 'A');

550 if (letter < 'A' \|\| 'Z' < letter) {

551 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

552 // This is outside the specification. We match JSC in

553 // reading the backslash as a literal character instead

554 // of as starting an escape.

555 builder->AddCharacter('\\');

556 } else {	437 } else {

557 Advance(2);	438 if (captures_ == NULL) {

558 builder->AddCharacter(controlLetter & 0x1f);	439 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);

559 }	440 }

560 break;	441 if (captures_started() >= kMaxCaptures) {

561 }	442 ReportError("Too many captures");

562 case 'x': {	443 UNREACHABLE();

563 Advance(2);	444 }

564 uint32_t value;	445 captures_->Add(NULL);

565 if (ParseHexEscape(2, &value)) {	446 }

566 builder->AddCharacter(value);	447 // Store current state and begin new disjunction parsing.

567 } else {	448 stored_state = new RegExpParserState(stored_state, subexpr_type,

568 builder->AddCharacter('x');	449 captures_started(), Z);

569 }	450 builder = stored_state->builder();

570 break;	451 continue;

571 }	452 }

572 case 'u': {	453 case '[': {

573 Advance(2);	454 RegExpTree* atom = ParseCharacterClass();

574 uint32_t value;	455 builder->AddAtom(atom);

575 if (ParseHexEscape(4, &value)) {	456 break;

576 builder->AddCharacter(value);	457 }

577 } else {	458 // Atom ::

578 builder->AddCharacter('u');	459 // \ AtomEscape

579 }	460 case '\\':

580 break;	461 switch (Next()) {

	462 case kEndMarker:

	463 ReportError("\\ at end of pattern");

	464 UNREACHABLE();

	465 case 'b':

	466 Advance(2);

	467 builder->AddAssertion(

	468 new RegExpAssertion(RegExpAssertion::BOUNDARY));

	469 continue;

	470 case 'B':

	471 Advance(2);

	472 builder->AddAssertion(

	473 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

	474 continue;

	475 // AtomEscape ::

	476 // CharacterClassEscape

	477 //

	478 // CharacterClassEscape :: one of

	479 // d D s S w W

	480 case 'd':

	481 case 'D':

	482 case 's':

	483 case 'S':

	484 case 'w':

	485 case 'W': {

	486 uint32_t c = Next();

	487 Advance(2);

	488 ZoneGrowableArray<CharacterRange>* ranges =

	489 new ZoneGrowableArray<CharacterRange>(2);

	490 CharacterRange::AddClassEscape(c, ranges);

	491 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	492 builder->AddAtom(atom);

	493 break;

	494 }

	495 case '1':

	496 case '2':

	497 case '3':

	498 case '4':

	499 case '5':

	500 case '6':

	501 case '7':

	502 case '8':

	503 case '9': {

	504 intptr_t index = 0;

	505 if (ParseBackReferenceIndex(&index)) {

	506 RegExpCapture* capture = NULL;

	507 if (captures_ != NULL && index <= captures_->length()) {

	508 capture = captures_->At(index - 1);

	509 }

	510 if (capture == NULL) {

	511 builder->AddEmpty();

	512 break;

	513 }

	514 RegExpTree* atom = new RegExpBackReference(capture);

	515 builder->AddAtom(atom);

	516 break;

	517 }

	518 uint32_t first_digit = Next();

	519 if (first_digit == '8' \|\| first_digit == '9') {

	520 // Treat as identity escape

	521 builder->AddCharacter(first_digit);

	522 Advance(2);

	523 break;

	524 }

	525 }

	526 // FALLTHROUGH

	527 case '0': {

	528 Advance();

	529 uint32_t octal = ParseOctalLiteral();

	530 builder->AddCharacter(octal);

	531 break;

	532 }

	533 // ControlEscape :: one of

	534 // f n r t v

	535 case 'f':

	536 Advance(2);

	537 builder->AddCharacter('\f');

	538 break;

	539 case 'n':

	540 Advance(2);

	541 builder->AddCharacter('\n');

	542 break;

	543 case 'r':

	544 Advance(2);

	545 builder->AddCharacter('\r');

	546 break;

	547 case 't':

	548 Advance(2);

	549 builder->AddCharacter('\t');

	550 break;

	551 case 'v':

	552 Advance(2);

	553 builder->AddCharacter('\v');

	554 break;

	555 case 'c': {

	556 Advance();

	557 uint32_t controlLetter = Next();

	558 // Special case if it is an ASCII letter.

	559 // Convert lower case letters to uppercase.

	560 uint32_t letter = controlLetter & ~('a' ^ 'A');

	561 if (letter < 'A' \|\| 'Z' < letter) {

	562 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

	563 // This is outside the specification. We match JSC in

	564 // reading the backslash as a literal character instead

	565 // of as starting an escape.

	566 builder->AddCharacter('\\');

	567 } else {

	568 Advance(2);

	569 builder->AddCharacter(controlLetter & 0x1f);

	570 }

	571 break;

	572 }

	573 case 'x': {

	574 Advance(2);

	575 uint32_t value;

	576 if (ParseHexEscape(2, &value)) {

	577 builder->AddCharacter(value);

	578 } else {

	579 builder->AddCharacter('x');

	580 }

	581 break;

	582 }

	583 case 'u': {

	584 Advance(2);

	585 uint32_t value;

	586 if (ParseHexEscape(4, &value)) {

	587 builder->AddCharacter(value);

	588 } else {

	589 builder->AddCharacter('u');

	590 }

	591 break;

	592 }

	593 default:

	594 // Identity escape.

	595 builder->AddCharacter(Next());

	596 Advance(2);

	597 break;

	598 }

	599 break;

	600 case '{': {

	601 intptr_t dummy;

	602 if (ParseIntervalQuantifier(&dummy, &dummy)) {

	603 ReportError("Nothing to repeat");

	604 UNREACHABLE();

	605 }

	606 // fallthrough

581 }	607 }

582 default:	608 default:

583 // Identity escape.	609 builder->AddCharacter(current());

584 builder->AddCharacter(Next());	610 Advance();

585 Advance(2);	611 break;

586 break;

587 }

588 break;

589 case '{': {

590 intptr_t dummy;

591 if (ParseIntervalQuantifier(&dummy, &dummy)) {

592 ReportError("Nothing to repeat");

593 UNREACHABLE();

594 }

595 // fallthrough

596 }

597 default:

598 builder->AddCharacter(current());

599 Advance();

600 break;

601 } // end switch(current())	612 } // end switch(current())

602	613

603 intptr_t min;	614 intptr_t min;

604 intptr_t max;	615 intptr_t max;

605 switch (current()) {	616 switch (current()) {

606 // QuantifierPrefix ::	617 // QuantifierPrefix ::

607 // *	618 // *

608 // +	619 // +

609 // ?	620 // ?

610 // {	621 // {

611 case '*':	622 case '*':

612 min = 0;	623 min = 0;

613 max = RegExpTree::kInfinity;	624 max = RegExpTree::kInfinity;

614 Advance();	625 Advance();

615 break;	626 break;

616 case '+':	627 case '+':

617 min = 1;	628 min = 1;

618 max = RegExpTree::kInfinity;	629 max = RegExpTree::kInfinity;

619 Advance();	630 Advance();

620 break;	631 break;

621 case '?':	632 case '?':

622 min = 0;	633 min = 0;

623 max = 1;	634 max = 1;

624 Advance();	635 Advance();

625 break;	636 break;

626 case '{':	637 case '{':

627 if (ParseIntervalQuantifier(&min, &max)) {	638 if (ParseIntervalQuantifier(&min, &max)) {

628 if (max < min) {	639 if (max < min) {

629 ReportError("numbers out of order in {} quantifier.");	640 ReportError("numbers out of order in {} quantifier.");

630 UNREACHABLE();	641 UNREACHABLE();

631 }	642 }

632 break;	643 break;

633 } else {	644 } else {

634 continue;	645 continue;

635 }	646 }

636 default:	647 default:

637 continue;	648 continue;

638 }	649 }

639 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;	650 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

640 if (current() == '?') {	651 if (current() == '?') {

641 quantifier_type = RegExpQuantifier::NON_GREEDY;	652 quantifier_type = RegExpQuantifier::NON_GREEDY;

642 Advance();	653 Advance();

643 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {	654 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

644 // FLAG_regexp_possessive_quantifier is a debug-only flag.	655 // FLAG_regexp_possessive_quantifier is a debug-only flag.

645 quantifier_type = RegExpQuantifier::POSSESSIVE;	656 quantifier_type = RegExpQuantifier::POSSESSIVE;

646 Advance();	657 Advance();

647 }	658 }

648 builder->AddQuantifierToAtom(min, max, quantifier_type);	659 builder->AddQuantifierToAtom(min, max, quantifier_type);

649 }	660 }

650 }	661 }

651	662

652	663

653 #ifdef DEBUG	664 #ifdef DEBUG

654 // Currently only used in an ASSERT.	665 // Currently only used in an ASSERT.

655 static bool IsSpecialClassEscape(uint32_t c) {	666 static bool IsSpecialClassEscape(uint32_t c) {

656 switch (c) {	667 switch (c) {

657 case 'd': case 'D':	668 case 'd':

658 case 's': case 'S':	669 case 'D':

659 case 'w': case 'W':	670 case 's':

	671 case 'S':

	672 case 'w':

	673 case 'W':

660 return true;	674 return true;

661 default:	675 default:

662 return false;	676 return false;

663 }	677 }

664 }	678 }

665 #endif	679 #endif

666	680

667	681

668 // In order to know whether an escape is a backreference or not we have to scan	682 // In order to know whether an escape is a backreference or not we have to scan

669 // the entire regexp and find the number of capturing parentheses. However we	683 // the entire regexp and find the number of capturing parentheses. However we

(...skipping 166 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
836 // If c is not a legal hexadecimal character, returns a value < 0.	850 // If c is not a legal hexadecimal character, returns a value < 0.

837 static inline intptr_t HexValue(uint32_t c) {	851 static inline intptr_t HexValue(uint32_t c) {

838 c -= '0';	852 c -= '0';

839 if (static_cast<unsigned>(c) <= 9) return c;	853 if (static_cast<unsigned>(c) <= 9) return c;

840 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.	854 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

841 if (static_cast<unsigned>(c) <= 5) return c + 10;	855 if (static_cast<unsigned>(c) <= 5) return c + 10;

842 return -1;	856 return -1;

843 }	857 }

844	858

845	859

846 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {	860 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t* value) {

847 intptr_t start = position();	861 intptr_t start = position();

848 uint32_t val = 0;	862 uint32_t val = 0;

849 bool done = false;	863 bool done = false;

850 for (intptr_t i = 0; !done; i++) {	864 for (intptr_t i = 0; !done; i++) {

851 uint32_t c = current();	865 uint32_t c = current();

852 intptr_t d = HexValue(c);	866 intptr_t d = HexValue(c);

853 if (d < 0) {	867 if (d < 0) {

854 Reset(start);	868 Reset(start);

855 return false;	869 return false;

856 }	870 }

(...skipping 32 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
889 return '\t';	903 return '\t';

890 case 'v':	904 case 'v':

891 Advance();	905 Advance();

892 return '\v';	906 return '\v';

893 case 'c': {	907 case 'c': {

894 uint32_t controlLetter = Next();	908 uint32_t controlLetter = Next();

895 uint32_t letter = controlLetter & ~('A' ^ 'a');	909 uint32_t letter = controlLetter & ~('A' ^ 'a');

896 // For compatibility with JSC, inside a character class	910 // For compatibility with JSC, inside a character class

897 // we also accept digits and underscore as control characters.	911 // we also accept digits and underscore as control characters.

898 if ((controlLetter >= '0' && controlLetter <= '9') \|\|	912 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

899 controlLetter == '_' \|\|	913 controlLetter == '_' \|\| (letter >= 'A' && letter <= 'Z')) {

900 (letter >= 'A' && letter <= 'Z')) {

901 Advance(2);	914 Advance(2);

902 // Control letters mapped to ASCII control characters in the range	915 // Control letters mapped to ASCII control characters in the range

903 // 0x00-0x1f.	916 // 0x00-0x1f.

904 return controlLetter & 0x1f;	917 return controlLetter & 0x1f;

905 }	918 }

906 // We match JSC in reading the backslash as a literal	919 // We match JSC in reading the backslash as a literal

907 // character instead of as starting an escape.	920 // character instead of as starting an escape.

908 return '\\';	921 return '\\';

909 }	922 }

910 case '0': case '1': case '2': case '3': case '4': case '5':	923 case '0':

911 case '6': case '7':	924 case '1':

	925 case '2':

	926 case '3':

	927 case '4':

	928 case '5':

	929 case '6':

	930 case '7':

912 // For compatibility, we interpret a decimal escape that isn't	931 // For compatibility, we interpret a decimal escape that isn't

913 // a back reference (and therefore either \0 or not valid according	932 // a back reference (and therefore either \0 or not valid according

914 // to the specification) as a 1..3 digit octal character code.	933 // to the specification) as a 1..3 digit octal character code.

915 return ParseOctalLiteral();	934 return ParseOctalLiteral();

916 case 'x': {	935 case 'x': {

917 Advance();	936 Advance();

918 uint32_t value;	937 uint32_t value;

919 if (ParseHexEscape(2, &value)) {	938 if (ParseHexEscape(2, &value)) {

920 return value;	939 return value;

921 }	940 }

(...skipping 22 matching lines...) Expand all Loading...
944 }	963 }

945 return 0;	964 return 0;

946 }	965 }

947	966

948	967

949 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {	968 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {

950 ASSERT(0 == *char_class);	969 ASSERT(0 == *char_class);

951 uint32_t first = current();	970 uint32_t first = current();

952 if (first == '\\') {	971 if (first == '\\') {

953 switch (Next()) {	972 switch (Next()) {

954 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {	973 case 'w':

	974 case 'W':

	975 case 'd':

	976 case 'D':

	977 case 's':

	978 case 'S': {

955 *char_class = Next();	979 *char_class = Next();

956 Advance(2);	980 Advance(2);

957 return CharacterRange::Singleton(0); // Return dummy value.	981 return CharacterRange::Singleton(0); // Return dummy value.

958 }	982 }

959 case kEndMarker:	983 case kEndMarker:

960 ReportError("\\ at end of pattern");	984 ReportError("\\ at end of pattern");

961 UNREACHABLE();	985 UNREACHABLE();

962 default:	986 default:

963 uint32_t c = ParseClassCharacterEscape();	987 uint32_t c = ParseClassCharacterEscape();

964 return CharacterRange::Singleton(c);	988 return CharacterRange::Singleton(c);

(...skipping 26 matching lines...) Expand all Loading...
991 static const char* kRangeOutOfOrder = "Range out of order in character class";	1015 static const char* kRangeOutOfOrder = "Range out of order in character class";

992	1016

993 ASSERT(current() == '[');	1017 ASSERT(current() == '[');

994 Advance();	1018 Advance();

995 bool is_negated = false;	1019 bool is_negated = false;

996 if (current() == '^') {	1020 if (current() == '^') {

997 is_negated = true;	1021 is_negated = true;

998 Advance();	1022 Advance();

999 }	1023 }

1000 ZoneGrowableArray<CharacterRange>* ranges =	1024 ZoneGrowableArray<CharacterRange>* ranges =

1001 new(Z) ZoneGrowableArray<CharacterRange>(2);	1025 new (Z) ZoneGrowableArray<CharacterRange>(2);

1002 while (has_more() && current() != ']') {	1026 while (has_more() && current() != ']') {

1003 uint16_t char_class = kNoCharClass;	1027 uint16_t char_class = kNoCharClass;

1004 CharacterRange first = ParseClassAtom(&char_class);	1028 CharacterRange first = ParseClassAtom(&char_class);

1005 if (current() == '-') {	1029 if (current() == '-') {

1006 Advance();	1030 Advance();

1007 if (current() == kEndMarker) {	1031 if (current() == kEndMarker) {

1008 // If we reach the end we break out of the loop and let the	1032 // If we reach the end we break out of the loop and let the

1009 // following code report an error.	1033 // following code report an error.

1010 break;	1034 break;

1011 } else if (current() == ']') {	1035 } else if (current() == ']') {

(...skipping 21 matching lines...) Expand all Loading...
1033 }	1057 }

1034 if (!has_more()) {	1058 if (!has_more()) {

1035 ReportError(kUnterminated);	1059 ReportError(kUnterminated);

1036 UNREACHABLE();	1060 UNREACHABLE();

1037 }	1061 }

1038 Advance();	1062 Advance();

1039 if (ranges->length() == 0) {	1063 if (ranges->length() == 0) {

1040 ranges->Add(CharacterRange::Everything());	1064 ranges->Add(CharacterRange::Everything());

1041 is_negated = !is_negated;	1065 is_negated = !is_negated;

1042 }	1066 }

1043 return new(Z) RegExpCharacterClass(ranges, is_negated);	1067 return new (Z) RegExpCharacterClass(ranges, is_negated);

1044 }	1068 }

1045	1069

1046	1070

1047 // ----------------------------------------------------------------------------	1071 // ----------------------------------------------------------------------------

1048 // The Parser interface.	1072 // The Parser interface.

1049	1073

1050 bool RegExpParser::ParseRegExp(const String& input,	1074 bool RegExpParser::ParseRegExp(const String& input,

1051 bool multiline,	1075 bool multiline,

1052 RegExpCompileData* result) {	1076 RegExpCompileData* result) {

1053 ASSERT(result != NULL);	1077 ASSERT(result != NULL);

1054 LongJumpScope jump;	1078 LongJumpScope jump;

1055 RegExpParser parser(input, &result->error, multiline);	1079 RegExpParser parser(input, &result->error, multiline);

1056 if (setjmp(*jump.Set()) == 0) {	1080 if (setjmp(*jump.Set()) == 0) {

1057 RegExpTree* tree = parser.ParsePattern();	1081 RegExpTree* tree = parser.ParsePattern();

1058 ASSERT(tree != NULL);	1082 ASSERT(tree != NULL);

1059 ASSERT(result->error.IsNull());	1083 ASSERT(result->error.IsNull());

1060 result->tree = tree;	1084 result->tree = tree;

1061 intptr_t capture_count = parser.captures_started();	1085 intptr_t capture_count = parser.captures_started();

1062 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;	1086 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

1063 result->contains_anchor = parser.contains_anchor();	1087 result->contains_anchor = parser.contains_anchor();

1064 result->capture_count = capture_count;	1088 result->capture_count = capture_count;

1065 } else {	1089 } else {

1066 ASSERT(!result->error.IsNull());	1090 ASSERT(!result->error.IsNull());

1067 Thread::Current()->clear_sticky_error();	1091 Thread::Current()->clear_sticky_error();

1068	1092

1069 // Throw a FormatException on parsing failures.	1093 // Throw a FormatException on parsing failures.

1070 const String& message = String::Handle(	1094 const String& message =

1071 String::Concat(result->error, input));	1095 String::Handle(String::Concat(result->error, input));

1072 const Array& args = Array::Handle(Array::New(1));	1096 const Array& args = Array::Handle(Array::New(1));

1073 args.SetAt(0, message);	1097 args.SetAt(0, message);

1074	1098

1075 Exceptions::ThrowByType(Exceptions::kFormat, args);	1099 Exceptions::ThrowByType(Exceptions::kFormat, args);

1076 }	1100 }

1077 return !parser.failed();	1101 return !parser.failed();

1078 }	1102 }

1079	1103

1080 } // namespace dart	1104 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/regexp_test.cc » ('j') | no next file with comments »