runtime/vm/regexp_parser.cc - Issue 683433003: Integrate the Irregexp Regular Expression Engine.

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: more comments Created 6 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

2 // for details. All rights reserved. Use of this source code is governed by a	2 // for details. All rights reserved. Use of this source code is governed by a

3 // BSD-style license that can be found in the LICENSE file.	3 // BSD-style license that can be found in the LICENSE file.

4	4

	5 #include "vm/longjump.h"

	6 #include "vm/object_store.h"

5 #include "vm/regexp_parser.h"	7 #include "vm/regexp_parser.h"

6	8

7 // SNIP

8

9 namespace dart {	9 namespace dart {

10	10

11 RegExpBuilder::RegExpBuilder(Zone* zone)	11 #define I isolate()

12 : zone_(zone),	12

	13 // Enables possessive quantifier syntax for testing.

	14 static const bool FLAG_regexp_possessive_quantifier = false;

	15

	16 RegExpBuilder::RegExpBuilder()

	17 : isolate_(Isolate::Current()),

13 pending_empty_(false),	18 pending_empty_(false),

14 characters_(NULL),	19 characters_(NULL),

15 terms_(),	20 terms_(),

	21 text_(),

16 alternatives_()	22 alternatives_()

17 #ifdef DEBUG	23 #ifdef DEBUG

18 , last_added_(ADD_NONE)	24 , last_added_(ADD_NONE)

19 #endif	25 #endif

20 {}	26 {}

21	27

22	28

23 void RegExpBuilder::FlushCharacters() {	29 void RegExpBuilder::FlushCharacters() {

24 pending_empty_ = false;	30 pending_empty_ = false;

25 if (characters_ != NULL) {	31 if (characters_ != NULL) {

26 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());	32 RegExpTree* atom = new(I) RegExpAtom(characters_);

27 characters_ = NULL;	33 characters_ = NULL;

28 text_.Add(atom, zone());	34 text_.Add(atom);

29 LAST(ADD_ATOM);	35 LAST(ADD_ATOM);

30 }	36 }

31 }	37 }

32	38

33	39

34 void RegExpBuilder::FlushText() {	40 void RegExpBuilder::FlushText() {

35 FlushCharacters();	41 FlushCharacters();

36 int num_text = text_.length();	42 intptr_t num_text = text_.length();

37 if (num_text == 0) {	43 if (num_text == 0) {

38 return;	44 return;

39 } else if (num_text == 1) {	45 } else if (num_text == 1) {

40 terms_.Add(text_.last(), zone());	46 terms_.Add(text_.Last());

41 } else {	47 } else {

42 RegExpText* text = new(zone()) RegExpText(zone());	48 RegExpText* text = new(I) RegExpText();

43 for (int i = 0; i < num_text; i++)	49 for (intptr_t i = 0; i < num_text; i++)

44 text_.Get(i)->AppendToText(text, zone());	50 text_[i]->AppendToText(text);

45 terms_.Add(text, zone());	51 terms_.Add(text);

46 }	52 }

47 text_.Clear();	53 text_.Clear();

48 }	54 }

49	55

50	56

51 void RegExpBuilder::AddCharacter(uc16 c) {	57 void RegExpBuilder::AddCharacter(uint16_t c) {

52 pending_empty_ = false;	58 pending_empty_ = false;

53 if (characters_ == NULL) {	59 if (characters_ == NULL) {

54 characters_ = new(zone()) ZoneList<uc16>(4, zone());	60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);

55 }	61 }

56 characters_->Add(c, zone());	62 characters_->Add(c);

57 LAST(ADD_CHAR);	63 LAST(ADD_CHAR);

58 }	64 }

59	65

60	66

61 void RegExpBuilder::AddEmpty() {	67 void RegExpBuilder::AddEmpty() {

62 pending_empty_ = true;	68 pending_empty_ = true;

63 }	69 }

64	70

65	71

66 void RegExpBuilder::AddAtom(RegExpTree* term) {	72 void RegExpBuilder::AddAtom(RegExpTree* term) {

67 if (term->IsEmpty()) {	73 if (term->IsEmpty()) {

68 AddEmpty();	74 AddEmpty();

69 return;	75 return;

70 }	76 }

71 if (term->IsTextElement()) {	77 if (term->IsTextElement()) {

72 FlushCharacters();	78 FlushCharacters();

73 text_.Add(term, zone());	79 text_.Add(term);

74 } else {	80 } else {

75 FlushText();	81 FlushText();

76 terms_.Add(term, zone());	82 terms_.Add(term);

77 }	83 }

78 LAST(ADD_ATOM);	84 LAST(ADD_ATOM);

79 }	85 }

80	86

81	87

82 void RegExpBuilder::AddAssertion(RegExpTree* assert) {	88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

83 FlushText();	89 FlushText();

84 terms_.Add(assert, zone());	90 terms_.Add(assert);

85 LAST(ADD_ASSERT);	91 LAST(ADD_ASSERT);

86 }	92 }

87	93

88	94

89 void RegExpBuilder::NewAlternative() {	95 void RegExpBuilder::NewAlternative() {

90 FlushTerms();	96 FlushTerms();

91 }	97 }

92	98

93	99

94 void RegExpBuilder::FlushTerms() {	100 void RegExpBuilder::FlushTerms() {

95 FlushText();	101 FlushText();

96 int num_terms = terms_.length();	102 intptr_t num_terms = terms_.length();

97 RegExpTree* alternative;	103 RegExpTree* alternative;

98 if (num_terms == 0) {	104 if (num_terms == 0) {

99 alternative = RegExpEmpty::GetInstance();	105 alternative = RegExpEmpty::GetInstance();

100 } else if (num_terms == 1) {	106 } else if (num_terms == 1) {

101 alternative = terms_.last();	107 alternative = terms_.Last();

102 } else {	108 } else {

103 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));	109 ZoneGrowableArray<RegExpTree> terms =

	110 new(I) ZoneGrowableArray<RegExpTree*>();

	111 for (intptr_t i = 0; i < terms_.length(); i++) {

	112 terms->Add(terms_[i]);

	113 }

	114 alternative = new(I) RegExpAlternative(terms);

104 }	115 }

105 alternatives_.Add(alternative, zone());	116 alternatives_.Add(alternative);

106 terms_.Clear();	117 terms_.Clear();

107 LAST(ADD_NONE);	118 LAST(ADD_NONE);

108 }	119 }

109	120

110	121

111 RegExpTree* RegExpBuilder::ToRegExp() {	122 RegExpTree* RegExpBuilder::ToRegExp() {

112 FlushTerms();	123 FlushTerms();

113 int num_alternatives = alternatives_.length();	124 intptr_t num_alternatives = alternatives_.length();

114 if (num_alternatives == 0) {	125 if (num_alternatives == 0) {

115 return RegExpEmpty::GetInstance();	126 return RegExpEmpty::GetInstance();

116 }	127 }

117 if (num_alternatives == 1) {	128 if (num_alternatives == 1) {

118 return alternatives_.last();	129 return alternatives_.Last();

119 }	130 }

120 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));	131 ZoneGrowableArray<RegExpTree> alternatives =

	132 new(I) ZoneGrowableArray<RegExpTree*>();

	133 for (intptr_t i = 0; i < alternatives_.length(); i++) {

	134 alternatives->Add(alternatives_[i]);

	135 }

	136 return new(I) RegExpDisjunction(alternatives);

121 }	137 }

122	138

123	139

124 void RegExpBuilder::AddQuantifierToAtom(	140 void RegExpBuilder::AddQuantifierToAtom(

125 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {	141 intptr_t min,

	142 intptr_t max,

	143 RegExpQuantifier::QuantifierType quantifier_type) {

126 if (pending_empty_) {	144 if (pending_empty_) {

127 pending_empty_ = false;	145 pending_empty_ = false;

128 return;	146 return;

129 }	147 }

130 RegExpTree* atom;	148 RegExpTree* atom;

131 if (characters_ != NULL) {	149 if (characters_ != NULL) {

132 DCHECK(last_added_ == ADD_CHAR);	150 DEBUG_ASSERT(last_added_ == ADD_CHAR);

133 // Last atom was character.	151 // Last atom was character.

134 Vector<const uc16> char_vector = characters_->ToConstVector();	152

135 int num_chars = char_vector.length();	153 ZoneGrowableArray<uint16_t> *char_vector =

	154 new(I) ZoneGrowableArray<uint16_t>();

	155 char_vector->AddArray(*characters_);

	156 intptr_t num_chars = char_vector->length();

136 if (num_chars > 1) {	157 if (num_chars > 1) {

137 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);	158 ZoneGrowableArray<uint16_t> *prefix =

138 text_.Add(new(zone()) RegExpAtom(prefix), zone());	159 new(I) ZoneGrowableArray<uint16_t>();

139 char_vector = char_vector.SubVector(num_chars - 1, num_chars);	160 for (intptr_t i = 0; i < num_chars - 1; i++) {

	161 prefix->Add(char_vector->At(i));

	162 }

	163 text_.Add(new(I) RegExpAtom(prefix));

	164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();

	165 tail->Add(char_vector->At(num_chars - 1));

	166 char_vector = tail;

140 }	167 }

141 characters_ = NULL;	168 characters_ = NULL;

142 atom = new(zone()) RegExpAtom(char_vector);	169 atom = new(I) RegExpAtom(char_vector);

143 FlushText();	170 FlushText();

144 } else if (text_.length() > 0) {	171 } else if (text_.length() > 0) {

145 DCHECK(last_added_ == ADD_ATOM);	172 DEBUG_ASSERT(last_added_ == ADD_ATOM);

146 atom = text_.RemoveLast();	173 atom = text_.RemoveLast();

147 FlushText();	174 FlushText();

148 } else if (terms_.length() > 0) {	175 } else if (terms_.length() > 0) {

149 DCHECK(last_added_ == ADD_ATOM);	176 DEBUG_ASSERT(last_added_ == ADD_ATOM);

150 atom = terms_.RemoveLast();	177 atom = terms_.RemoveLast();

151 if (atom->max_match() == 0) {	178 if (atom->max_match() == 0) {

152 // Guaranteed to only match an empty string.	179 // Guaranteed to only match an empty string.

153 LAST(ADD_TERM);	180 LAST(ADD_TERM);

154 if (min == 0) {	181 if (min == 0) {

155 return;	182 return;

156 }	183 }

157 terms_.Add(atom, zone());	184 terms_.Add(atom);

158 return;	185 return;

159 }	186 }

160 } else {	187 } else {

161 // Only call immediately after adding an atom or character!	188 // Only call immediately after adding an atom or character!

162 UNREACHABLE();	189 UNREACHABLE();

163 return;	190 return;

164 }	191 }

165 terms_.Add(	192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));

166 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());

167 LAST(ADD_TERM);	193 LAST(ADD_TERM);

168 }	194 }

169	195

170 // SNIP	196 // ----------------------------------------------------------------------------

	197 // Implementation of Parser

171	198

172 // ----------------------------------------------------------------------------	199 RegExpParser::RegExpParser(const String& in,

173 // Regular expressions	200 String* error,

174	201 bool multiline)

175 RegExpParser::RegExpParser(FlatStringReader* in,	202 : isolate_(Isolate::Current()),

176 Handle<String>* error,

177 bool multiline,

178 Zone* zone)

179 : isolate_(zone->isolate()),

180 zone_(zone),

181 error_(error),	203 error_(error),

182 captures_(NULL),	204 captures_(NULL),

183 in_(in),	205 in_(in),

184 current_(kEndMarker),	206 current_(kEndMarker),

185 next_pos_(0),	207 next_pos_(0),

186 capture_count_(0),	208 capture_count_(0),

187 has_more_(true),	209 has_more_(true),

188 multiline_(multiline),	210 multiline_(multiline),

189 simple_(false),	211 simple_(false),

190 contains_anchor_(false),	212 contains_anchor_(false),

191 is_scanned_for_captures_(false),	213 is_scanned_for_captures_(false),

192 failed_(false) {	214 failed_(false) {

193 Advance();	215 Advance();

194 }	216 }

195	217

196	218

197 uc32 RegExpParser::Next() {	219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {

	220 Isolate* isolate = parsed_function->isolate();

	221 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());

	222

	223 const String& pattern = String::Handle(regexp.pattern());

	224 const bool multiline = regexp.is_multi_line();

	225

	226 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();

	227 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {

	228 // Parsing failures are handled in the JSRegExp factory constructor.

	229 UNREACHABLE();

	230 }

	231

	232 regexp.set_num_bracket_expressions(compile_data->capture_count);

	233 if (compile_data->simple) {

	234 regexp.set_is_simple();

	235 } else {

	236 regexp.set_is_complex();

	237 }

	238

	239 parsed_function->SetRegExpCompileData(compile_data);

	240

	241 return true;

	242 }

	243

	244

	245 uint32_t RegExpParser::Next() {

198 if (has_next()) {	246 if (has_next()) {

199 return in()->Get(next_pos_);	247 return in().CharAt(next_pos_);

200 } else {	248 } else {

201 return kEndMarker;	249 return kEndMarker;

202 }	250 }

203 }	251 }

204	252

205	253

206 void RegExpParser::Advance() {	254 void RegExpParser::Advance() {

207 if (next_pos_ < in()->length()) {	255 if (next_pos_ < in().Length()) {

208 StackLimitCheck check(isolate());	256 current_ = in().CharAt(next_pos_);

209 if (check.HasOverflowed()) {	257 next_pos_++;

210 ReportError(CStrVector(Isolate::kStackOverflowMessage));

211 } else if (zone()->excess_allocation()) {

212 ReportError(CStrVector("Regular expression too large"));

213 } else {

214 current_ = in()->Get(next_pos_);

215 next_pos_++;

216 }

217 } else {	258 } else {

218 current_ = kEndMarker;	259 current_ = kEndMarker;

219 has_more_ = false;	260 has_more_ = false;

220 }	261 }

221 }	262 }

222	263

223	264

224 void RegExpParser::Reset(int pos) {	265 void RegExpParser::Reset(intptr_t pos) {

225 next_pos_ = pos;	266 next_pos_ = pos;

226 has_more_ = (pos < in()->length());	267 has_more_ = (pos < in().Length());

227 Advance();	268 Advance();

228 }	269 }

229	270

230	271

231 void RegExpParser::Advance(int dist) {	272 void RegExpParser::Advance(intptr_t dist) {

232 next_pos_ += dist - 1;	273 next_pos_ += dist - 1;

233 Advance();	274 Advance();

234 }	275 }

235	276

236	277

237 bool RegExpParser::simple() {	278 bool RegExpParser::simple() {

238 return simple_;	279 return simple_;

239 }	280 }

240	281

241	282

242 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {	283 void RegExpParser::ReportError(const char* message) {

243 failed_ = true;	284 failed_ = true;

244 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();	285 *error_ = String::New(message);

245 // Zip to the end to make sure the no more input is read.	286 // Zip to the end to make sure the no more input is read.

246 current_ = kEndMarker;	287 current_ = kEndMarker;

247 next_pos_ = in()->length();	288 next_pos_ = in().Length();

248 return NULL;	289

	290 const Error& error = Error::Handle(LanguageError::New(*error_));

	291 Report::LongJump(error);

	292 UNREACHABLE();

249 }	293 }

250	294

251	295

252 // Pattern ::	296 // Pattern ::

253 // Disjunction	297 // Disjunction

254 RegExpTree* RegExpParser::ParsePattern() {	298 RegExpTree* RegExpParser::ParsePattern() {

255 RegExpTree* result = ParseDisjunction(CHECK_FAILED);	299 RegExpTree* result = ParseDisjunction();

256 DCHECK(!has_more());	300 ASSERT(!has_more());

257 // If the result of parsing is a literal string atom, and it has the	301 // If the result of parsing is a literal string atom, and it has the

258 // same length as the input, then the atom is identical to the input.	302 // same length as the input, then the atom is identical to the input.

259 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {	303 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {

260 simple_ = true;	304 simple_ = true;

261 }	305 }

262 return result;	306 return result;

263 }	307 }

264	308

265	309

266 // Disjunction ::	310 // Disjunction ::

267 // Alternative	311 // Alternative

268 // Alternative \| Disjunction	312 // Alternative \| Disjunction

269 // Alternative ::	313 // Alternative ::

270 // [empty]	314 // [empty]

271 // Term Alternative	315 // Term Alternative

272 // Term ::	316 // Term ::

273 // Assertion	317 // Assertion

274 // Atom	318 // Atom

275 // Atom Quantifier	319 // Atom Quantifier

276 RegExpTree* RegExpParser::ParseDisjunction() {	320 RegExpTree* RegExpParser::ParseDisjunction() {

277 // Used to store current state while parsing subexpressions.	321 // Used to store current state while parsing subexpressions.

278 RegExpParserState initial_state(NULL, INITIAL, 0, zone());	322 RegExpParserState initial_state(NULL, INITIAL, 0, I);

279 RegExpParserState* stored_state = &initial_state;	323 RegExpParserState* stored_state = &initial_state;

280 // Cache the builder in a local variable for quick access.	324 // Cache the builder in a local variable for quick access.

281 RegExpBuilder* builder = initial_state.builder();	325 RegExpBuilder* builder = initial_state.builder();

282 while (true) {	326 while (true) {

283 switch (current()) {	327 switch (current()) {

284 case kEndMarker:	328 case kEndMarker:

285 if (stored_state->IsSubexpression()) {	329 if (stored_state->IsSubexpression()) {

286 // Inside a parenthesized group when hitting end of input.	330 // Inside a parenthesized group when hitting end of input.

287 ReportError(CStrVector("Unterminated group") CHECK_FAILED);	331 ReportError("Unterminated group");

	332 UNREACHABLE();

288 }	333 }

289 DCHECK_EQ(INITIAL, stored_state->group_type());	334 ASSERT(INITIAL == stored_state->group_type());

290 // Parsing completed successfully.	335 // Parsing completed successfully.

291 return builder->ToRegExp();	336 return builder->ToRegExp();

292 case ')': {	337 case ')': {

293 if (!stored_state->IsSubexpression()) {	338 if (!stored_state->IsSubexpression()) {

294 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);	339 ReportError("Unmatched ')'");

	340 UNREACHABLE();

295 }	341 }

296 DCHECK_NE(INITIAL, stored_state->group_type());	342 ASSERT(INITIAL != stored_state->group_type());

297	343

298 Advance();	344 Advance();

299 // End disjunction parsing and convert builder content to new single	345 // End disjunction parsing and convert builder content to new single

300 // regexp atom.	346 // regexp atom.

301 RegExpTree* body = builder->ToRegExp();	347 RegExpTree* body = builder->ToRegExp();

302	348

303 int end_capture_index = captures_started();	349 intptr_t end_capture_index = captures_started();

304	350

305 int capture_index = stored_state->capture_index();	351 intptr_t capture_index = stored_state->capture_index();

306 SubexpressionType group_type = stored_state->group_type();	352 SubexpressionType group_type = stored_state->group_type();

307	353

308 // Restore previous state.	354 // Restore previous state.

309 stored_state = stored_state->previous_state();	355 stored_state = stored_state->previous_state();

310 builder = stored_state->builder();	356 builder = stored_state->builder();

311	357

312 // Build result of subexpression.	358 // Build result of subexpression.

313 if (group_type == CAPTURE) {	359 if (group_type == CAPTURE) {

314 RegExpCapture* capture = new(zone()) RegExpCapture(body, capture_index);	360 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);

315 captures_->at(capture_index - 1) = capture;	361 (*captures_)[capture_index - 1] = capture;

316 body = capture;	362 body = capture;

317 } else if (group_type != GROUPING) {	363 } else if (group_type != GROUPING) {

318 DCHECK(group_type == POSITIVE_LOOKAHEAD \|\|	364 ASSERT(group_type == POSITIVE_LOOKAHEAD \|\|

319 group_type == NEGATIVE_LOOKAHEAD);	365 group_type == NEGATIVE_LOOKAHEAD);

320 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);	366 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);

321 body = new(zone()) RegExpLookahead(body,	367 body = new(I) RegExpLookahead(body,

322 is_positive,	368 is_positive,

323 end_capture_index - capture_index,	369 end_capture_index - capture_index,

324 capture_index);	370 capture_index);

325 }	371 }

326 builder->AddAtom(body);	372 builder->AddAtom(body);

327 // For compatability with JSC and ES3, we allow quantifiers after	373 // For compatibility with JSC and ES3, we allow quantifiers after

328 // lookaheads, and break in all cases.	374 // lookaheads, and break in all cases.

329 break;	375 break;

330 }	376 }

331 case '\|': {	377 case '\|': {

332 Advance();	378 Advance();

333 builder->NewAlternative();	379 builder->NewAlternative();

334 continue;	380 continue;

335 }	381 }

336 case '*':	382 case '*':

337 case '+':	383 case '+':

338 case '?':	384 case '?':

339 return ReportError(CStrVector("Nothing to repeat"));	385 ReportError("Nothing to repeat");

	386 UNREACHABLE();

340 case '^': {	387 case '^': {

341 Advance();	388 Advance();

342 if (multiline_) {	389 if (multiline_) {

343 builder->AddAssertion(	390 builder->AddAssertion(

344 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));	391 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));

345 } else {	392 } else {

346 builder->AddAssertion(	393 builder->AddAssertion(

347 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));	394 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

348 set_contains_anchor();	395 set_contains_anchor();

349 }	396 }

350 continue;	397 continue;

351 }	398 }

352 case '$': {	399 case '$': {

353 Advance();	400 Advance();

354 RegExpAssertion::AssertionType assertion_type =	401 RegExpAssertion::AssertionType assertion_type =

355 multiline_ ? RegExpAssertion::END_OF_LINE :	402 multiline_ ? RegExpAssertion::END_OF_LINE :

356 RegExpAssertion::END_OF_INPUT;	403 RegExpAssertion::END_OF_INPUT;

357 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));	404 builder->AddAssertion(new RegExpAssertion(assertion_type));

358 continue;	405 continue;

359 }	406 }

360 case '.': {	407 case '.': {

361 Advance();	408 Advance();

362 // everything except \x0a, \x0d, \u2028 and \u2029	409 // everything except \x0a, \x0d, \u2028 and \u2029

363 ZoneList<CharacterRange>* ranges =	410 ZoneGrowableArray<CharacterRange>* ranges =

364 new(zone()) ZoneList<CharacterRange>(2, zone());	411 new ZoneGrowableArray<CharacterRange>(2);

365 CharacterRange::AddClassEscape('.', ranges, zone());	412 CharacterRange::AddClassEscape('.', ranges);

366 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);	413 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

367 builder->AddAtom(atom);	414 builder->AddAtom(atom);

368 break;	415 break;

369 }	416 }

370 case '(': {	417 case '(': {

371 SubexpressionType subexpr_type = CAPTURE;	418 SubexpressionType subexpr_type = CAPTURE;

372 Advance();	419 Advance();

373 if (current() == '?') {	420 if (current() == '?') {

374 switch (Next()) {	421 switch (Next()) {

375 case ':':	422 case ':':

376 subexpr_type = GROUPING;	423 subexpr_type = GROUPING;

377 break;	424 break;

378 case '=':	425 case '=':

379 subexpr_type = POSITIVE_LOOKAHEAD;	426 subexpr_type = POSITIVE_LOOKAHEAD;

380 break;	427 break;

381 case '!':	428 case '!':

382 subexpr_type = NEGATIVE_LOOKAHEAD;	429 subexpr_type = NEGATIVE_LOOKAHEAD;

383 break;	430 break;

384 default:	431 default:

385 ReportError(CStrVector("Invalid group") CHECK_FAILED);	432 ReportError("Invalid group");

386 break;	433 UNREACHABLE();

387 }	434 }

388 Advance(2);	435 Advance(2);

389 } else {	436 } else {

390 if (captures_ == NULL) {	437 if (captures_ == NULL) {

391 captures_ = new(zone()) ZoneList<RegExpCapture*>(2, zone());	438 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);

392 }	439 }

393 if (captures_started() >= kMaxCaptures) {	440 if (captures_started() >= kMaxCaptures) {

394 ReportError(CStrVector("Too many captures") CHECK_FAILED);	441 ReportError("Too many captures");

	442 UNREACHABLE();

395 }	443 }

396 captures_->Add(NULL, zone());	444 captures_->Add(NULL);

397 }	445 }

398 // Store current state and begin new disjunction parsing.	446 // Store current state and begin new disjunction parsing.

399 stored_state = new(zone()) RegExpParserState(stored_state, subexpr_type,	447 stored_state = new RegExpParserState(stored_state, subexpr_type,

400 captures_started(), zone());	448 captures_started(), I);

401 builder = stored_state->builder();	449 builder = stored_state->builder();

402 continue;	450 continue;

403 }	451 }

404 case '[': {	452 case '[': {

405 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);	453 RegExpTree* atom = ParseCharacterClass();

406 builder->AddAtom(atom);	454 builder->AddAtom(atom);

407 break;	455 break;

408 }	456 }

409 // Atom ::	457 // Atom ::

410 // \ AtomEscape	458 // \ AtomEscape

411 case '\\':	459 case '\\':

412 switch (Next()) {	460 switch (Next()) {

413 case kEndMarker:	461 case kEndMarker:

414 return ReportError(CStrVector("\\ at end of pattern"));	462 ReportError("\\ at end of pattern");

	463 UNREACHABLE();

415 case 'b':	464 case 'b':

416 Advance(2);	465 Advance(2);

417 builder->AddAssertion(	466 builder->AddAssertion(

418 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));	467 new RegExpAssertion(RegExpAssertion::BOUNDARY));

419 continue;	468 continue;

420 case 'B':	469 case 'B':

421 Advance(2);	470 Advance(2);

422 builder->AddAssertion(	471 builder->AddAssertion(

423 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));	472 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

424 continue;	473 continue;

425 // AtomEscape ::	474 // AtomEscape ::

426 // CharacterClassEscape	475 // CharacterClassEscape

427 //	476 //

428 // CharacterClassEscape :: one of	477 // CharacterClassEscape :: one of

429 // d D s S w W	478 // d D s S w W

430 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {	479 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

431 uc32 c = Next();	480 uint32_t c = Next();

432 Advance(2);	481 Advance(2);

433 ZoneList<CharacterRange>* ranges =	482 ZoneGrowableArray<CharacterRange>* ranges =

434 new(zone()) ZoneList<CharacterRange>(2, zone());	483 new ZoneGrowableArray<CharacterRange>(2);

435 CharacterRange::AddClassEscape(c, ranges, zone());	484 CharacterRange::AddClassEscape(c, ranges);

436 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);	485 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

437 builder->AddAtom(atom);	486 builder->AddAtom(atom);

438 break;	487 break;

439 }	488 }

440 case '1': case '2': case '3': case '4': case '5': case '6':	489 case '1': case '2': case '3': case '4': case '5': case '6':

441 case '7': case '8': case '9': {	490 case '7': case '8': case '9': {

442 int index = 0;	491 intptr_t index = 0;

443 if (ParseBackReferenceIndex(&index)) {	492 if (ParseBackReferenceIndex(&index)) {

444 RegExpCapture* capture = NULL;	493 RegExpCapture* capture = NULL;

445 if (captures_ != NULL && index <= captures_->length()) {	494 if (captures_ != NULL && index <= captures_->length()) {

446 capture = captures_->at(index - 1);	495 capture = captures_->At(index - 1);

447 }	496 }

448 if (capture == NULL) {	497 if (capture == NULL) {

449 builder->AddEmpty();	498 builder->AddEmpty();

450 break;	499 break;

451 }	500 }

452 RegExpTree* atom = new(zone()) RegExpBackReference(capture);	501 RegExpTree* atom = new RegExpBackReference(capture);

453 builder->AddAtom(atom);	502 builder->AddAtom(atom);

454 break;	503 break;

455 }	504 }

456 uc32 first_digit = Next();	505 uint32_t first_digit = Next();

457 if (first_digit == '8' \|\| first_digit == '9') {	506 if (first_digit == '8' \|\| first_digit == '9') {

458 // Treat as identity escape	507 // Treat as identity escape

459 builder->AddCharacter(first_digit);	508 builder->AddCharacter(first_digit);

460 Advance(2);	509 Advance(2);

461 break;	510 break;

462 }	511 }

463 }	512 }

464 // FALLTHROUGH	513 // FALLTHROUGH

465 case '0': {	514 case '0': {

466 Advance();	515 Advance();

467 uc32 octal = ParseOctalLiteral();	516 uint32_t octal = ParseOctalLiteral();

468 builder->AddCharacter(octal);	517 builder->AddCharacter(octal);

469 break;	518 break;

470 }	519 }

471 // ControlEscape :: one of	520 // ControlEscape :: one of

472 // f n r t v	521 // f n r t v

473 case 'f':	522 case 'f':

474 Advance(2);	523 Advance(2);

475 builder->AddCharacter('\f');	524 builder->AddCharacter('\f');

476 break;	525 break;

477 case 'n':	526 case 'n':

478 Advance(2);	527 Advance(2);

479 builder->AddCharacter('\n');	528 builder->AddCharacter('\n');

480 break;	529 break;

481 case 'r':	530 case 'r':

482 Advance(2);	531 Advance(2);

483 builder->AddCharacter('\r');	532 builder->AddCharacter('\r');

484 break;	533 break;

485 case 't':	534 case 't':

486 Advance(2);	535 Advance(2);

487 builder->AddCharacter('\t');	536 builder->AddCharacter('\t');

488 break;	537 break;

489 case 'v':	538 case 'v':

490 Advance(2);	539 Advance(2);

491 builder->AddCharacter('\v');	540 builder->AddCharacter('\v');

492 break;	541 break;

493 case 'c': {	542 case 'c': {

494 Advance();	543 Advance();

495 uc32 controlLetter = Next();	544 uint32_t controlLetter = Next();

496 // Special case if it is an ASCII letter.	545 // Special case if it is an ASCII letter.

497 // Convert lower case letters to uppercase.	546 // Convert lower case letters to uppercase.

498 uc32 letter = controlLetter & ~('a' ^ 'A');	547 uint32_t letter = controlLetter & ~('a' ^ 'A');

499 if (letter < 'A' \|\| 'Z' < letter) {	548 if (letter < 'A' \|\| 'Z' < letter) {

500 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.	549 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

501 // This is outside the specification. We match JSC in	550 // This is outside the specification. We match JSC in

502 // reading the backslash as a literal character instead	551 // reading the backslash as a literal character instead

503 // of as starting an escape.	552 // of as starting an escape.

504 builder->AddCharacter('\\');	553 builder->AddCharacter('\\');

505 } else {	554 } else {

506 Advance(2);	555 Advance(2);

507 builder->AddCharacter(controlLetter & 0x1f);	556 builder->AddCharacter(controlLetter & 0x1f);

508 }	557 }

509 break;	558 break;

510 }	559 }

511 case 'x': {	560 case 'x': {

512 Advance(2);	561 Advance(2);

513 uc32 value;	562 uint32_t value;

514 if (ParseHexEscape(2, &value)) {	563 if (ParseHexEscape(2, &value)) {

515 builder->AddCharacter(value);	564 builder->AddCharacter(value);

516 } else {	565 } else {

517 builder->AddCharacter('x');	566 builder->AddCharacter('x');

518 }	567 }

519 break;	568 break;

520 }	569 }

521 case 'u': {	570 case 'u': {

522 Advance(2);	571 Advance(2);

523 uc32 value;	572 uint32_t value;

524 if (ParseHexEscape(4, &value)) {	573 if (ParseHexEscape(4, &value)) {

525 builder->AddCharacter(value);	574 builder->AddCharacter(value);

526 } else {	575 } else {

527 builder->AddCharacter('u');	576 builder->AddCharacter('u');

528 }	577 }

529 break;	578 break;

530 }	579 }

531 default:	580 default:

532 // Identity escape.	581 // Identity escape.

533 builder->AddCharacter(Next());	582 builder->AddCharacter(Next());

534 Advance(2);	583 Advance(2);

535 break;	584 break;

536 }	585 }

537 break;	586 break;

538 case '{': {	587 case '{': {

539 int dummy;	588 intptr_t dummy;

540 if (ParseIntervalQuantifier(&dummy, &dummy)) {	589 if (ParseIntervalQuantifier(&dummy, &dummy)) {

541 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);	590 ReportError("Nothing to repeat");

	591 UNREACHABLE();

542 }	592 }

543 // fallthrough	593 // fallthrough

544 }	594 }

545 default:	595 default:

546 builder->AddCharacter(current());	596 builder->AddCharacter(current());

547 Advance();	597 Advance();

548 break;	598 break;

549 } // end switch(current())	599 } // end switch(current())

550	600

551 int min;	601 intptr_t min;

552 int max;	602 intptr_t max;

553 switch (current()) {	603 switch (current()) {

554 // QuantifierPrefix ::	604 // QuantifierPrefix ::

555 // *	605 // *

556 // +	606 // +

557 // ?	607 // ?

558 // {	608 // {

559 case '*':	609 case '*':

560 min = 0;	610 min = 0;

561 max = RegExpTree::kInfinity;	611 max = RegExpTree::kInfinity;

562 Advance();	612 Advance();

563 break;	613 break;

564 case '+':	614 case '+':

565 min = 1;	615 min = 1;

566 max = RegExpTree::kInfinity;	616 max = RegExpTree::kInfinity;

567 Advance();	617 Advance();

568 break;	618 break;

569 case '?':	619 case '?':

570 min = 0;	620 min = 0;

571 max = 1;	621 max = 1;

572 Advance();	622 Advance();

573 break;	623 break;

574 case '{':	624 case '{':

575 if (ParseIntervalQuantifier(&min, &max)) {	625 if (ParseIntervalQuantifier(&min, &max)) {

576 if (max < min) {	626 if (max < min) {

577 ReportError(CStrVector("numbers out of order in {} quantifier.")	627 ReportError("numbers out of order in {} quantifier.");

578 CHECK_FAILED);	628 UNREACHABLE();

579 }	629 }

580 break;	630 break;

581 } else {	631 } else {

582 continue;	632 continue;

583 }	633 }

584 default:	634 default:

585 continue;	635 continue;

586 }	636 }

587 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;	637 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

588 if (current() == '?') {	638 if (current() == '?') {

589 quantifier_type = RegExpQuantifier::NON_GREEDY;	639 quantifier_type = RegExpQuantifier::NON_GREEDY;

590 Advance();	640 Advance();

591 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {	641 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

592 // FLAG_regexp_possessive_quantifier is a debug-only flag.	642 // FLAG_regexp_possessive_quantifier is a debug-only flag.

593 quantifier_type = RegExpQuantifier::POSSESSIVE;	643 quantifier_type = RegExpQuantifier::POSSESSIVE;

594 Advance();	644 Advance();

595 }	645 }

596 builder->AddQuantifierToAtom(min, max, quantifier_type);	646 builder->AddQuantifierToAtom(min, max, quantifier_type);

597 }	647 }

598 }	648 }

599	649

600	650

601 #ifdef DEBUG	651 #ifdef DEBUG

602 // Currently only used in an DCHECK.	652 // Currently only used in an ASSERT.

603 static bool IsSpecialClassEscape(uc32 c) {	653 static bool IsSpecialClassEscape(uint32_t c) {

604 switch (c) {	654 switch (c) {

605 case 'd': case 'D':	655 case 'd': case 'D':

606 case 's': case 'S':	656 case 's': case 'S':

607 case 'w': case 'W':	657 case 'w': case 'W':

608 return true;	658 return true;

609 default:	659 default:

610 return false;	660 return false;

611 }	661 }

612 }	662 }

613 #endif	663 #endif

614	664

615	665

616 // In order to know whether an escape is a backreference or not we have to scan	666 // In order to know whether an escape is a backreference or not we have to scan

617 // the entire regexp and find the number of capturing parentheses. However we	667 // the entire regexp and find the number of capturing parentheses. However we

618 // don't want to scan the regexp twice unless it is necessary. This mini-parser	668 // don't want to scan the regexp twice unless it is necessary. This mini-parser

619 // is called when needed. It can see the difference between capturing and	669 // is called when needed. It can see the difference between capturing and

620 // noncapturing parentheses and can skip character classes and backslash-escaped	670 // noncapturing parentheses and can skip character classes and backslash-escaped

621 // characters.	671 // characters.

622 void RegExpParser::ScanForCaptures() {	672 void RegExpParser::ScanForCaptures() {

623 // Start with captures started previous to current position	673 // Start with captures started previous to current position

624 int capture_count = captures_started();	674 intptr_t capture_count = captures_started();

625 // Add count of captures after this position.	675 // Add count of captures after this position.

626 int n;	676 intptr_t n;

627 while ((n = current()) != kEndMarker) {	677 while ((n = current()) != kEndMarker) {

628 Advance();	678 Advance();

629 switch (n) {	679 switch (n) {

630 case '\\':	680 case '\\':

631 Advance();	681 Advance();

632 break;	682 break;

633 case '[': {	683 case '[': {

634 int c;	684 intptr_t c;

635 while ((c = current()) != kEndMarker) {	685 while ((c = current()) != kEndMarker) {

636 Advance();	686 Advance();

637 if (c == '\\') {	687 if (c == '\\') {

638 Advance();	688 Advance();

639 } else {	689 } else {

640 if (c == ']') break;	690 if (c == ']') break;

641 }	691 }

642 }	692 }

643 break;	693 break;

644 }	694 }

645 case '(':	695 case '(':

646 if (current() != '?') capture_count++;	696 if (current() != '?') capture_count++;

647 break;	697 break;

648 }	698 }

649 }	699 }

650 capture_count_ = capture_count;	700 capture_count_ = capture_count;

651 is_scanned_for_captures_ = true;	701 is_scanned_for_captures_ = true;

652 }	702 }

653	703

654	704

655 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {	705 static inline bool IsDecimalDigit(int32_t c) {

656 DCHECK_EQ('\\', current());	706 return '0' <= c && c <= '9';

657 DCHECK('1' <= Next() && Next() <= '9');	707 }

	708

	709

	710 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {

	711 ASSERT('\\' == current());

	712 ASSERT('1' <= Next() && Next() <= '9');

658 // Try to parse a decimal literal that is no greater than the total number	713 // Try to parse a decimal literal that is no greater than the total number

659 // of left capturing parentheses in the input.	714 // of left capturing parentheses in the input.

660 int start = position();	715 intptr_t start = position();

661 int value = Next() - '0';	716 intptr_t value = Next() - '0';

662 Advance(2);	717 Advance(2);

663 while (true) {	718 while (true) {

664 uc32 c = current();	719 uint32_t c = current();

665 if (IsDecimalDigit(c)) {	720 if (IsDecimalDigit(c)) {

666 value = 10 * value + (c - '0');	721 value = 10 * value + (c - '0');

667 if (value > kMaxCaptures) {	722 if (value > kMaxCaptures) {

668 Reset(start);	723 Reset(start);

669 return false;	724 return false;

670 }	725 }

671 Advance();	726 Advance();

672 } else {	727 } else {

673 break;	728 break;

674 }	729 }

675 }	730 }

676 if (value > captures_started()) {	731 if (value > captures_started()) {

677 if (!is_scanned_for_captures_) {	732 if (!is_scanned_for_captures_) {

678 int saved_position = position();	733 intptr_t saved_position = position();

679 ScanForCaptures();	734 ScanForCaptures();

680 Reset(saved_position);	735 Reset(saved_position);

681 }	736 }

682 if (value > capture_count_) {	737 if (value > capture_count_) {

683 Reset(start);	738 Reset(start);

684 return false;	739 return false;

685 }	740 }

686 }	741 }

687 *index_out = value;	742 *index_out = value;

688 return true;	743 return true;

689 }	744 }

690	745

691	746

692 // QuantifierPrefix ::	747 // QuantifierPrefix ::

693 // { DecimalDigits }	748 // { DecimalDigits }

694 // { DecimalDigits , }	749 // { DecimalDigits , }

695 // { DecimalDigits , DecimalDigits }	750 // { DecimalDigits , DecimalDigits }

696 //	751 //

697 // Returns true if parsing succeeds, and set the min_out and max_out	752 // Returns true if parsing succeeds, and set the min_out and max_out

698 // values. Values are truncated to RegExpTree::kInfinity if they overflow.	753 // values. Values are truncated to RegExpTree::kInfinity if they overflow.

699 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {	754 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,

700 DCHECK_EQ(current(), '{');	755 intptr_t* max_out) {

701 int start = position();	756 ASSERT(current() == '{');

	757 intptr_t start = position();

702 Advance();	758 Advance();

703 int min = 0;	759 intptr_t min = 0;

704 if (!IsDecimalDigit(current())) {	760 if (!IsDecimalDigit(current())) {

705 Reset(start);	761 Reset(start);

706 return false;	762 return false;

707 }	763 }

708 while (IsDecimalDigit(current())) {	764 while (IsDecimalDigit(current())) {

709 int next = current() - '0';	765 intptr_t next = current() - '0';

710 if (min > (RegExpTree::kInfinity - next) / 10) {	766 if (min > (RegExpTree::kInfinity - next) / 10) {

711 // Overflow. Skip past remaining decimal digits and return -1.	767 // Overflow. Skip past remaining decimal digits and return -1.

712 do {	768 do {

713 Advance();	769 Advance();

714 } while (IsDecimalDigit(current()));	770 } while (IsDecimalDigit(current()));

715 min = RegExpTree::kInfinity;	771 min = RegExpTree::kInfinity;

716 break;	772 break;

717 }	773 }

718 min = 10 * min + next;	774 min = 10 * min + next;

719 Advance();	775 Advance();

720 }	776 }

721 int max = 0;	777 intptr_t max = 0;

722 if (current() == '}') {	778 if (current() == '}') {

723 max = min;	779 max = min;

724 Advance();	780 Advance();

725 } else if (current() == ',') {	781 } else if (current() == ',') {

726 Advance();	782 Advance();

727 if (current() == '}') {	783 if (current() == '}') {

728 max = RegExpTree::kInfinity;	784 max = RegExpTree::kInfinity;

729 Advance();	785 Advance();

730 } else {	786 } else {

731 while (IsDecimalDigit(current())) {	787 while (IsDecimalDigit(current())) {

732 int next = current() - '0';	788 intptr_t next = current() - '0';

733 if (max > (RegExpTree::kInfinity - next) / 10) {	789 if (max > (RegExpTree::kInfinity - next) / 10) {

734 do {	790 do {

735 Advance();	791 Advance();

736 } while (IsDecimalDigit(current()));	792 } while (IsDecimalDigit(current()));

737 max = RegExpTree::kInfinity;	793 max = RegExpTree::kInfinity;

738 break;	794 break;

739 }	795 }

740 max = 10 * max + next;	796 max = 10 * max + next;

741 Advance();	797 Advance();

742 }	798 }

743 if (current() != '}') {	799 if (current() != '}') {

744 Reset(start);	800 Reset(start);

745 return false;	801 return false;

746 }	802 }

747 Advance();	803 Advance();

748 }	804 }

749 } else {	805 } else {

750 Reset(start);	806 Reset(start);

751 return false;	807 return false;

752 }	808 }

753 *min_out = min;	809 *min_out = min;

754 *max_out = max;	810 *max_out = max;

755 return true;	811 return true;

756 }	812 }

757	813

758	814

759 uc32 RegExpParser::ParseOctalLiteral() {	815 uint32_t RegExpParser::ParseOctalLiteral() {

760 DCHECK(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);	816 ASSERT(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);

761 // For compatibility with some other browsers (not all), we parse	817 // For compatibility with some other browsers (not all), we parse

762 // up to three octal digits with a value below 256.	818 // up to three octal digits with a value below 256.

763 uc32 value = current() - '0';	819 uint32_t value = current() - '0';

764 Advance();	820 Advance();

765 if ('0' <= current() && current() <= '7') {	821 if ('0' <= current() && current() <= '7') {

766 value = value * 8 + current() - '0';	822 value = value * 8 + current() - '0';

767 Advance();	823 Advance();

768 if (value < 32 && '0' <= current() && current() <= '7') {	824 if (value < 32 && '0' <= current() && current() <= '7') {

769 value = value * 8 + current() - '0';	825 value = value * 8 + current() - '0';

770 Advance();	826 Advance();

771 }	827 }

772 }	828 }

773 return value;	829 return value;

774 }	830 }

775	831

776	832

777 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {	833 // Returns the value (0 .. 15) of a hexadecimal character c.

778 int start = position();	834 // If c is not a legal hexadecimal character, returns a value < 0.

779 uc32 val = 0;	835 static inline intptr_t HexValue(uint32_t c) {

	836 c -= '0';

	837 if (static_cast<unsigned>(c) <= 9) return c;

	838 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

	839 if (static_cast<unsigned>(c) <= 5) return c + 10;

	840 return -1;

	841 }

	842

	843

	844 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {

	845 intptr_t start = position();

	846 uint32_t val = 0;

780 bool done = false;	847 bool done = false;

781 for (int i = 0; !done; i++) {	848 for (intptr_t i = 0; !done; i++) {

782 uc32 c = current();	849 uint32_t c = current();

783 int d = HexValue(c);	850 intptr_t d = HexValue(c);

784 if (d < 0) {	851 if (d < 0) {

785 Reset(start);	852 Reset(start);

786 return false;	853 return false;

787 }	854 }

788 val = val * 16 + d;	855 val = val * 16 + d;

789 Advance();	856 Advance();

790 if (i == length - 1) {	857 if (i == length - 1) {

791 done = true;	858 done = true;

792 }	859 }

793 }	860 }

794 *value = val;	861 *value = val;

795 return true;	862 return true;

796 }	863 }

797	864

798	865

799 uc32 RegExpParser::ParseClassCharacterEscape() {	866 uint32_t RegExpParser::ParseClassCharacterEscape() {

800 DCHECK(current() == '\\');	867 ASSERT(current() == '\\');

801 DCHECK(has_next() && !IsSpecialClassEscape(Next()));	868 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));

802 Advance();	869 Advance();

803 switch (current()) {	870 switch (current()) {

804 case 'b':	871 case 'b':

805 Advance();	872 Advance();

806 return '\b';	873 return '\b';

807 // ControlEscape :: one of	874 // ControlEscape :: one of

808 // f n r t v	875 // f n r t v

809 case 'f':	876 case 'f':

810 Advance();	877 Advance();

811 return '\f';	878 return '\f';

812 case 'n':	879 case 'n':

813 Advance();	880 Advance();

814 return '\n';	881 return '\n';

815 case 'r':	882 case 'r':

816 Advance();	883 Advance();

817 return '\r';	884 return '\r';

818 case 't':	885 case 't':

819 Advance();	886 Advance();

820 return '\t';	887 return '\t';

821 case 'v':	888 case 'v':

822 Advance();	889 Advance();

823 return '\v';	890 return '\v';

824 case 'c': {	891 case 'c': {

825 uc32 controlLetter = Next();	892 uint32_t controlLetter = Next();

826 uc32 letter = controlLetter & ~('A' ^ 'a');	893 uint32_t letter = controlLetter & ~('A' ^ 'a');

827 // For compatibility with JSC, inside a character class	894 // For compatibility with JSC, inside a character class

828 // we also accept digits and underscore as control characters.	895 // we also accept digits and underscore as control characters.

829 if ((controlLetter >= '0' && controlLetter <= '9') \|\|	896 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

830 controlLetter == '_' \|\|	897 controlLetter == '_' \|\|

831 (letter >= 'A' && letter <= 'Z')) {	898 (letter >= 'A' && letter <= 'Z')) {

832 Advance(2);	899 Advance(2);

833 // Control letters mapped to ASCII control characters in the range	900 // Control letters mapped to ASCII control characters in the range

834 // 0x00-0x1f.	901 // 0x00-0x1f.

835 return controlLetter & 0x1f;	902 return controlLetter & 0x1f;

836 }	903 }

837 // We match JSC in reading the backslash as a literal	904 // We match JSC in reading the backslash as a literal

838 // character instead of as starting an escape.	905 // character instead of as starting an escape.

839 return '\\';	906 return '\\';

840 }	907 }

841 case '0': case '1': case '2': case '3': case '4': case '5':	908 case '0': case '1': case '2': case '3': case '4': case '5':

842 case '6': case '7':	909 case '6': case '7':

843 // For compatibility, we interpret a decimal escape that isn't	910 // For compatibility, we interpret a decimal escape that isn't

844 // a back reference (and therefore either \0 or not valid according	911 // a back reference (and therefore either \0 or not valid according

845 // to the specification) as a 1..3 digit octal character code.	912 // to the specification) as a 1..3 digit octal character code.

846 return ParseOctalLiteral();	913 return ParseOctalLiteral();

847 case 'x': {	914 case 'x': {

848 Advance();	915 Advance();

849 uc32 value;	916 uint32_t value;

850 if (ParseHexEscape(2, &value)) {	917 if (ParseHexEscape(2, &value)) {

851 return value;	918 return value;

852 }	919 }

853 // If \x is not followed by a two-digit hexadecimal, treat it	920 // If \x is not followed by a two-digit hexadecimal, treat it

854 // as an identity escape.	921 // as an identity escape.

855 return 'x';	922 return 'x';

856 }	923 }

857 case 'u': {	924 case 'u': {

858 Advance();	925 Advance();

859 uc32 value;	926 uint32_t value;

860 if (ParseHexEscape(4, &value)) {	927 if (ParseHexEscape(4, &value)) {

861 return value;	928 return value;

862 }	929 }

863 // If \u is not followed by a four-digit hexadecimal, treat it	930 // If \u is not followed by a four-digit hexadecimal, treat it

864 // as an identity escape.	931 // as an identity escape.

865 return 'u';	932 return 'u';

866 }	933 }

867 default: {	934 default: {

868 // Extended identity escape. We accept any character that hasn't	935 // Extended identity escape. We accept any character that hasn't

869 // been matched by a more specific case, not just the subset required	936 // been matched by a more specific case, not just the subset required

870 // by the ECMAScript specification.	937 // by the ECMAScript specification.

871 uc32 result = current();	938 uint32_t result = current();

872 Advance();	939 Advance();

873 return result;	940 return result;

874 }	941 }

875 }	942 }

876 return 0;	943 return 0;

877 }	944 }

878	945

879	946

880 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {	947 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {

881 DCHECK_EQ(0, *char_class);	948 ASSERT(0 == *char_class);

882 uc32 first = current();	949 uint32_t first = current();

883 if (first == '\\') {	950 if (first == '\\') {

884 switch (Next()) {	951 switch (Next()) {

885 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {	952 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

886 *char_class = Next();	953 *char_class = Next();

887 Advance(2);	954 Advance(2);

888 return CharacterRange::Singleton(0); // Return dummy value.	955 return CharacterRange::Singleton(0); // Return dummy value.

889 }	956 }

890 case kEndMarker:	957 case kEndMarker:

891 return ReportError(CStrVector("\\ at end of pattern"));	958 ReportError("\\ at end of pattern");

	959 UNREACHABLE();

892 default:	960 default:

893 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);	961 uint32_t c = ParseClassCharacterEscape();

894 return CharacterRange::Singleton(c);	962 return CharacterRange::Singleton(c);

895 }	963 }

896 } else {	964 } else {

897 Advance();	965 Advance();

898 return CharacterRange::Singleton(first);	966 return CharacterRange::Singleton(first);

899 }	967 }

900 }	968 }

901	969

902	970

903 static const uc16 kNoCharClass = 0;	971 static const uint16_t kNoCharClass = 0;

904	972

905 // Adds range or pre-defined character class to character ranges.	973 // Adds range or pre-defined character class to character ranges.

906 // If char_class is not kInvalidClass, it's interpreted as a class	974 // If char_class is not kInvalidClass, it's interpreted as a class

907 // escape (i.e., 's' means whitespace, from '\s').	975 // escape (i.e., 's' means whitespace, from '\s').

908 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	976 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,

909 uc16 char_class,	977 uint16_t char_class,

910 CharacterRange range,	978 CharacterRange range) {

911 Zone* zone) {

912 if (char_class != kNoCharClass) {	979 if (char_class != kNoCharClass) {

913 CharacterRange::AddClassEscape(char_class, ranges, zone);	980 CharacterRange::AddClassEscape(char_class, ranges);

914 } else {	981 } else {

915 ranges->Add(range, zone);	982 ranges->Add(range);

916 }	983 }

917 }	984 }

918	985

919	986

920 RegExpTree* RegExpParser::ParseCharacterClass() {	987 RegExpTree* RegExpParser::ParseCharacterClass() {

921 static const char* kUnterminated = "Unterminated character class";	988 static const char* kUnterminated = "Unterminated character class";

922 static const char* kRangeOutOfOrder = "Range out of order in character class";	989 static const char* kRangeOutOfOrder = "Range out of order in character class";

923	990

924 DCHECK_EQ(current(), '[');	991 ASSERT(current() == '[');

925 Advance();	992 Advance();

926 bool is_negated = false;	993 bool is_negated = false;

927 if (current() == '^') {	994 if (current() == '^') {

928 is_negated = true;	995 is_negated = true;

929 Advance();	996 Advance();

930 }	997 }

931 ZoneList<CharacterRange>* ranges =	998 ZoneGrowableArray<CharacterRange>* ranges =

932 new(zone()) ZoneList<CharacterRange>(2, zone());	999 new(I) ZoneGrowableArray<CharacterRange>(2);

933 while (has_more() && current() != ']') {	1000 while (has_more() && current() != ']') {

934 uc16 char_class = kNoCharClass;	1001 uint16_t char_class = kNoCharClass;

935 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);	1002 CharacterRange first = ParseClassAtom(&char_class);

936 if (current() == '-') {	1003 if (current() == '-') {

937 Advance();	1004 Advance();

938 if (current() == kEndMarker) {	1005 if (current() == kEndMarker) {

939 // If we reach the end we break out of the loop and let the	1006 // If we reach the end we break out of the loop and let the

940 // following code report an error.	1007 // following code report an error.

941 break;	1008 break;

942 } else if (current() == ']') {	1009 } else if (current() == ']') {

943 AddRangeOrEscape(ranges, char_class, first, zone());	1010 AddRangeOrEscape(ranges, char_class, first);

944 ranges->Add(CharacterRange::Singleton('-'), zone());	1011 ranges->Add(CharacterRange::Singleton('-'));

945 break;	1012 break;

946 }	1013 }

947 uc16 char_class_2 = kNoCharClass;	1014 uint16_t char_class_2 = kNoCharClass;

948 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);	1015 CharacterRange next = ParseClassAtom(&char_class_2);

949 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {	1016 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {

950 // Either end is an escaped character class. Treat the '-' verbatim.	1017 // Either end is an escaped character class. Treat the '-' verbatim.

951 AddRangeOrEscape(ranges, char_class, first, zone());	1018 AddRangeOrEscape(ranges, char_class, first);

952 ranges->Add(CharacterRange::Singleton('-'), zone());	1019 ranges->Add(CharacterRange::Singleton('-'));

953 AddRangeOrEscape(ranges, char_class_2, next, zone());	1020 AddRangeOrEscape(ranges, char_class_2, next);

954 continue;	1021 continue;

955 }	1022 }

956 if (first.from() > next.to()) {	1023 if (first.from() > next.to()) {

957 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);	1024 ReportError(kRangeOutOfOrder);

	1025 UNREACHABLE();

958 }	1026 }

959 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());	1027 ranges->Add(CharacterRange::Range(first.from(), next.to()));

960 } else {	1028 } else {

961 AddRangeOrEscape(ranges, char_class, first, zone());	1029 AddRangeOrEscape(ranges, char_class, first);

962 }	1030 }

963 }	1031 }

964 if (!has_more()) {	1032 if (!has_more()) {

965 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);	1033 ReportError(kUnterminated);

	1034 UNREACHABLE();

966 }	1035 }

967 Advance();	1036 Advance();

968 if (ranges->length() == 0) {	1037 if (ranges->length() == 0) {

969 ranges->Add(CharacterRange::Everything(), zone());	1038 ranges->Add(CharacterRange::Everything());

970 is_negated = !is_negated;	1039 is_negated = !is_negated;

971 }	1040 }

972 return new(zone()) RegExpCharacterClass(ranges, is_negated);	1041 return new(I) RegExpCharacterClass(ranges, is_negated);

973 }	1042 }

974	1043

975	1044

976 // ----------------------------------------------------------------------------	1045 // ----------------------------------------------------------------------------

977 // The Parser interface.	1046 // The Parser interface.

978	1047

979 bool RegExpParser::ParseRegExp(FlatStringReader* input,	1048 bool RegExpParser::ParseRegExp(const String& input,

980 bool multiline,	1049 bool multiline,

981 RegExpCompileData* result,	1050 RegExpCompileData* result) {

982 Zone* zone) {	1051 ASSERT(result != NULL);

983 DCHECK(result != NULL);	1052 LongJumpScope jump;

984 RegExpParser parser(input, &result->error, multiline, zone);	1053 RegExpParser parser(input, &result->error, multiline);

985 RegExpTree* tree = parser.ParsePattern();	1054 if (setjmp(*jump.Set()) == 0) {

986 if (parser.failed()) {	1055 RegExpTree* tree = parser.ParsePattern();

987 DCHECK(tree == NULL);	1056 ASSERT(tree != NULL);

988 DCHECK(!result->error.is_null());	1057 ASSERT(result->error.IsNull());

989 } else {

990 DCHECK(tree != NULL);

991 DCHECK(result->error.is_null());

992 result->tree = tree;	1058 result->tree = tree;

993 int capture_count = parser.captures_started();	1059 intptr_t capture_count = parser.captures_started();

994 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;	1060 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

995 result->contains_anchor = parser.contains_anchor();	1061 result->contains_anchor = parser.contains_anchor();

996 result->capture_count = capture_count;	1062 result->capture_count = capture_count;

	1063 } else {

	1064 ASSERT(!result->error.IsNull());

	1065 Isolate::Current()->object_store()->clear_sticky_error();

	1066

	1067 // Throw a FormatException on parsing failures.

	1068 const String& message = String::Handle(

	1069 String::Concat(result->error, input));

	1070 const Array& args = Array::Handle(Array::New(1));

	1071 args.SetAt(0, message);

	1072

	1073 Exceptions::ThrowByType(Exceptions::kFormat, args);

997 }	1074 }

998 return !parser.failed();	1075 return !parser.failed();

999 }	1076 }

1000	1077

1001 // SNIP

1002

1003 } // namespace dart	1078 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »