src/parsing/parser.cc - Issue 1565183002: [regexp] move regexp parser into own files.

Side by Side Diff: src/parsing/parser.cc

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: fix test compile Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/parsing/parser.h"	5 #include "src/parsing/parser.h"

6	6

7 #include "src/api.h"	7 #include "src/api.h"

8 #include "src/ast/ast.h"	8 #include "src/ast/ast.h"

9 #include "src/ast/ast-expression-visitor.h"	9 #include "src/ast/ast-expression-visitor.h"

10 #include "src/ast/ast-literal-reindexer.h"	10 #include "src/ast/ast-literal-reindexer.h"

(...skipping 77 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
88 set_stack_limit(isolate_->stack_guard()->real_climit());	88 set_stack_limit(isolate_->stack_guard()->real_climit());

89 set_unicode_cache(isolate_->unicode_cache());	89 set_unicode_cache(isolate_->unicode_cache());

90 set_script(script);	90 set_script(script);

91	91

92 if (script->type() == Script::TYPE_NATIVE) {	92 if (script->type() == Script::TYPE_NATIVE) {

93 set_native();	93 set_native();

94 }	94 }

95 }	95 }

96	96

97	97

98 RegExpBuilder::RegExpBuilder(Zone* zone)

99 : zone_(zone),

100 pending_empty_(false),

101 characters_(NULL),

102 terms_(),

103 alternatives_()

104 #ifdef DEBUG

105 , last_added_(ADD_NONE)

106 #endif

107 {}

108

109

110 void RegExpBuilder::FlushCharacters() {

111 pending_empty_ = false;

112 if (characters_ != NULL) {

113 RegExpTree* atom = new(zone()) RegExpAtom(characters_->ToConstVector());

114 characters_ = NULL;

115 text_.Add(atom, zone());

116 LAST(ADD_ATOM);

117 }

118 }

119

120

121 void RegExpBuilder::FlushText() {

122 FlushCharacters();

123 int num_text = text_.length();

124 if (num_text == 0) {

125 return;

126 } else if (num_text == 1) {

127 terms_.Add(text_.last(), zone());

128 } else {

129 RegExpText* text = new(zone()) RegExpText(zone());

130 for (int i = 0; i < num_text; i++)

131 text_.Get(i)->AppendToText(text, zone());

132 terms_.Add(text, zone());

133 }

134 text_.Clear();

135 }

136

137

138 void RegExpBuilder::AddCharacter(uc16 c) {

139 pending_empty_ = false;

140 if (characters_ == NULL) {

141 characters_ = new(zone()) ZoneList<uc16>(4, zone());

142 }

143 characters_->Add(c, zone());

144 LAST(ADD_CHAR);

145 }

146

147

148 void RegExpBuilder::AddEmpty() {

149 pending_empty_ = true;

150 }

151

152

153 void RegExpBuilder::AddAtom(RegExpTree* term) {

154 if (term->IsEmpty()) {

155 AddEmpty();

156 return;

157 }

158 if (term->IsTextElement()) {

159 FlushCharacters();

160 text_.Add(term, zone());

161 } else {

162 FlushText();

163 terms_.Add(term, zone());

164 }

165 LAST(ADD_ATOM);

166 }

167

168

169 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

170 FlushText();

171 terms_.Add(assert, zone());

172 LAST(ADD_ASSERT);

173 }

174

175

176 void RegExpBuilder::NewAlternative() {

177 FlushTerms();

178 }

179

180

181 void RegExpBuilder::FlushTerms() {

182 FlushText();

183 int num_terms = terms_.length();

184 RegExpTree* alternative;

185 if (num_terms == 0) {

186 alternative = new (zone()) RegExpEmpty();

187 } else if (num_terms == 1) {

188 alternative = terms_.last();

189 } else {

190 alternative = new(zone()) RegExpAlternative(terms_.GetList(zone()));

191 }

192 alternatives_.Add(alternative, zone());

193 terms_.Clear();

194 LAST(ADD_NONE);

195 }

196

197

198 RegExpTree* RegExpBuilder::ToRegExp() {

199 FlushTerms();

200 int num_alternatives = alternatives_.length();

201 if (num_alternatives == 0) return new (zone()) RegExpEmpty();

202 if (num_alternatives == 1) return alternatives_.last();

203 return new(zone()) RegExpDisjunction(alternatives_.GetList(zone()));

204 }

205

206

207 void RegExpBuilder::AddQuantifierToAtom(

208 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

209 if (pending_empty_) {

210 pending_empty_ = false;

211 return;

212 }

213 RegExpTree* atom;

214 if (characters_ != NULL) {

215 DCHECK(last_added_ == ADD_CHAR);

216 // Last atom was character.

217 Vector<const uc16> char_vector = characters_->ToConstVector();

218 int num_chars = char_vector.length();

219 if (num_chars > 1) {

220 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

221 text_.Add(new(zone()) RegExpAtom(prefix), zone());

222 char_vector = char_vector.SubVector(num_chars - 1, num_chars);

223 }

224 characters_ = NULL;

225 atom = new(zone()) RegExpAtom(char_vector);

226 FlushText();

227 } else if (text_.length() > 0) {

228 DCHECK(last_added_ == ADD_ATOM);

229 atom = text_.RemoveLast();

230 FlushText();

231 } else if (terms_.length() > 0) {

232 DCHECK(last_added_ == ADD_ATOM);

233 atom = terms_.RemoveLast();

234 if (atom->max_match() == 0) {

235 // Guaranteed to only match an empty string.

236 LAST(ADD_TERM);

237 if (min == 0) {

238 return;

239 }

240 terms_.Add(atom, zone());

241 return;

242 }

243 } else {

244 // Only call immediately after adding an atom or character!

245 UNREACHABLE();

246 return;

247 }

248 terms_.Add(

249 new(zone()) RegExpQuantifier(min, max, quantifier_type, atom), zone());

250 LAST(ADD_TERM);

251 }

252

253

254 FunctionEntry ParseData::GetFunctionEntry(int start) {	98 FunctionEntry ParseData::GetFunctionEntry(int start) {

255 // The current pre-data entry must be a FunctionEntry with the given	99 // The current pre-data entry must be a FunctionEntry with the given

256 // start position.	100 // start position.

257 if ((function_index_ + FunctionEntry::kSize <= Length()) &&	101 if ((function_index_ + FunctionEntry::kSize <= Length()) &&

258 (static_cast<int>(Data()[function_index_]) == start)) {	102 (static_cast<int>(Data()[function_index_]) == start)) {

259 int index = function_index_;	103 int index = function_index_;

260 function_index_ += FunctionEntry::kSize;	104 function_index_ += FunctionEntry::kSize;

261 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize);	105 Vector<unsigned> subvector(&(Data()[index]), FunctionEntry::kSize);

262 return FunctionEntry(subvector);	106 return FunctionEntry(subvector);

263 }	107 }

(...skipping 4927 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5191 for (int i = 0; i < use_counts_[feature]; ++i) {	5035 for (int i = 0; i < use_counts_[feature]; ++i) {

5192 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature));	5036 isolate->CountUsage(v8::Isolate::UseCounterFeature(feature));

5193 }	5037 }

5194 }	5038 }

5195 isolate->counters()->total_preparse_skipped()->Increment(	5039 isolate->counters()->total_preparse_skipped()->Increment(

5196 total_preparse_skipped_);	5040 total_preparse_skipped_);

5197 }	5041 }

5198	5042

5199	5043

5200 // ----------------------------------------------------------------------------	5044 // ----------------------------------------------------------------------------

5201 // Regular expressions

5202

5203

5204 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

5205 bool multiline, bool unicode, Isolate* isolate,

5206 Zone* zone)

5207 : isolate_(isolate),

5208 zone_(zone),

5209 error_(error),

5210 captures_(NULL),

5211 in_(in),

5212 current_(kEndMarker),

5213 next_pos_(0),

5214 captures_started_(0),

5215 capture_count_(0),

5216 has_more_(true),

5217 multiline_(multiline),

5218 unicode_(unicode),

5219 simple_(false),

5220 contains_anchor_(false),

5221 is_scanned_for_captures_(false),

5222 failed_(false) {

5223 Advance();

5224 }

5225

5226

5227 uc32 RegExpParser::Next() {

5228 if (has_next()) {

5229 return in()->Get(next_pos_);

5230 } else {

5231 return kEndMarker;

5232 }

5233 }

5234

5235

5236 void RegExpParser::Advance() {

5237 if (next_pos_ < in()->length()) {

5238 StackLimitCheck check(isolate());

5239 if (check.HasOverflowed()) {

5240 ReportError(CStrVector(Isolate::kStackOverflowMessage));

5241 } else if (zone()->excess_allocation()) {

5242 ReportError(CStrVector("Regular expression too large"));

5243 } else {

5244 current_ = in()->Get(next_pos_);

5245 next_pos_++;

5246 }

5247 } else {

5248 current_ = kEndMarker;

5249 // Advance so that position() points to 1-after-the-last-character. This is

5250 // important so that Reset() to this position works correctly.

5251 next_pos_ = in()->length() + 1;

5252 has_more_ = false;

5253 }

5254 }

5255

5256

5257 void RegExpParser::Reset(int pos) {

5258 next_pos_ = pos;

5259 has_more_ = (pos < in()->length());

5260 Advance();

5261 }

5262

5263

5264 void RegExpParser::Advance(int dist) {

5265 next_pos_ += dist - 1;

5266 Advance();

5267 }

5268

5269

5270 bool RegExpParser::simple() {

5271 return simple_;

5272 }

5273

5274

5275 bool RegExpParser::IsSyntaxCharacter(uc32 c) {

5276 return c == '^' \|\| c == '$' \|\| c == '\\' \|\| c == '.' \|\| c == '*' \|\|

5277 c == '+' \|\| c == '?' \|\| c == '(' \|\| c == ')' \|\| c == '[' \|\| c == ']' \|\|

5278 c == '{' \|\| c == '}' \|\| c == '\|';

5279 }

5280

5281

5282 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

5283 failed_ = true;

5284 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();

5285 // Zip to the end to make sure the no more input is read.

5286 current_ = kEndMarker;

5287 next_pos_ = in()->length();

5288 return NULL;

5289 }

5290

5291

5292 // Pattern ::

5293 // Disjunction

5294 RegExpTree* RegExpParser::ParsePattern() {

5295 RegExpTree* result = ParseDisjunction(CHECK_FAILED);

5296 DCHECK(!has_more());

5297 // If the result of parsing is a literal string atom, and it has the

5298 // same length as the input, then the atom is identical to the input.

5299 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {

5300 simple_ = true;

5301 }

5302 return result;

5303 }

5304

5305

5306 // Disjunction ::

5307 // Alternative

5308 // Alternative \| Disjunction

5309 // Alternative ::

5310 // [empty]

5311 // Term Alternative

5312 // Term ::

5313 // Assertion

5314 // Atom

5315 // Atom Quantifier

5316 RegExpTree* RegExpParser::ParseDisjunction() {

5317 // Used to store current state while parsing subexpressions.

5318 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

5319 zone());

5320 RegExpParserState* state = &initial_state;

5321 // Cache the builder in a local variable for quick access.

5322 RegExpBuilder* builder = initial_state.builder();

5323 while (true) {

5324 switch (current()) {

5325 case kEndMarker:

5326 if (state->IsSubexpression()) {

5327 // Inside a parenthesized group when hitting end of input.

5328 ReportError(CStrVector("Unterminated group") CHECK_FAILED);

5329 }

5330 DCHECK_EQ(INITIAL, state->group_type());

5331 // Parsing completed successfully.

5332 return builder->ToRegExp();

5333 case ')': {

5334 if (!state->IsSubexpression()) {

5335 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);

5336 }

5337 DCHECK_NE(INITIAL, state->group_type());

5338

5339 Advance();

5340 // End disjunction parsing and convert builder content to new single

5341 // regexp atom.

5342 RegExpTree* body = builder->ToRegExp();

5343

5344 int end_capture_index = captures_started();

5345

5346 int capture_index = state->capture_index();

5347 SubexpressionType group_type = state->group_type();

5348

5349 // Build result of subexpression.

5350 if (group_type == CAPTURE) {

5351 RegExpCapture* capture = GetCapture(capture_index);

5352 capture->set_body(body);

5353 body = capture;

5354 } else if (group_type != GROUPING) {

5355 DCHECK(group_type == POSITIVE_LOOKAROUND \|\|

5356 group_type == NEGATIVE_LOOKAROUND);

5357 bool is_positive = (group_type == POSITIVE_LOOKAROUND);

5358 body = new (zone()) RegExpLookaround(

5359 body, is_positive, end_capture_index - capture_index, capture_index,

5360 state->lookaround_type());

5361 }

5362

5363 // Restore previous state.

5364 state = state->previous_state();

5365 builder = state->builder();

5366

5367 builder->AddAtom(body);

5368 // For compatability with JSC and ES3, we allow quantifiers after

5369 // lookaheads, and break in all cases.

5370 break;

5371 }

5372 case '\|': {

5373 Advance();

5374 builder->NewAlternative();

5375 continue;

5376 }

5377 case '*':

5378 case '+':

5379 case '?':

5380 return ReportError(CStrVector("Nothing to repeat"));

5381 case '^': {

5382 Advance();

5383 if (multiline_) {

5384 builder->AddAssertion(

5385 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

5386 } else {

5387 builder->AddAssertion(

5388 new(zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

5389 set_contains_anchor();

5390 }

5391 continue;

5392 }

5393 case '$': {

5394 Advance();

5395 RegExpAssertion::AssertionType assertion_type =

5396 multiline_ ? RegExpAssertion::END_OF_LINE :

5397 RegExpAssertion::END_OF_INPUT;

5398 builder->AddAssertion(new(zone()) RegExpAssertion(assertion_type));

5399 continue;

5400 }

5401 case '.': {

5402 Advance();

5403 // everything except \x0a, \x0d, \u2028 and \u2029

5404 ZoneList<CharacterRange>* ranges =

5405 new(zone()) ZoneList<CharacterRange>(2, zone());

5406 CharacterRange::AddClassEscape('.', ranges, zone());

5407 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);

5408 builder->AddAtom(atom);

5409 break;

5410 }

5411 case '(': {

5412 SubexpressionType subexpr_type = CAPTURE;

5413 RegExpLookaround::Type lookaround_type = state->lookaround_type();

5414 Advance();

5415 if (current() == '?') {

5416 switch (Next()) {

5417 case ':':

5418 subexpr_type = GROUPING;

5419 break;

5420 case '=':

5421 lookaround_type = RegExpLookaround::LOOKAHEAD;

5422 subexpr_type = POSITIVE_LOOKAROUND;

5423 break;

5424 case '!':

5425 lookaround_type = RegExpLookaround::LOOKAHEAD;

5426 subexpr_type = NEGATIVE_LOOKAROUND;

5427 break;

5428 case '<':

5429 if (FLAG_harmony_regexp_lookbehind) {

5430 Advance();

5431 lookaround_type = RegExpLookaround::LOOKBEHIND;

5432 if (Next() == '=') {

5433 subexpr_type = POSITIVE_LOOKAROUND;

5434 break;

5435 } else if (Next() == '!') {

5436 subexpr_type = NEGATIVE_LOOKAROUND;

5437 break;

5438 }

5439 }

5440 // Fall through.

5441 default:

5442 ReportError(CStrVector("Invalid group") CHECK_FAILED);

5443 break;

5444 }

5445 Advance(2);

5446 } else {

5447 if (captures_started_ >= kMaxCaptures) {

5448 ReportError(CStrVector("Too many captures") CHECK_FAILED);

5449 }

5450 captures_started_++;

5451 }

5452 // Store current state and begin new disjunction parsing.

5453 state = new (zone()) RegExpParserState(

5454 state, subexpr_type, lookaround_type, captures_started_, zone());

5455 builder = state->builder();

5456 continue;

5457 }

5458 case '[': {

5459 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

5460 builder->AddAtom(atom);

5461 break;

5462 }

5463 // Atom ::

5464 // \ AtomEscape

5465 case '\\':

5466 switch (Next()) {

5467 case kEndMarker:

5468 return ReportError(CStrVector("\\ at end of pattern"));

5469 case 'b':

5470 Advance(2);

5471 builder->AddAssertion(

5472 new(zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));

5473 continue;

5474 case 'B':

5475 Advance(2);

5476 builder->AddAssertion(

5477 new(zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

5478 continue;

5479 // AtomEscape ::

5480 // CharacterClassEscape

5481 //

5482 // CharacterClassEscape :: one of

5483 // d D s S w W

5484 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

5485 uc32 c = Next();

5486 Advance(2);

5487 ZoneList<CharacterRange>* ranges =

5488 new(zone()) ZoneList<CharacterRange>(2, zone());

5489 CharacterRange::AddClassEscape(c, ranges, zone());

5490 RegExpTree* atom = new(zone()) RegExpCharacterClass(ranges, false);

5491 builder->AddAtom(atom);

5492 break;

5493 }

5494 case '1': case '2': case '3': case '4': case '5': case '6':

5495 case '7': case '8': case '9': {

5496 int index = 0;

5497 if (ParseBackReferenceIndex(&index)) {

5498 if (state->IsInsideCaptureGroup(index)) {

5499 // The back reference is inside the capture group it refers to.

5500 // Nothing can possibly have been captured yet, so we use empty

5501 // instead. This ensures that, when checking a back reference,

5502 // the capture registers of the referenced capture are either

5503 // both set or both cleared.

5504 builder->AddEmpty();

5505 } else {

5506 RegExpCapture* capture = GetCapture(index);

5507 RegExpTree* atom = new (zone()) RegExpBackReference(capture);

5508 builder->AddAtom(atom);

5509 }

5510 break;

5511 }

5512 uc32 first_digit = Next();

5513 if (first_digit == '8' \|\| first_digit == '9') {

5514 // If the 'u' flag is present, only syntax characters can be escaped,

5515 // no other identity escapes are allowed. If the 'u' flag is not

5516 // present, all identity escapes are allowed.

5517 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

5518 builder->AddCharacter(first_digit);

5519 Advance(2);

5520 } else {

5521 return ReportError(CStrVector("Invalid escape"));

5522 }

5523 break;

5524 }

5525 }

5526 // FALLTHROUGH

5527 case '0': {

5528 Advance();

5529 uc32 octal = ParseOctalLiteral();

5530 builder->AddCharacter(octal);

5531 break;

5532 }

5533 // ControlEscape :: one of

5534 // f n r t v

5535 case 'f':

5536 Advance(2);

5537 builder->AddCharacter('\f');

5538 break;

5539 case 'n':

5540 Advance(2);

5541 builder->AddCharacter('\n');

5542 break;

5543 case 'r':

5544 Advance(2);

5545 builder->AddCharacter('\r');

5546 break;

5547 case 't':

5548 Advance(2);

5549 builder->AddCharacter('\t');

5550 break;

5551 case 'v':

5552 Advance(2);

5553 builder->AddCharacter('\v');

5554 break;

5555 case 'c': {

5556 Advance();

5557 uc32 controlLetter = Next();

5558 // Special case if it is an ASCII letter.

5559 // Convert lower case letters to uppercase.

5560 uc32 letter = controlLetter & ~('a' ^ 'A');

5561 if (letter < 'A' \|\| 'Z' < letter) {

5562 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

5563 // This is outside the specification. We match JSC in

5564 // reading the backslash as a literal character instead

5565 // of as starting an escape.

5566 builder->AddCharacter('\\');

5567 } else {

5568 Advance(2);

5569 builder->AddCharacter(controlLetter & 0x1f);

5570 }

5571 break;

5572 }

5573 case 'x': {

5574 Advance(2);

5575 uc32 value;

5576 if (ParseHexEscape(2, &value)) {

5577 builder->AddCharacter(value);

5578 } else if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

5579 builder->AddCharacter('x');

5580 } else {

5581 // If the 'u' flag is present, invalid escapes are not treated as

5582 // identity escapes.

5583 return ReportError(CStrVector("Invalid escape"));

5584 }

5585 break;

5586 }

5587 case 'u': {

5588 Advance(2);

5589 uc32 value;

5590 if (ParseUnicodeEscape(&value)) {

5591 builder->AddCharacter(value);

5592 } else if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

5593 builder->AddCharacter('u');

5594 } else {

5595 // If the 'u' flag is present, invalid escapes are not treated as

5596 // identity escapes.

5597 return ReportError(CStrVector("Invalid unicode escape"));

5598 }

5599 break;

5600 }

5601 default:

5602 Advance();

5603 // If the 'u' flag is present, only syntax characters can be escaped, no

5604 // other identity escapes are allowed. If the 'u' flag is not present,

5605 // all identity escapes are allowed.

5606 if (!FLAG_harmony_unicode_regexps \|\| !unicode_ \|\|

5607 IsSyntaxCharacter(current())) {

5608 builder->AddCharacter(current());

5609 Advance();

5610 } else {

5611 return ReportError(CStrVector("Invalid escape"));

5612 }

5613 break;

5614 }

5615 break;

5616 case '{': {

5617 int dummy;

5618 if (ParseIntervalQuantifier(&dummy, &dummy)) {

5619 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

5620 }

5621 // fallthrough

5622 }

5623 default:

5624 builder->AddCharacter(current());

5625 Advance();

5626 break;

5627 } // end switch(current())

5628

5629 int min;

5630 int max;

5631 switch (current()) {

5632 // QuantifierPrefix ::

5633 // *

5634 // +

5635 // ?

5636 // {

5637 case '*':

5638 min = 0;

5639 max = RegExpTree::kInfinity;

5640 Advance();

5641 break;

5642 case '+':

5643 min = 1;

5644 max = RegExpTree::kInfinity;

5645 Advance();

5646 break;

5647 case '?':

5648 min = 0;

5649 max = 1;

5650 Advance();

5651 break;

5652 case '{':

5653 if (ParseIntervalQuantifier(&min, &max)) {

5654 if (max < min) {

5655 ReportError(CStrVector("numbers out of order in {} quantifier.")

5656 CHECK_FAILED);

5657 }

5658 break;

5659 } else {

5660 continue;

5661 }

5662 default:

5663 continue;

5664 }

5665 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

5666 if (current() == '?') {

5667 quantifier_type = RegExpQuantifier::NON_GREEDY;

5668 Advance();

5669 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

5670 // FLAG_regexp_possessive_quantifier is a debug-only flag.

5671 quantifier_type = RegExpQuantifier::POSSESSIVE;

5672 Advance();

5673 }

5674 builder->AddQuantifierToAtom(min, max, quantifier_type);

5675 }

5676 }

5677

5678

5679 #ifdef DEBUG

5680 // Currently only used in an DCHECK.

5681 static bool IsSpecialClassEscape(uc32 c) {

5682 switch (c) {

5683 case 'd': case 'D':

5684 case 's': case 'S':

5685 case 'w': case 'W':

5686 return true;

5687 default:

5688 return false;

5689 }

5690 }

5691 #endif

5692

5693

5694 // In order to know whether an escape is a backreference or not we have to scan

5695 // the entire regexp and find the number of capturing parentheses. However we

5696 // don't want to scan the regexp twice unless it is necessary. This mini-parser

5697 // is called when needed. It can see the difference between capturing and

5698 // noncapturing parentheses and can skip character classes and backslash-escaped

5699 // characters.

5700 void RegExpParser::ScanForCaptures() {

5701 // Start with captures started previous to current position

5702 int capture_count = captures_started();

5703 // Add count of captures after this position.

5704 int n;

5705 while ((n = current()) != kEndMarker) {

5706 Advance();

5707 switch (n) {

5708 case '\\':

5709 Advance();

5710 break;

5711 case '[': {

5712 int c;

5713 while ((c = current()) != kEndMarker) {

5714 Advance();

5715 if (c == '\\') {

5716 Advance();

5717 } else {

5718 if (c == ']') break;

5719 }

5720 }

5721 break;

5722 }

5723 case '(':

5724 if (current() != '?') capture_count++;

5725 break;

5726 }

5727 }

5728 capture_count_ = capture_count;

5729 is_scanned_for_captures_ = true;

5730 }

5731

5732

5733 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

5734 DCHECK_EQ('\\', current());

5735 DCHECK('1' <= Next() && Next() <= '9');

5736 // Try to parse a decimal literal that is no greater than the total number

5737 // of left capturing parentheses in the input.

5738 int start = position();

5739 int value = Next() - '0';

5740 Advance(2);

5741 while (true) {

5742 uc32 c = current();

5743 if (IsDecimalDigit(c)) {

5744 value = 10 * value + (c - '0');

5745 if (value > kMaxCaptures) {

5746 Reset(start);

5747 return false;

5748 }

5749 Advance();

5750 } else {

5751 break;

5752 }

5753 }

5754 if (value > captures_started()) {

5755 if (!is_scanned_for_captures_) {

5756 int saved_position = position();

5757 ScanForCaptures();

5758 Reset(saved_position);

5759 }

5760 if (value > capture_count_) {

5761 Reset(start);

5762 return false;

5763 }

5764 }

5765 *index_out = value;

5766 return true;

5767 }

5768

5769

5770 RegExpCapture* RegExpParser::GetCapture(int index) {

5771 // The index for the capture groups are one-based. Its index in the list is

5772 // zero-based.

5773 int know_captures =

5774 is_scanned_for_captures_ ? capture_count_ : captures_started_;

5775 DCHECK(index <= know_captures);

5776 if (captures_ == NULL) {

5777 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());

5778 }

5779 while (captures_->length() < know_captures) {

5780 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());

5781 }

5782 return captures_->at(index - 1);

5783 }

5784

5785

5786 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

5787 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

5788 if (s->group_type() != CAPTURE) continue;

5789 // Return true if we found the matching capture index.

5790 if (index == s->capture_index()) return true;

5791 // Abort if index is larger than what has been parsed up till this state.

5792 if (index > s->capture_index()) return false;

5793 }

5794 return false;

5795 }

5796

5797

5798 // QuantifierPrefix ::

5799 // { DecimalDigits }

5800 // { DecimalDigits , }

5801 // { DecimalDigits , DecimalDigits }

5802 //

5803 // Returns true if parsing succeeds, and set the min_out and max_out

5804 // values. Values are truncated to RegExpTree::kInfinity if they overflow.

5805 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {

5806 DCHECK_EQ(current(), '{');

5807 int start = position();

5808 Advance();

5809 int min = 0;

5810 if (!IsDecimalDigit(current())) {

5811 Reset(start);

5812 return false;

5813 }

5814 while (IsDecimalDigit(current())) {

5815 int next = current() - '0';

5816 if (min > (RegExpTree::kInfinity - next) / 10) {

5817 // Overflow. Skip past remaining decimal digits and return -1.

5818 do {

5819 Advance();

5820 } while (IsDecimalDigit(current()));

5821 min = RegExpTree::kInfinity;

5822 break;

5823 }

5824 min = 10 * min + next;

5825 Advance();

5826 }

5827 int max = 0;

5828 if (current() == '}') {

5829 max = min;

5830 Advance();

5831 } else if (current() == ',') {

5832 Advance();

5833 if (current() == '}') {

5834 max = RegExpTree::kInfinity;

5835 Advance();

5836 } else {

5837 while (IsDecimalDigit(current())) {

5838 int next = current() - '0';

5839 if (max > (RegExpTree::kInfinity - next) / 10) {

5840 do {

5841 Advance();

5842 } while (IsDecimalDigit(current()));

5843 max = RegExpTree::kInfinity;

5844 break;

5845 }

5846 max = 10 * max + next;

5847 Advance();

5848 }

5849 if (current() != '}') {

5850 Reset(start);

5851 return false;

5852 }

5853 Advance();

5854 }

5855 } else {

5856 Reset(start);

5857 return false;

5858 }

5859 *min_out = min;

5860 *max_out = max;

5861 return true;

5862 }

5863

5864

5865 uc32 RegExpParser::ParseOctalLiteral() {

5866 DCHECK(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);

5867 // For compatibility with some other browsers (not all), we parse

5868 // up to three octal digits with a value below 256.

5869 uc32 value = current() - '0';

5870 Advance();

5871 if ('0' <= current() && current() <= '7') {

5872 value = value * 8 + current() - '0';

5873 Advance();

5874 if (value < 32 && '0' <= current() && current() <= '7') {

5875 value = value * 8 + current() - '0';

5876 Advance();

5877 }

5878 }

5879 return value;

5880 }

5881

5882

5883 bool RegExpParser::ParseHexEscape(int length, uc32* value) {

5884 int start = position();

5885 uc32 val = 0;

5886 for (int i = 0; i < length; ++i) {

5887 uc32 c = current();

5888 int d = HexValue(c);

5889 if (d < 0) {

5890 Reset(start);

5891 return false;

5892 }

5893 val = val * 16 + d;

5894 Advance();

5895 }

5896 *value = val;

5897 return true;

5898 }

5899

5900

5901 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

5902 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

5903 // allowed). In the latter case, the number of hex digits between { } is

5904 // arbitrary. \ and u have already been read.

5905 if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) {

5906 int start = position();

5907 Advance();

5908 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

5909 if (current() == '}') {

5910 Advance();

5911 return true;

5912 }

5913 }

5914 Reset(start);

5915 return false;

5916 }

5917 // \u but no {, or \u{...} escapes not allowed.

5918 return ParseHexEscape(4, value);

5919 }

5920

5921

5922 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

5923 uc32 x = 0;

5924 int d = HexValue(current());

5925 if (d < 0) {

5926 return false;

5927 }

5928 while (d >= 0) {

5929 x = x * 16 + d;

5930 if (x > max_value) {

5931 return false;

5932 }

5933 Advance();

5934 d = HexValue(current());

5935 }

5936 *value = x;

5937 return true;

5938 }

5939

5940

5941 uc32 RegExpParser::ParseClassCharacterEscape() {

5942 DCHECK(current() == '\\');

5943 DCHECK(has_next() && !IsSpecialClassEscape(Next()));

5944 Advance();

5945 switch (current()) {

5946 case 'b':

5947 Advance();

5948 return '\b';

5949 // ControlEscape :: one of

5950 // f n r t v

5951 case 'f':

5952 Advance();

5953 return '\f';

5954 case 'n':

5955 Advance();

5956 return '\n';

5957 case 'r':

5958 Advance();

5959 return '\r';

5960 case 't':

5961 Advance();

5962 return '\t';

5963 case 'v':

5964 Advance();

5965 return '\v';

5966 case 'c': {

5967 uc32 controlLetter = Next();

5968 uc32 letter = controlLetter & ~('A' ^ 'a');

5969 // For compatibility with JSC, inside a character class

5970 // we also accept digits and underscore as control characters.

5971 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

5972 controlLetter == '_' \|\|

5973 (letter >= 'A' && letter <= 'Z')) {

5974 Advance(2);

5975 // Control letters mapped to ASCII control characters in the range

5976 // 0x00-0x1f.

5977 return controlLetter & 0x1f;

5978 }

5979 // We match JSC in reading the backslash as a literal

5980 // character instead of as starting an escape.

5981 return '\\';

5982 }

5983 case '0': case '1': case '2': case '3': case '4': case '5':

5984 case '6': case '7':

5985 // For compatibility, we interpret a decimal escape that isn't

5986 // a back reference (and therefore either \0 or not valid according

5987 // to the specification) as a 1..3 digit octal character code.

5988 return ParseOctalLiteral();

5989 case 'x': {

5990 Advance();

5991 uc32 value;

5992 if (ParseHexEscape(2, &value)) {

5993 return value;

5994 }

5995 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

5996 // If \x is not followed by a two-digit hexadecimal, treat it

5997 // as an identity escape.

5998 return 'x';

5999 }

6000 // If the 'u' flag is present, invalid escapes are not treated as

6001 // identity escapes.

6002 ReportError(CStrVector("Invalid escape"));

6003 return 0;

6004 }

6005 case 'u': {

6006 Advance();

6007 uc32 value;

6008 if (ParseUnicodeEscape(&value)) {

6009 return value;

6010 }

6011 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

6012 return 'u';

6013 }

6014 // If the 'u' flag is present, invalid escapes are not treated as

6015 // identity escapes.

6016 ReportError(CStrVector("Invalid unicode escape"));

6017 return 0;

6018 }

6019 default: {

6020 uc32 result = current();

6021 // If the 'u' flag is present, only syntax characters can be escaped, no

6022 // other identity escapes are allowed. If the 'u' flag is not present, all

6023 // identity escapes are allowed.

6024 if (!FLAG_harmony_unicode_regexps \|\| !unicode_ \|\|

6025 IsSyntaxCharacter(result)) {

6026 Advance();

6027 return result;

6028 }

6029 ReportError(CStrVector("Invalid escape"));

6030 return 0;

6031 }

6032 }

6033 return 0;

6034 }

6035

6036

6037 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

6038 DCHECK_EQ(0, *char_class);

6039 uc32 first = current();

6040 if (first == '\\') {

6041 switch (Next()) {

6042 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

6043 *char_class = Next();

6044 Advance(2);

6045 return CharacterRange::Singleton(0); // Return dummy value.

6046 }

6047 case kEndMarker:

6048 return ReportError(CStrVector("\\ at end of pattern"));

6049 default:

6050 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

6051 return CharacterRange::Singleton(c);

6052 }

6053 } else {

6054 Advance();

6055 return CharacterRange::Singleton(first);

6056 }

6057 }

6058

6059

6060 static const uc16 kNoCharClass = 0;

6061

6062 // Adds range or pre-defined character class to character ranges.

6063 // If char_class is not kInvalidClass, it's interpreted as a class

6064 // escape (i.e., 's' means whitespace, from '\s').

6065 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

6066 uc16 char_class,

6067 CharacterRange range,

6068 Zone* zone) {

6069 if (char_class != kNoCharClass) {

6070 CharacterRange::AddClassEscape(char_class, ranges, zone);

6071 } else {

6072 ranges->Add(range, zone);

6073 }

6074 }

6075

6076

6077 RegExpTree* RegExpParser::ParseCharacterClass() {

6078 static const char* kUnterminated = "Unterminated character class";

6079 static const char* kRangeOutOfOrder = "Range out of order in character class";

6080

6081 DCHECK_EQ(current(), '[');

6082 Advance();

6083 bool is_negated = false;

6084 if (current() == '^') {

6085 is_negated = true;

6086 Advance();

6087 }

6088 ZoneList<CharacterRange>* ranges =

6089 new(zone()) ZoneList<CharacterRange>(2, zone());

6090 while (has_more() && current() != ']') {

6091 uc16 char_class = kNoCharClass;

6092 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);

6093 if (current() == '-') {

6094 Advance();

6095 if (current() == kEndMarker) {

6096 // If we reach the end we break out of the loop and let the

6097 // following code report an error.

6098 break;

6099 } else if (current() == ']') {

6100 AddRangeOrEscape(ranges, char_class, first, zone());

6101 ranges->Add(CharacterRange::Singleton('-'), zone());

6102 break;

6103 }

6104 uc16 char_class_2 = kNoCharClass;

6105 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);

6106 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {

6107 // Either end is an escaped character class. Treat the '-' verbatim.

6108 AddRangeOrEscape(ranges, char_class, first, zone());

6109 ranges->Add(CharacterRange::Singleton('-'), zone());

6110 AddRangeOrEscape(ranges, char_class_2, next, zone());

6111 continue;

6112 }

6113 if (first.from() > next.to()) {

6114 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);

6115 }

6116 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());

6117 } else {

6118 AddRangeOrEscape(ranges, char_class, first, zone());

6119 }

6120 }

6121 if (!has_more()) {

6122 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);

6123 }

6124 Advance();

6125 if (ranges->length() == 0) {

6126 ranges->Add(CharacterRange::Everything(), zone());

6127 is_negated = !is_negated;

6128 }

6129 return new (zone()) RegExpCharacterClass(ranges, is_negated);

6130 }

6131

6132

6133 // ----------------------------------------------------------------------------

6134 // The Parser interface.	5045 // The Parser interface.

6135	5046

6136 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

6137 FlatStringReader* input, bool multiline,

6138 bool unicode, RegExpCompileData* result) {

6139 DCHECK(result != NULL);

6140 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

6141 RegExpTree* tree = parser.ParsePattern();

6142 if (parser.failed()) {

6143 DCHECK(tree == NULL);

6144 DCHECK(!result->error.is_null());

6145 } else {

6146 DCHECK(tree != NULL);

6147 DCHECK(result->error.is_null());

6148 result->tree = tree;

6149 int capture_count = parser.captures_started();

6150 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

6151 result->contains_anchor = parser.contains_anchor();

6152 result->capture_count = capture_count;

6153 }

6154 return !parser.failed();

6155 }

6156

6157	5047

6158 bool Parser::ParseStatic(ParseInfo* info) {	5048 bool Parser::ParseStatic(ParseInfo* info) {

6159 Parser parser(info);	5049 Parser parser(info);

6160 if (parser.Parse(info)) {	5050 if (parser.Parse(info)) {

6161 info->set_language_mode(info->literal()->language_mode());	5051 info->set_language_mode(info->literal()->language_mode());

6162 return true;	5052 return true;

6163 }	5053 }

6164 return false;	5054 return false;

6165 }	5055 }

6166	5056

(...skipping 393 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6560 auto class_literal = value->AsClassLiteral();	5450 auto class_literal = value->AsClassLiteral();

6561 if (class_literal->raw_name() == nullptr) {	5451 if (class_literal->raw_name() == nullptr) {

6562 class_literal->set_raw_name(name);	5452 class_literal->set_raw_name(name);

6563 }	5453 }

6564 }	5454 }

6565 }	5455 }

6566	5456

6567	5457

6568 } // namespace internal	5458 } // namespace internal

6569 } // namespace v8	5459 } // namespace v8

OLD	NEW

« no previous file with comments | « src/parsing/parser.h ('k') | src/parsing/scanner.h » ('j') | no next file with comments »