runtime/vm/regexp_parser.cc - Issue 539153002: Port and integrate the irregexp engine from V8

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 539153002: Port and integrate the irregexp engine from V8 (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: Addressed Ivan's comments. Created 6 years, 2 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 #include "vm/longjump.h"

	6 #include "vm/object_store.h"

	7 #include "vm/regexp_parser.h"

	8

	9 namespace dart {

	10

	11 #define I isolate()

	12

	13 // Enables possessive quantifier syntax for testing.

	14 static const bool FLAG_regexp_possessive_quantifier = false;

	15

	16 RegExpBuilder::RegExpBuilder()

	17 : isolate_(Isolate::Current()),

	18 pending_empty_(false),

	19 characters_(NULL),

	20 terms_(),

	21 text_(),

	22 alternatives_()

	23 #ifdef DEBUG

	24 , last_added_(ADD_NONE)

	25 #endif

	26 {}

	27

	28

	29 void RegExpBuilder::FlushCharacters() {

	30 pending_empty_ = false;

	31 if (characters_ != NULL) {

	32 RegExpTree* atom = new(I) RegExpAtom(characters_);

	33 characters_ = NULL;

	34 text_.Add(atom);

	35 LAST(ADD_ATOM);

	36 }

	37 }

	38

	39

	40 void RegExpBuilder::FlushText() {

	41 FlushCharacters();

	42 intptr_t num_text = text_.length();

	43 if (num_text == 0) {

	44 return;

	45 } else if (num_text == 1) {

	46 terms_.Add(text_.Last());

	47 } else {

	48 RegExpText* text = new(I) RegExpText();

	49 for (intptr_t i = 0; i < num_text; i++)

	50 text_[i]->AppendToText(text);

	51 terms_.Add(text);

	52 }

	53 text_.Clear();

	54 }

	55

	56

	57 void RegExpBuilder::AddCharacter(uint16_t c) {

	58 pending_empty_ = false;

	59 if (characters_ == NULL) {

	60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);

	61 }

	62 characters_->Add(c);

	63 LAST(ADD_CHAR);

	64 }

	65

	66

	67 void RegExpBuilder::AddEmpty() {

	68 pending_empty_ = true;

	69 }

	70

	71

	72 void RegExpBuilder::AddAtom(RegExpTree* term) {

	73 if (term->IsEmpty()) {

	74 AddEmpty();

	75 return;

	76 }

	77 if (term->IsTextElement()) {

	78 FlushCharacters();

	79 text_.Add(term);

	80 } else {

	81 FlushText();

	82 terms_.Add(term);

	83 }

	84 LAST(ADD_ATOM);

	85 }

	86

	87

	88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

	89 FlushText();

	90 terms_.Add(assert);

	91 LAST(ADD_ASSERT);

	92 }

	93

	94

	95 void RegExpBuilder::NewAlternative() {

	96 FlushTerms();

	97 }

	98

	99

	100 void RegExpBuilder::FlushTerms() {

	101 FlushText();

	102 intptr_t num_terms = terms_.length();

	103 RegExpTree* alternative;

	104 if (num_terms == 0) {

	105 alternative = RegExpEmpty::GetInstance();

	106 } else if (num_terms == 1) {

	107 alternative = terms_.Last();

	108 } else {

	109 ZoneGrowableArray<RegExpTree> terms =

	110 new(I) ZoneGrowableArray<RegExpTree*>();

	111 for (intptr_t i = 0; i < terms_.length(); i++) {

	112 terms->Add(terms_[i]);

	113 }

	114 alternative = new(I) RegExpAlternative(terms);

	115 }

	116 alternatives_.Add(alternative);

	117 terms_.Clear();

	118 LAST(ADD_NONE);

	119 }

	120

	121

	122 RegExpTree* RegExpBuilder::ToRegExp() {

	123 FlushTerms();

	124 intptr_t num_alternatives = alternatives_.length();

	125 if (num_alternatives == 0) {

	126 return RegExpEmpty::GetInstance();

	127 }

	128 if (num_alternatives == 1) {

	129 return alternatives_.Last();

	130 }

	131 ZoneGrowableArray<RegExpTree> alternatives =

	132 new(I) ZoneGrowableArray<RegExpTree*>();

	133 for (intptr_t i = 0; i < alternatives_.length(); i++) {

	134 alternatives->Add(alternatives_[i]);

	135 }

	136 return new(I) RegExpDisjunction(alternatives);

	137 }

	138

	139

	140 void RegExpBuilder::AddQuantifierToAtom(

	141 intptr_t min,

	142 intptr_t max,

	143 RegExpQuantifier::QuantifierType quantifier_type) {

	144 if (pending_empty_) {

	145 pending_empty_ = false;

	146 return;

	147 }

	148 RegExpTree* atom;

	149 if (characters_ != NULL) {

	150 DEBUG_ASSERT(last_added_ == ADD_CHAR);

	151 // Last atom was character.

	152

	153 ZoneGrowableArray<uint16_t> *char_vector =

	154 new(I) ZoneGrowableArray<uint16_t>();

	155 char_vector->AddArray(*characters_);

	156 intptr_t num_chars = char_vector->length();

	157 if (num_chars > 1) {

	158 ZoneGrowableArray<uint16_t> *prefix =

	159 new(I) ZoneGrowableArray<uint16_t>();

	160 for (intptr_t i = 0; i < num_chars - 1; i++) {

	161 prefix->Add(char_vector->At(i));

	162 }

	163 text_.Add(new(I) RegExpAtom(prefix));

	164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();

	165 tail->Add(char_vector->At(num_chars - 1));

	166 char_vector = tail;

	167 }

	168 characters_ = NULL;

	169 atom = new(I) RegExpAtom(char_vector);

	170 FlushText();

	171 } else if (text_.length() > 0) {

	172 DEBUG_ASSERT(last_added_ == ADD_ATOM);

	173 atom = text_.RemoveLast();

	174 FlushText();

	175 } else if (terms_.length() > 0) {

	176 DEBUG_ASSERT(last_added_ == ADD_ATOM);

	177 atom = terms_.RemoveLast();

	178 if (atom->max_match() == 0) {

	179 // Guaranteed to only match an empty string.

	180 LAST(ADD_TERM);

	181 if (min == 0) {

	182 return;

	183 }

	184 terms_.Add(atom);

	185 return;

	186 }

	187 } else {

	188 // Only call immediately after adding an atom or character!

	189 UNREACHABLE();

	190 return;

	191 }

	192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));

	193 LAST(ADD_TERM);

	194 }

	195

	196 // ----------------------------------------------------------------------------

	197 // Implementation of Parser

	198

	199 RegExpParser::RegExpParser(const String& in,

	200 String* error,

	201 bool multiline)

	202 : isolate_(Isolate::Current()),

	203 error_(error),

	204 captures_(NULL),

	205 in_(in),

	206 current_(kEndMarker),

	207 next_pos_(0),

	208 capture_count_(0),

	209 has_more_(true),

	210 multiline_(multiline),

	211 simple_(false),

	212 contains_anchor_(false),

	213 is_scanned_for_captures_(false),

	214 failed_(false) {

	215 Advance();

	216 }

	217

	218

	219 uint32_t RegExpParser::Next() {

	220 if (has_next()) {

	221 return in().CharAt(next_pos_);

	222 } else {

	223 return kEndMarker;

	224 }

	225 }

	226

	227

	228 void RegExpParser::Advance() {

	229 if (next_pos_ < in().Length()) {

	230 current_ = in().CharAt(next_pos_);

	231 next_pos_++;

	232 } else {

	233 current_ = kEndMarker;

	234 has_more_ = false;

	235 }

	236 }

	237

	238

	239 void RegExpParser::Advance(intptr_t dist) {

	240 next_pos_ += dist - 1;

	241 Advance();

	242 }

	243

	244

	245 void RegExpParser::Reset(intptr_t pos) {

	246 next_pos_ = pos;

	247 has_more_ = (pos < in().Length());

	248 Advance();

	249 }

	250

	251

	252 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {

	253 Isolate* isolate = parsed_function->isolate();

	254 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());

	255

	256 const String& pattern = String::Handle(regexp.pattern());

	257 const bool multiline = regexp.is_multi_line();

	258

	259 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();

	260 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {

	261 // Parsing failures are handled in the JSRegExp factory constructor.

	262 UNREACHABLE();

	263 }

	264

	265 regexp.set_num_bracket_expressions(compile_data->capture_count);

	266 if (compile_data->simple) {

	267 regexp.set_is_simple();

	268 } else {

	269 regexp.set_is_complex();

	270 }

	271

	272 parsed_function->SetRegExpCompileData(compile_data);

	273

	274 return true;

	275 }

	276

	277

	278 bool RegExpParser::ParseRegExp(const String& input,

	279 bool multiline,

	280 RegExpCompileData* result) {

	281 ASSERT(result != NULL);

	282 LongJumpScope jump;

	283 RegExpParser parser(input, &result->error, multiline);

	284 if (setjmp(*jump.Set()) == 0) {

	285 RegExpTree* tree = parser.ParsePattern();

	286 ASSERT(tree != NULL);

	287 ASSERT(result->error.IsNull());

	288 result->tree = tree;

	289 intptr_t capture_count = parser.captures_started();

	290 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

	291 result->contains_anchor = parser.contains_anchor();

	292 result->capture_count = capture_count;

	293 } else {

	294 ASSERT(!result->error.IsNull());

	295 Isolate::Current()->object_store()->clear_sticky_error();

	296

	297 // Throw a FormatException on parsing failures.

	298 const String& message = String::Handle(

	299 String::Concat(result->error, input));

	300 const Array& args = Array::Handle(Array::New(1));

	301 args.SetAt(0, message);

	302

	303 Exceptions::ThrowByType(Exceptions::kFormat, args);

	304 }

	305 return !parser.failed();

	306 }

	307

	308

	309 void RegExpParser::ReportError(const char* message) {

	310 failed_ = true;

	311 *error_ = String::New(message);

	312 // Zip to the end to make sure the no more input is read.

	313 current_ = kEndMarker;

	314 next_pos_ = in().Length();

	315

	316 const Error& error = Error::Handle(LanguageError::New(*error_));

	317 Report::LongJump(error);

	318 UNREACHABLE();

	319 }

	320

	321

	322 // Pattern ::

	323 // Disjunction

	324 RegExpTree* RegExpParser::ParsePattern() {

	325 RegExpTree* result = ParseDisjunction();

	326 ASSERT(!has_more());

	327 // If the result of parsing is a literal string atom, and it has the

	328 // same length as the input, then the atom is identical to the input.

	329 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {

	330 simple_ = true;

	331 }

	332 return result;

	333 }

	334

	335

	336 // Disjunction ::

	337 // Alternative

	338 // Alternative \| Disjunction

	339 // Alternative ::

	340 // [empty]

	341 // Term Alternative

	342 // Term ::

	343 // Assertion

	344 // Atom

	345 // Atom Quantifier

	346 RegExpTree* RegExpParser::ParseDisjunction() {

	347 // Used to store current state while parsing subexpressions.

	348 RegExpParserState initial_state(NULL, INITIAL, 0, I);

	349 RegExpParserState* stored_state = &initial_state;

	350 // Cache the builder in a local variable for quick access.

	351 RegExpBuilder* builder = initial_state.builder();

	352 while (true) {

	353 switch (current()) {

	354 case kEndMarker:

	355 if (stored_state->IsSubexpression()) {

	356 // Inside a parenthesized group when hitting end of input.

	357 ReportError("Unterminated group");

	358 UNREACHABLE();

	359 }

	360 ASSERT(INITIAL == stored_state->group_type());

	361 // Parsing completed successfully.

	362 return builder->ToRegExp();

	363 case ')': {

	364 if (!stored_state->IsSubexpression()) {

	365 ReportError("Unmatched ')'");

	366 UNREACHABLE();

	367 }

	368 ASSERT(INITIAL != stored_state->group_type());

	369

	370 Advance();

	371 // End disjunction parsing and convert builder content to new single

	372 // regexp atom.

	373 RegExpTree* body = builder->ToRegExp();

	374

	375 intptr_t end_capture_index = captures_started();

	376

	377 intptr_t capture_index = stored_state->capture_index();

	378 SubexpressionType group_type = stored_state->group_type();

	379

	380 // Restore previous state.

	381 stored_state = stored_state->previous_state();

	382 builder = stored_state->builder();

	383

	384 // Build result of subexpression.

	385 if (group_type == CAPTURE) {

	386 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);

	387 (*captures_)[capture_index - 1] = capture;

	388 body = capture;

	389 } else if (group_type != GROUPING) {

	390 ASSERT(group_type == POSITIVE_LOOKAHEAD \|\|

	391 group_type == NEGATIVE_LOOKAHEAD);

	392 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);

	393 body = new(I) RegExpLookahead(body,

	394 is_positive,

	395 end_capture_index - capture_index,

	396 capture_index);

	397 }

	398 builder->AddAtom(body);

	399 // For compatibility with JSC and ES3, we allow quantifiers after

	400 // lookaheads, and break in all cases.

	401 break;

	402 }

	403 case '\|': {

	404 Advance();

	405 builder->NewAlternative();

	406 continue;

	407 }

	408 case '*':

	409 case '+':

	410 case '?':

	411 ReportError("Nothing to repeat");

	412 UNREACHABLE();

	413 case '^': {

	414 Advance();

	415 if (multiline_) {

	416 builder->AddAssertion(

	417 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));

	418 } else {

	419 builder->AddAssertion(

	420 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

	421 set_contains_anchor();

	422 }

	423 continue;

	424 }

	425 case '$': {

	426 Advance();

	427 RegExpAssertion::AssertionType assertion_type =

	428 multiline_ ? RegExpAssertion::END_OF_LINE :

	429 RegExpAssertion::END_OF_INPUT;

	430 builder->AddAssertion(new RegExpAssertion(assertion_type));

	431 continue;

	432 }

	433 case '.': {

	434 Advance();

	435 // everything except \x0a, \x0d, \u2028 and \u2029

	436 ZoneGrowableArray<CharacterRange>* ranges =

	437 new ZoneGrowableArray<CharacterRange>(2);

	438 CharacterRange::AddClassEscape('.', ranges);

	439 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	440 builder->AddAtom(atom);

	441 break;

	442 }

	443 case '(': {

	444 SubexpressionType subexpr_type = CAPTURE;

	445 Advance();

	446 if (current() == '?') {

	447 switch (Next()) {

	448 case ':':

	449 subexpr_type = GROUPING;

	450 break;

	451 case '=':

	452 subexpr_type = POSITIVE_LOOKAHEAD;

	453 break;

	454 case '!':

	455 subexpr_type = NEGATIVE_LOOKAHEAD;

	456 break;

	457 default:

	458 ReportError("Invalid group");

	459 UNREACHABLE();

	460 }

	461 Advance(2);

	462 } else {

	463 if (captures_ == NULL) {

	464 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);

	465 }

	466 if (captures_started() >= kMaxCaptures) {

	467 ReportError("Too many captures");

	468 UNREACHABLE();

	469 }

	470 captures_->Add(NULL);

	471 }

	472 // Store current state and begin new disjunction parsing.

	473 stored_state = new RegExpParserState(stored_state, subexpr_type,

	474 captures_started(), I);

	475 builder = stored_state->builder();

	476 continue;

	477 }

	478 case '[': {

	479 RegExpTree* atom = ParseCharacterClass();

	480 builder->AddAtom(atom);

	481 break;

	482 }

	483 // Atom ::

	484 // \ AtomEscape

	485 case '\\':

	486 switch (Next()) {

	487 case kEndMarker:

	488 ReportError("\\ at end of pattern");

	489 UNREACHABLE();

	490 case 'b':

	491 Advance(2);

	492 builder->AddAssertion(

	493 new RegExpAssertion(RegExpAssertion::BOUNDARY));

	494 continue;

	495 case 'B':

	496 Advance(2);

	497 builder->AddAssertion(

	498 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

	499 continue;

	500 // AtomEscape ::

	501 // CharacterClassEscape

	502 //

	503 // CharacterClassEscape :: one of

	504 // d D s S w W

	505 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

	506 uint32_t c = Next();

	507 Advance(2);

	508 ZoneGrowableArray<CharacterRange>* ranges =

	509 new ZoneGrowableArray<CharacterRange>(2);

	510 CharacterRange::AddClassEscape(c, ranges);

	511 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	512 builder->AddAtom(atom);

	513 break;

	514 }

	515 case '1': case '2': case '3': case '4': case '5': case '6':

	516 case '7': case '8': case '9': {

	517 intptr_t index = 0;

	518 if (ParseBackReferenceIndex(&index)) {

	519 RegExpCapture* capture = NULL;

	520 if (captures_ != NULL && index <= captures_->length()) {

	521 capture = captures_->At(index - 1);

	522 }

	523 if (capture == NULL) {

	524 builder->AddEmpty();

	525 break;

	526 }

	527 RegExpTree* atom = new RegExpBackReference(capture);

	528 builder->AddAtom(atom);

	529 break;

	530 }

	531 uint32_t first_digit = Next();

	532 if (first_digit == '8' \|\| first_digit == '9') {

	533 // Treat as identity escape

	534 builder->AddCharacter(first_digit);

	535 Advance(2);

	536 break;

	537 }

	538 }

	539 // FALLTHROUGH

	540 case '0': {

	541 Advance();

	542 uint32_t octal = ParseOctalLiteral();

	543 builder->AddCharacter(octal);

	544 break;

	545 }

	546 // ControlEscape :: one of

	547 // f n r t v

	548 case 'f':

	549 Advance(2);

	550 builder->AddCharacter('\f');

	551 break;

	552 case 'n':

	553 Advance(2);

	554 builder->AddCharacter('\n');

	555 break;

	556 case 'r':

	557 Advance(2);

	558 builder->AddCharacter('\r');

	559 break;

	560 case 't':

	561 Advance(2);

	562 builder->AddCharacter('\t');

	563 break;

	564 case 'v':

	565 Advance(2);

	566 builder->AddCharacter('\v');

	567 break;

	568 case 'c': {

	569 Advance();

	570 uint32_t controlLetter = Next();

	571 // Special case if it is an ASCII letter.

	572 // Convert lower case letters to uppercase.

	573 uint32_t letter = controlLetter & ~('a' ^ 'A');

	574 if (letter < 'A' \|\| 'Z' < letter) {

	575 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

	576 // This is outside the specification. We match JSC in

	577 // reading the backslash as a literal character instead

	578 // of as starting an escape.

	579 builder->AddCharacter('\\');

	580 } else {

	581 Advance(2);

	582 builder->AddCharacter(controlLetter & 0x1f);

	583 }

	584 break;

	585 }

	586 case 'x': {

	587 Advance(2);

	588 uint32_t value;

	589 if (ParseHexEscape(2, &value)) {

	590 builder->AddCharacter(value);

	591 } else {

	592 builder->AddCharacter('x');

	593 }

	594 break;

	595 }

	596 case 'u': {

	597 Advance(2);

	598 uint32_t value;

	599 if (ParseHexEscape(4, &value)) {

	600 builder->AddCharacter(value);

	601 } else {

	602 builder->AddCharacter('u');

	603 }

	604 break;

	605 }

	606 default:

	607 // Identity escape.

	608 builder->AddCharacter(Next());

	609 Advance(2);

	610 break;

	611 }

	612 break;

	613 case '{': {

	614 intptr_t dummy;

	615 if (ParseIntervalQuantifier(&dummy, &dummy)) {

	616 ReportError("Nothing to repeat");

	617 UNREACHABLE();

	618 }

	619 // fallthrough

	620 }

	621 default:

	622 builder->AddCharacter(current());

	623 Advance();

	624 break;

	625 } // end switch(current())

	626

	627 intptr_t min;

	628 intptr_t max;

	629 switch (current()) {

	630 // QuantifierPrefix ::

	631 // *

	632 // +

	633 // ?

	634 // {

	635 case '*':

	636 min = 0;

	637 max = RegExpTree::kInfinity;

	638 Advance();

	639 break;

	640 case '+':

	641 min = 1;

	642 max = RegExpTree::kInfinity;

	643 Advance();

	644 break;

	645 case '?':

	646 min = 0;

	647 max = 1;

	648 Advance();

	649 break;

	650 case '{':

	651 if (ParseIntervalQuantifier(&min, &max)) {

	652 if (max < min) {

	653 ReportError("numbers out of order in {} quantifier.");

	654 UNREACHABLE();

	655 }

	656 break;

	657 } else {

	658 continue;

	659 }

	660 default:

	661 continue;

	662 }

	663 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

	664 if (current() == '?') {

	665 quantifier_type = RegExpQuantifier::NON_GREEDY;

	666 Advance();

	667 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

	668 // FLAG_regexp_possessive_quantifier is a debug-only flag.

	669 quantifier_type = RegExpQuantifier::POSSESSIVE;

	670 Advance();

	671 }

	672 builder->AddQuantifierToAtom(min, max, quantifier_type);

	673 }

	674 }

	675

	676

	677 static const uint16_t kNoCharClass = 0;

	678

	679 // Adds range or pre-defined character class to character ranges.

	680 // If char_class is not kInvalidClass, it's interpreted as a class

	681 // escape (i.e., 's' means whitespace, from '\s').

	682 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,

	683 uint16_t char_class,

	684 CharacterRange range) {

	685 if (char_class != kNoCharClass) {

	686 CharacterRange::AddClassEscape(char_class, ranges);

	687 } else {

	688 ranges->Add(range);

	689 }

	690 }

	691

	692

	693 RegExpTree* RegExpParser::ParseCharacterClass() {

	694 static const char* kUnterminated = "Unterminated character class";

	695 static const char* kRangeOutOfOrder = "Range out of order in character class";

	696

	697 ASSERT(current() == '[');

	698 Advance();

	699 bool is_negated = false;

	700 if (current() == '^') {

	701 is_negated = true;

	702 Advance();

	703 }

	704 ZoneGrowableArray<CharacterRange>* ranges =

	705 new(I) ZoneGrowableArray<CharacterRange>(2);

	706 while (has_more() && current() != ']') {

	707 uint16_t char_class = kNoCharClass;

	708 CharacterRange first = ParseClassAtom(&char_class);

	709 if (current() == '-') {

	710 Advance();

	711 if (current() == kEndMarker) {

	712 // If we reach the end we break out of the loop and let the

	713 // following code report an error.

	714 break;

	715 } else if (current() == ']') {

	716 AddRangeOrEscape(ranges, char_class, first);

	717 ranges->Add(CharacterRange::Singleton('-'));

	718 break;

	719 }

	720 uint16_t char_class_2 = kNoCharClass;

	721 CharacterRange next = ParseClassAtom(&char_class_2);

	722 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {

	723 // Either end is an escaped character class. Treat the '-' verbatim.

	724 AddRangeOrEscape(ranges, char_class, first);

	725 ranges->Add(CharacterRange::Singleton('-'));

	726 AddRangeOrEscape(ranges, char_class_2, next);

	727 continue;

	728 }

	729 if (first.from() > next.to()) {

	730 ReportError(kRangeOutOfOrder);

	731 UNREACHABLE();

	732 }

	733 ranges->Add(CharacterRange::Range(first.from(), next.to()));

	734 } else {

	735 AddRangeOrEscape(ranges, char_class, first);

	736 }

	737 }

	738 if (!has_more()) {

	739 ReportError(kUnterminated);

	740 UNREACHABLE();

	741 }

	742 Advance();

	743 if (ranges->length() == 0) {

	744 ranges->Add(CharacterRange::Everything());

	745 is_negated = !is_negated;

	746 }

	747 return new(I) RegExpCharacterClass(ranges, is_negated);

	748 }

	749

	750

	751 #ifdef DEBUG

	752 // Currently only used in an ASSERT.

	753 static bool IsSpecialClassEscape(uint32_t c) {

	754 switch (c) {

	755 case 'd': case 'D':

	756 case 's': case 'S':

	757 case 'w': case 'W':

	758 return true;

	759 default:

	760 return false;

	761 }

	762 }

	763 #endif

	764

	765

	766 // In order to know whether an escape is a backreference or not we have to scan

	767 // the entire regexp and find the number of capturing parentheses. However we

	768 // don't want to scan the regexp twice unless it is necessary. This mini-parser

	769 // is called when needed. It can see the difference between capturing and

	770 // noncapturing parentheses and can skip character classes and backslash-escaped

	771 // characters.

	772 void RegExpParser::ScanForCaptures() {

	773 // Start with captures started previous to current position

	774 intptr_t capture_count = captures_started();

	775 // Add count of captures after this position.

	776 intptr_t n;

	777 while ((n = current()) != kEndMarker) {

	778 Advance();

	779 switch (n) {

	780 case '\\':

	781 Advance();

	782 break;

	783 case '[': {

	784 intptr_t c;

	785 while ((c = current()) != kEndMarker) {

	786 Advance();

	787 if (c == '\\') {

	788 Advance();

	789 } else {

	790 if (c == ']') break;

	791 }

	792 }

	793 break;

	794 }

	795 case '(':

	796 if (current() != '?') capture_count++;

	797 break;

	798 }

	799 }

	800 capture_count_ = capture_count;

	801 is_scanned_for_captures_ = true;

	802 }

	803

	804

	805 static inline bool IsDecimalDigit(int32_t c) {

	806 return '0' <= c && c <= '9';

	807 }

	808

	809

	810 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {

	811 ASSERT('\\' == current());

	812 ASSERT('1' <= Next() && Next() <= '9');

	813 // Try to parse a decimal literal that is no greater than the total number

	814 // of left capturing parentheses in the input.

	815 intptr_t start = position();

	816 intptr_t value = Next() - '0';

	817 Advance(2);

	818 while (true) {

	819 uint32_t c = current();

	820 if (IsDecimalDigit(c)) {

	821 value = 10 * value + (c - '0');

	822 if (value > kMaxCaptures) {

	823 Reset(start);

	824 return false;

	825 }

	826 Advance();

	827 } else {

	828 break;

	829 }

	830 }

	831 if (value > captures_started()) {

	832 if (!is_scanned_for_captures_) {

	833 intptr_t saved_position = position();

	834 ScanForCaptures();

	835 Reset(saved_position);

	836 }

	837 if (value > capture_count_) {

	838 Reset(start);

	839 return false;

	840 }

	841 }

	842 *index_out = value;

	843 return true;

	844 }

	845

	846

	847 // QuantifierPrefix ::

	848 // { DecimalDigits }

	849 // { DecimalDigits , }

	850 // { DecimalDigits , DecimalDigits }

	851 //

	852 // Returns true if parsing succeeds, and set the min_out and max_out

	853 // values. Values are truncated to RegExpTree::kInfinity if they overflow.

	854 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,

	855 intptr_t* max_out) {

	856 ASSERT(current() == '{');

	857 intptr_t start = position();

	858 Advance();

	859 intptr_t min = 0;

	860 if (!IsDecimalDigit(current())) {

	861 Reset(start);

	862 return false;

	863 }

	864 while (IsDecimalDigit(current())) {

	865 intptr_t next = current() - '0';

	866 if (min > (RegExpTree::kInfinity - next) / 10) {

	867 // Overflow. Skip past remaining decimal digits and return -1.

	868 do {

	869 Advance();

	870 } while (IsDecimalDigit(current()));

	871 min = RegExpTree::kInfinity;

	872 break;

	873 }

	874 min = 10 * min + next;

	875 Advance();

	876 }

	877 intptr_t max = 0;

	878 if (current() == '}') {

	879 max = min;

	880 Advance();

	881 } else if (current() == ',') {

	882 Advance();

	883 if (current() == '}') {

	884 max = RegExpTree::kInfinity;

	885 Advance();

	886 } else {

	887 while (IsDecimalDigit(current())) {

	888 intptr_t next = current() - '0';

	889 if (max > (RegExpTree::kInfinity - next) / 10) {

	890 do {

	891 Advance();

	892 } while (IsDecimalDigit(current()));

	893 max = RegExpTree::kInfinity;

	894 break;

	895 }

	896 max = 10 * max + next;

	897 Advance();

	898 }

	899 if (current() != '}') {

	900 Reset(start);

	901 return false;

	902 }

	903 Advance();

	904 }

	905 } else {

	906 Reset(start);

	907 return false;

	908 }

	909 *min_out = min;

	910 *max_out = max;

	911 return true;

	912 }

	913

	914

	915 uint32_t RegExpParser::ParseOctalLiteral() {

	916 ASSERT(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);

	917 // For compatibility with some other browsers (not all), we parse

	918 // up to three octal digits with a value below 256.

	919 uint32_t value = current() - '0';

	920 Advance();

	921 if ('0' <= current() && current() <= '7') {

	922 value = value * 8 + current() - '0';

	923 Advance();

	924 if (value < 32 && '0' <= current() && current() <= '7') {

	925 value = value * 8 + current() - '0';

	926 Advance();

	927 }

	928 }

	929 return value;

	930 }

	931

	932

	933 // Returns the value (0 .. 15) of a hexadecimal character c.

	934 // If c is not a legal hexadecimal character, returns a value < 0.

	935 static inline intptr_t HexValue(uint32_t c) {

	936 c -= '0';

	937 if (static_cast<unsigned>(c) <= 9) return c;

	938 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

	939 if (static_cast<unsigned>(c) <= 5) return c + 10;

	940 return -1;

	941 }

	942

	943

	944 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {

	945 intptr_t start = position();

	946 uint32_t val = 0;

	947 bool done = false;

	948 for (intptr_t i = 0; !done; i++) {

	949 uint32_t c = current();

	950 intptr_t d = HexValue(c);

	951 if (d < 0) {

	952 Reset(start);

	953 return false;

	954 }

	955 val = val * 16 + d;

	956 Advance();

	957 if (i == length - 1) {

	958 done = true;

	959 }

	960 }

	961 *value = val;

	962 return true;

	963 }

	964

	965

	966 uint32_t RegExpParser::ParseClassCharacterEscape() {

	967 ASSERT(current() == '\\');

	968 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));

	969 Advance();

	970 switch (current()) {

	971 case 'b':

	972 Advance();

	973 return '\b';

	974 // ControlEscape :: one of

	975 // f n r t v

	976 case 'f':

	977 Advance();

	978 return '\f';

	979 case 'n':

	980 Advance();

	981 return '\n';

	982 case 'r':

	983 Advance();

	984 return '\r';

	985 case 't':

	986 Advance();

	987 return '\t';

	988 case 'v':

	989 Advance();

	990 return '\v';

	991 case 'c': {

	992 uint32_t controlLetter = Next();

	993 uint32_t letter = controlLetter & ~('A' ^ 'a');

	994 // For compatibility with JSC, inside a character class

	995 // we also accept digits and underscore as control characters.

	996 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

	997 controlLetter == '_' \|\|

	998 (letter >= 'A' && letter <= 'Z')) {

	999 Advance(2);

	1000 // Control letters mapped to ASCII control characters in the range

	1001 // 0x00-0x1f.

	1002 return controlLetter & 0x1f;

	1003 }

	1004 // We match JSC in reading the backslash as a literal

	1005 // character instead of as starting an escape.

	1006 return '\\';

	1007 }

	1008 case '0': case '1': case '2': case '3': case '4': case '5':

	1009 case '6': case '7':

	1010 // For compatibility, we interpret a decimal escape that isn't

	1011 // a back reference (and therefore either \0 or not valid according

	1012 // to the specification) as a 1..3 digit octal character code.

	1013 return ParseOctalLiteral();

	1014 case 'x': {

	1015 Advance();

	1016 uint32_t value;

	1017 if (ParseHexEscape(2, &value)) {

	1018 return value;

	1019 }

	1020 // If \x is not followed by a two-digit hexadecimal, treat it

	1021 // as an identity escape.

	1022 return 'x';

	1023 }

	1024 case 'u': {

	1025 Advance();

	1026 uint32_t value;

	1027 if (ParseHexEscape(4, &value)) {

	1028 return value;

	1029 }

	1030 // If \u is not followed by a four-digit hexadecimal, treat it

	1031 // as an identity escape.

	1032 return 'u';

	1033 }

	1034 default: {

	1035 // Extended identity escape. We accept any character that hasn't

	1036 // been matched by a more specific case, not just the subset required

	1037 // by the ECMAScript specification.

	1038 uint32_t result = current();

	1039 Advance();

	1040 return result;

	1041 }

	1042 }

	1043 return 0;

	1044 }

	1045

	1046

	1047 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {

	1048 ASSERT(0 == *char_class);

	1049 uint32_t first = current();

	1050 if (first == '\\') {

	1051 switch (Next()) {

	1052 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

	1053 *char_class = Next();

	1054 Advance(2);

	1055 return CharacterRange::Singleton(0); // Return dummy value.

	1056 }

	1057 case kEndMarker:

	1058 ReportError("\\ at end of pattern");

	1059 UNREACHABLE();

	1060 default:

	1061 uint32_t c = ParseClassCharacterEscape();

	1062 return CharacterRange::Singleton(c);

	1063 }

	1064 } else {

	1065 Advance();

	1066 return CharacterRange::Singleton(first);

	1067 }

	1068 }

	1069

	1070 } // namespace dart

OLD	NEW

« no previous file with comments | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »