runtime/vm/regexp_parser.cc - Issue 683433003: Integrate the Irregexp Regular Expression Engine.

Side by Side Diff: runtime/vm/regexp_parser.cc

Issue 683433003: Integrate the Irregexp Regular Expression Engine. (Closed) Base URL: https://dart.googlecode.com/svn/branches/bleeding_edge/dart

Patch Set: rebase, enable tests, remove *CodeUnitsAt Created 6 years, 1 month ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch | Annotate | Revision Log

OLD	NEW
(Empty)
	1 // Copyright (c) 2014, the Dart project authors. Please see the AUTHORS file

	2 // for details. All rights reserved. Use of this source code is governed by a

	3 // BSD-style license that can be found in the LICENSE file.

	4

	5 #include "vm/longjump.h"

	6 #include "vm/object_store.h"

	7 #include "vm/regexp_parser.h"

	8

	9 namespace dart {

	10

	11 #define I isolate()

	12

	13 // Enables possessive quantifier syntax for testing.

	14 static const bool FLAG_regexp_possessive_quantifier = false;

	15

	16 RegExpBuilder::RegExpBuilder()

	17 : isolate_(Isolate::Current()),

	18 pending_empty_(false),

	19 characters_(NULL),

	20 terms_(),

	21 text_(),

	22 alternatives_()

	23 #ifdef DEBUG

	24 , last_added_(ADD_NONE)

	25 #endif

	26 {}

	27

	28

	29 void RegExpBuilder::FlushCharacters() {

	30 pending_empty_ = false;

	31 if (characters_ != NULL) {

	32 RegExpTree* atom = new(I) RegExpAtom(characters_);

	33 characters_ = NULL;

	34 text_.Add(atom);

	35 LAST(ADD_ATOM);

	36 }

	37 }

	38

	39

	40 void RegExpBuilder::FlushText() {

	41 FlushCharacters();

	42 intptr_t num_text = text_.length();

	43 if (num_text == 0) {

	44 return;

	45 } else if (num_text == 1) {

	46 terms_.Add(text_.Last());

	47 } else {

	48 RegExpText* text = new(I) RegExpText();

	49 for (intptr_t i = 0; i < num_text; i++)

	50 text_[i]->AppendToText(text);

	51 terms_.Add(text);

	52 }

	53 text_.Clear();

	54 }

	55

	56

	57 void RegExpBuilder::AddCharacter(uint16_t c) {

	58 pending_empty_ = false;

	59 if (characters_ == NULL) {

	60 characters_ = new(I) ZoneGrowableArray<uint16_t>(4);

	61 }

	62 characters_->Add(c);

	63 LAST(ADD_CHAR);

	64 }

	65

	66

	67 void RegExpBuilder::AddEmpty() {

	68 pending_empty_ = true;

	69 }

	70

	71

	72 void RegExpBuilder::AddAtom(RegExpTree* term) {

	73 if (term->IsEmpty()) {

	74 AddEmpty();

	75 return;

	76 }

	77 if (term->IsTextElement()) {

	78 FlushCharacters();

	79 text_.Add(term);

	80 } else {

	81 FlushText();

	82 terms_.Add(term);

	83 }

	84 LAST(ADD_ATOM);

	85 }

	86

	87

	88 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

	89 FlushText();

	90 terms_.Add(assert);

	91 LAST(ADD_ASSERT);

	92 }

	93

	94

	95 void RegExpBuilder::NewAlternative() {

	96 FlushTerms();

	97 }

	98

	99

	100 void RegExpBuilder::FlushTerms() {

	101 FlushText();

	102 intptr_t num_terms = terms_.length();

	103 RegExpTree* alternative;

	104 if (num_terms == 0) {

	105 alternative = RegExpEmpty::GetInstance();

	106 } else if (num_terms == 1) {

	107 alternative = terms_.Last();

	108 } else {

	109 ZoneGrowableArray<RegExpTree> terms =

	110 new(I) ZoneGrowableArray<RegExpTree*>();

	111 for (intptr_t i = 0; i < terms_.length(); i++) {

	112 terms->Add(terms_[i]);

	113 }

	114 alternative = new(I) RegExpAlternative(terms);

	115 }

	116 alternatives_.Add(alternative);

	117 terms_.Clear();

	118 LAST(ADD_NONE);

	119 }

	120

	121

	122 RegExpTree* RegExpBuilder::ToRegExp() {

	123 FlushTerms();

	124 intptr_t num_alternatives = alternatives_.length();

	125 if (num_alternatives == 0) {

	126 return RegExpEmpty::GetInstance();

	127 }

	128 if (num_alternatives == 1) {

	129 return alternatives_.Last();

	130 }

	131 ZoneGrowableArray<RegExpTree> alternatives =

	132 new(I) ZoneGrowableArray<RegExpTree*>();

	133 for (intptr_t i = 0; i < alternatives_.length(); i++) {

	134 alternatives->Add(alternatives_[i]);

	135 }

	136 return new(I) RegExpDisjunction(alternatives);

	137 }

	138

	139

	140 void RegExpBuilder::AddQuantifierToAtom(

	141 intptr_t min,

	142 intptr_t max,

	143 RegExpQuantifier::QuantifierType quantifier_type) {

	144 if (pending_empty_) {

	145 pending_empty_ = false;

	146 return;

	147 }

	148 RegExpTree* atom;

	149 if (characters_ != NULL) {

	150 DEBUG_ASSERT(last_added_ == ADD_CHAR);

	151 // Last atom was character.

	152

	153 ZoneGrowableArray<uint16_t> *char_vector =

	154 new(I) ZoneGrowableArray<uint16_t>();

	155 char_vector->AddArray(*characters_);

	156 intptr_t num_chars = char_vector->length();

	157 if (num_chars > 1) {

	158 ZoneGrowableArray<uint16_t> *prefix =

	159 new(I) ZoneGrowableArray<uint16_t>();

	160 for (intptr_t i = 0; i < num_chars - 1; i++) {

	161 prefix->Add(char_vector->At(i));

	162 }

	163 text_.Add(new(I) RegExpAtom(prefix));

	164 ZoneGrowableArray<uint16_t> *tail = new(I) ZoneGrowableArray<uint16_t>();

	165 tail->Add(char_vector->At(num_chars - 1));

	166 char_vector = tail;

	167 }

	168 characters_ = NULL;

	169 atom = new(I) RegExpAtom(char_vector);

	170 FlushText();

	171 } else if (text_.length() > 0) {

	172 DEBUG_ASSERT(last_added_ == ADD_ATOM);

	173 atom = text_.RemoveLast();

	174 FlushText();

	175 } else if (terms_.length() > 0) {

	176 DEBUG_ASSERT(last_added_ == ADD_ATOM);

	177 atom = terms_.RemoveLast();

	178 if (atom->max_match() == 0) {

	179 // Guaranteed to only match an empty string.

	180 LAST(ADD_TERM);

	181 if (min == 0) {

	182 return;

	183 }

	184 terms_.Add(atom);

	185 return;

	186 }

	187 } else {

	188 // Only call immediately after adding an atom or character!

	189 UNREACHABLE();

	190 return;

	191 }

	192 terms_.Add(new(I) RegExpQuantifier(min, max, quantifier_type, atom));

	193 LAST(ADD_TERM);

	194 }

	195

	196 // ----------------------------------------------------------------------------

	197 // Implementation of Parser

	198

	199 RegExpParser::RegExpParser(const String& in,

	200 String* error,

	201 bool multiline)

	202 : isolate_(Isolate::Current()),

	203 error_(error),

	204 captures_(NULL),

	205 in_(in),

	206 current_(kEndMarker),

	207 next_pos_(0),

	208 capture_count_(0),

	209 has_more_(true),

	210 multiline_(multiline),

	211 simple_(false),

	212 contains_anchor_(false),

	213 is_scanned_for_captures_(false),

	214 failed_(false) {

	215 Advance();

	216 }

	217

	218

	219 bool RegExpParser::ParseFunction(ParsedFunction *parsed_function) {

	220 Isolate* isolate = parsed_function->isolate();

	221 JSRegExp& regexp = JSRegExp::Handle(parsed_function->function().regexp());

	222

	223 const String& pattern = String::Handle(regexp.pattern());

	224 const bool multiline = regexp.is_multi_line();

	225

	226 RegExpCompileData* compile_data = new(isolate) RegExpCompileData();

	227 if (!RegExpParser::ParseRegExp(pattern, multiline, compile_data)) {

	228 // Parsing failures are handled in the JSRegExp factory constructor.

	229 UNREACHABLE();

	230 }

	231

	232 regexp.set_num_bracket_expressions(compile_data->capture_count);

	233 if (compile_data->simple) {

	234 regexp.set_is_simple();

	235 } else {

	236 regexp.set_is_complex();

	237 }

	238

	239 parsed_function->SetRegExpCompileData(compile_data);

	240

	241 return true;

	242 }

	243

	244

	245 uint32_t RegExpParser::Next() {

	246 if (has_next()) {

	247 return in().CharAt(next_pos_);

	248 } else {

	249 return kEndMarker;

	250 }

	251 }

	252

	253

	254 void RegExpParser::Advance() {

	255 if (next_pos_ < in().Length()) {

	256 current_ = in().CharAt(next_pos_);

	257 next_pos_++;

	258 } else {

	259 current_ = kEndMarker;

	260 has_more_ = false;

	261 }

	262 }

	263

	264

	265 void RegExpParser::Reset(intptr_t pos) {

	266 next_pos_ = pos;

	267 has_more_ = (pos < in().Length());

	268 Advance();

	269 }

	270

	271

	272 void RegExpParser::Advance(intptr_t dist) {

	273 next_pos_ += dist - 1;

	274 Advance();

	275 }

	276

	277

	278 bool RegExpParser::simple() {

	279 return simple_;

	280 }

	281

	282

	283 void RegExpParser::ReportError(const char* message) {

	284 failed_ = true;

	285 *error_ = String::New(message);

	286 // Zip to the end to make sure the no more input is read.

	287 current_ = kEndMarker;

	288 next_pos_ = in().Length();

	289

	290 const Error& error = Error::Handle(LanguageError::New(*error_));

	291 Report::LongJump(error);

	292 UNREACHABLE();

	293 }

	294

	295

	296 // Pattern ::

	297 // Disjunction

	298 RegExpTree* RegExpParser::ParsePattern() {

	299 RegExpTree* result = ParseDisjunction();

	300 ASSERT(!has_more());

	301 // If the result of parsing is a literal string atom, and it has the

	302 // same length as the input, then the atom is identical to the input.

	303 if (result->IsAtom() && result->AsAtom()->length() == in().Length()) {

	304 simple_ = true;

	305 }

	306 return result;

	307 }

	308

	309

	310 // Disjunction ::

	311 // Alternative

	312 // Alternative \| Disjunction

	313 // Alternative ::

	314 // [empty]

	315 // Term Alternative

	316 // Term ::

	317 // Assertion

	318 // Atom

	319 // Atom Quantifier

	320 RegExpTree* RegExpParser::ParseDisjunction() {

	321 // Used to store current state while parsing subexpressions.

	322 RegExpParserState initial_state(NULL, INITIAL, 0, I);

	323 RegExpParserState* stored_state = &initial_state;

	324 // Cache the builder in a local variable for quick access.

	325 RegExpBuilder* builder = initial_state.builder();

	326 while (true) {

	327 switch (current()) {

	328 case kEndMarker:

	329 if (stored_state->IsSubexpression()) {

	330 // Inside a parenthesized group when hitting end of input.

	331 ReportError("Unterminated group");

	332 UNREACHABLE();

	333 }

	334 ASSERT(INITIAL == stored_state->group_type());

	335 // Parsing completed successfully.

	336 return builder->ToRegExp();

	337 case ')': {

	338 if (!stored_state->IsSubexpression()) {

	339 ReportError("Unmatched ')'");

	340 UNREACHABLE();

	341 }

	342 ASSERT(INITIAL != stored_state->group_type());

	343

	344 Advance();

	345 // End disjunction parsing and convert builder content to new single

	346 // regexp atom.

	347 RegExpTree* body = builder->ToRegExp();

	348

	349 intptr_t end_capture_index = captures_started();

	350

	351 intptr_t capture_index = stored_state->capture_index();

	352 SubexpressionType group_type = stored_state->group_type();

	353

	354 // Restore previous state.

	355 stored_state = stored_state->previous_state();

	356 builder = stored_state->builder();

	357

	358 // Build result of subexpression.

	359 if (group_type == CAPTURE) {

	360 RegExpCapture* capture = new(I) RegExpCapture(body, capture_index);

	361 (*captures_)[capture_index - 1] = capture;

	362 body = capture;

	363 } else if (group_type != GROUPING) {

	364 ASSERT(group_type == POSITIVE_LOOKAHEAD \|\|

	365 group_type == NEGATIVE_LOOKAHEAD);

	366 bool is_positive = (group_type == POSITIVE_LOOKAHEAD);

	367 body = new(I) RegExpLookahead(body,

	368 is_positive,

	369 end_capture_index - capture_index,

	370 capture_index);

	371 }

	372 builder->AddAtom(body);

	373 // For compatibility with JSC and ES3, we allow quantifiers after

	374 // lookaheads, and break in all cases.

	375 break;

	376 }

	377 case '\|': {

	378 Advance();

	379 builder->NewAlternative();

	380 continue;

	381 }

	382 case '*':

	383 case '+':

	384 case '?':

	385 ReportError("Nothing to repeat");

	386 UNREACHABLE();

	387 case '^': {

	388 Advance();

	389 if (multiline_) {

	390 builder->AddAssertion(

	391 new(I) RegExpAssertion(RegExpAssertion::START_OF_LINE));

	392 } else {

	393 builder->AddAssertion(

	394 new(I) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

	395 set_contains_anchor();

	396 }

	397 continue;

	398 }

	399 case '$': {

	400 Advance();

	401 RegExpAssertion::AssertionType assertion_type =

	402 multiline_ ? RegExpAssertion::END_OF_LINE :

	403 RegExpAssertion::END_OF_INPUT;

	404 builder->AddAssertion(new RegExpAssertion(assertion_type));

	405 continue;

	406 }

	407 case '.': {

	408 Advance();

	409 // everything except \x0a, \x0d, \u2028 and \u2029

	410 ZoneGrowableArray<CharacterRange>* ranges =

	411 new ZoneGrowableArray<CharacterRange>(2);

	412 CharacterRange::AddClassEscape('.', ranges);

	413 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	414 builder->AddAtom(atom);

	415 break;

	416 }

	417 case '(': {

	418 SubexpressionType subexpr_type = CAPTURE;

	419 Advance();

	420 if (current() == '?') {

	421 switch (Next()) {

	422 case ':':

	423 subexpr_type = GROUPING;

	424 break;

	425 case '=':

	426 subexpr_type = POSITIVE_LOOKAHEAD;

	427 break;

	428 case '!':

	429 subexpr_type = NEGATIVE_LOOKAHEAD;

	430 break;

	431 default:

	432 ReportError("Invalid group");

	433 UNREACHABLE();

	434 }

	435 Advance(2);

	436 } else {

	437 if (captures_ == NULL) {

	438 captures_ = new ZoneGrowableArray<RegExpCapture*>(2);

	439 }

	440 if (captures_started() >= kMaxCaptures) {

	441 ReportError("Too many captures");

	442 UNREACHABLE();

	443 }

	444 captures_->Add(NULL);

	445 }

	446 // Store current state and begin new disjunction parsing.

	447 stored_state = new RegExpParserState(stored_state, subexpr_type,

	448 captures_started(), I);

	449 builder = stored_state->builder();

	450 continue;

	451 }

	452 case '[': {

	453 RegExpTree* atom = ParseCharacterClass();

	454 builder->AddAtom(atom);

	455 break;

	456 }

	457 // Atom ::

	458 // \ AtomEscape

	459 case '\\':

	460 switch (Next()) {

	461 case kEndMarker:

	462 ReportError("\\ at end of pattern");

	463 UNREACHABLE();

	464 case 'b':

	465 Advance(2);

	466 builder->AddAssertion(

	467 new RegExpAssertion(RegExpAssertion::BOUNDARY));

	468 continue;

	469 case 'B':

	470 Advance(2);

	471 builder->AddAssertion(

	472 new RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

	473 continue;

	474 // AtomEscape ::

	475 // CharacterClassEscape

	476 //

	477 // CharacterClassEscape :: one of

	478 // d D s S w W

	479 case 'd': case 'D': case 's': case 'S': case 'w': case 'W': {

	480 uint32_t c = Next();

	481 Advance(2);

	482 ZoneGrowableArray<CharacterRange>* ranges =

	483 new ZoneGrowableArray<CharacterRange>(2);

	484 CharacterRange::AddClassEscape(c, ranges);

	485 RegExpTree* atom = new RegExpCharacterClass(ranges, false);

	486 builder->AddAtom(atom);

	487 break;

	488 }

	489 case '1': case '2': case '3': case '4': case '5': case '6':

	490 case '7': case '8': case '9': {

	491 intptr_t index = 0;

	492 if (ParseBackReferenceIndex(&index)) {

	493 RegExpCapture* capture = NULL;

	494 if (captures_ != NULL && index <= captures_->length()) {

	495 capture = captures_->At(index - 1);

	496 }

	497 if (capture == NULL) {

	498 builder->AddEmpty();

	499 break;

	500 }

	501 RegExpTree* atom = new RegExpBackReference(capture);

	502 builder->AddAtom(atom);

	503 break;

	504 }

	505 uint32_t first_digit = Next();

	506 if (first_digit == '8' \|\| first_digit == '9') {

	507 // Treat as identity escape

	508 builder->AddCharacter(first_digit);

	509 Advance(2);

	510 break;

	511 }

	512 }

	513 // FALLTHROUGH

	514 case '0': {

	515 Advance();

	516 uint32_t octal = ParseOctalLiteral();

	517 builder->AddCharacter(octal);

	518 break;

	519 }

	520 // ControlEscape :: one of

	521 // f n r t v

	522 case 'f':

	523 Advance(2);

	524 builder->AddCharacter('\f');

	525 break;

	526 case 'n':

	527 Advance(2);

	528 builder->AddCharacter('\n');

	529 break;

	530 case 'r':

	531 Advance(2);

	532 builder->AddCharacter('\r');

	533 break;

	534 case 't':

	535 Advance(2);

	536 builder->AddCharacter('\t');

	537 break;

	538 case 'v':

	539 Advance(2);

	540 builder->AddCharacter('\v');

	541 break;

	542 case 'c': {

	543 Advance();

	544 uint32_t controlLetter = Next();

	545 // Special case if it is an ASCII letter.

	546 // Convert lower case letters to uppercase.

	547 uint32_t letter = controlLetter & ~('a' ^ 'A');

	548 if (letter < 'A' \|\| 'Z' < letter) {

	549 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

	550 // This is outside the specification. We match JSC in

	551 // reading the backslash as a literal character instead

	552 // of as starting an escape.

	553 builder->AddCharacter('\\');

	554 } else {

	555 Advance(2);

	556 builder->AddCharacter(controlLetter & 0x1f);

	557 }

	558 break;

	559 }

	560 case 'x': {

	561 Advance(2);

	562 uint32_t value;

	563 if (ParseHexEscape(2, &value)) {

	564 builder->AddCharacter(value);

	565 } else {

	566 builder->AddCharacter('x');

	567 }

	568 break;

	569 }

	570 case 'u': {

	571 Advance(2);

	572 uint32_t value;

	573 if (ParseHexEscape(4, &value)) {

	574 builder->AddCharacter(value);

	575 } else {

	576 builder->AddCharacter('u');

	577 }

	578 break;

	579 }

	580 default:

	581 // Identity escape.

	582 builder->AddCharacter(Next());

	583 Advance(2);

	584 break;

	585 }

	586 break;

	587 case '{': {

	588 intptr_t dummy;

	589 if (ParseIntervalQuantifier(&dummy, &dummy)) {

	590 ReportError("Nothing to repeat");

	591 UNREACHABLE();

	592 }

	593 // fallthrough

	594 }

	595 default:

	596 builder->AddCharacter(current());

	597 Advance();

	598 break;

	599 } // end switch(current())

	600

	601 intptr_t min;

	602 intptr_t max;

	603 switch (current()) {

	604 // QuantifierPrefix ::

	605 // *

	606 // +

	607 // ?

	608 // {

	609 case '*':

	610 min = 0;

	611 max = RegExpTree::kInfinity;

	612 Advance();

	613 break;

	614 case '+':

	615 min = 1;

	616 max = RegExpTree::kInfinity;

	617 Advance();

	618 break;

	619 case '?':

	620 min = 0;

	621 max = 1;

	622 Advance();

	623 break;

	624 case '{':

	625 if (ParseIntervalQuantifier(&min, &max)) {

	626 if (max < min) {

	627 ReportError("numbers out of order in {} quantifier.");

	628 UNREACHABLE();

	629 }

	630 break;

	631 } else {

	632 continue;

	633 }

	634 default:

	635 continue;

	636 }

	637 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

	638 if (current() == '?') {

	639 quantifier_type = RegExpQuantifier::NON_GREEDY;

	640 Advance();

	641 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

	642 // FLAG_regexp_possessive_quantifier is a debug-only flag.

	643 quantifier_type = RegExpQuantifier::POSSESSIVE;

	644 Advance();

	645 }

	646 builder->AddQuantifierToAtom(min, max, quantifier_type);

	647 }

	648 }

	649

	650

	651 #ifdef DEBUG

	652 // Currently only used in an ASSERT.

	653 static bool IsSpecialClassEscape(uint32_t c) {

	654 switch (c) {

	655 case 'd': case 'D':

	656 case 's': case 'S':

	657 case 'w': case 'W':

	658 return true;

	659 default:

	660 return false;

	661 }

	662 }

	663 #endif

	664

	665

	666 // In order to know whether an escape is a backreference or not we have to scan

	667 // the entire regexp and find the number of capturing parentheses. However we

	668 // don't want to scan the regexp twice unless it is necessary. This mini-parser

	669 // is called when needed. It can see the difference between capturing and

	670 // noncapturing parentheses and can skip character classes and backslash-escaped

	671 // characters.

	672 void RegExpParser::ScanForCaptures() {

	673 // Start with captures started previous to current position

	674 intptr_t capture_count = captures_started();

	675 // Add count of captures after this position.

	676 intptr_t n;

	677 while ((n = current()) != kEndMarker) {

	678 Advance();

	679 switch (n) {

	680 case '\\':

	681 Advance();

	682 break;

	683 case '[': {

	684 intptr_t c;

	685 while ((c = current()) != kEndMarker) {

	686 Advance();

	687 if (c == '\\') {

	688 Advance();

	689 } else {

	690 if (c == ']') break;

	691 }

	692 }

	693 break;

	694 }

	695 case '(':

	696 if (current() != '?') capture_count++;

	697 break;

	698 }

	699 }

	700 capture_count_ = capture_count;

	701 is_scanned_for_captures_ = true;

	702 }

	703

	704

	705 static inline bool IsDecimalDigit(int32_t c) {

	706 return '0' <= c && c <= '9';

	707 }

	708

	709

	710 bool RegExpParser::ParseBackReferenceIndex(intptr_t* index_out) {

	711 ASSERT('\\' == current());

	712 ASSERT('1' <= Next() && Next() <= '9');

	713 // Try to parse a decimal literal that is no greater than the total number

	714 // of left capturing parentheses in the input.

	715 intptr_t start = position();

	716 intptr_t value = Next() - '0';

	717 Advance(2);

	718 while (true) {

	719 uint32_t c = current();

	720 if (IsDecimalDigit(c)) {

	721 value = 10 * value + (c - '0');

	722 if (value > kMaxCaptures) {

	723 Reset(start);

	724 return false;

	725 }

	726 Advance();

	727 } else {

	728 break;

	729 }

	730 }

	731 if (value > captures_started()) {

	732 if (!is_scanned_for_captures_) {

	733 intptr_t saved_position = position();

	734 ScanForCaptures();

	735 Reset(saved_position);

	736 }

	737 if (value > capture_count_) {

	738 Reset(start);

	739 return false;

	740 }

	741 }

	742 *index_out = value;

	743 return true;

	744 }

	745

	746

	747 // QuantifierPrefix ::

	748 // { DecimalDigits }

	749 // { DecimalDigits , }

	750 // { DecimalDigits , DecimalDigits }

	751 //

	752 // Returns true if parsing succeeds, and set the min_out and max_out

	753 // values. Values are truncated to RegExpTree::kInfinity if they overflow.

	754 bool RegExpParser::ParseIntervalQuantifier(intptr_t* min_out,

	755 intptr_t* max_out) {

	756 ASSERT(current() == '{');

	757 intptr_t start = position();

	758 Advance();

	759 intptr_t min = 0;

	760 if (!IsDecimalDigit(current())) {

	761 Reset(start);

	762 return false;

	763 }

	764 while (IsDecimalDigit(current())) {

	765 intptr_t next = current() - '0';

	766 if (min > (RegExpTree::kInfinity - next) / 10) {

	767 // Overflow. Skip past remaining decimal digits and return -1.

	768 do {

	769 Advance();

	770 } while (IsDecimalDigit(current()));

	771 min = RegExpTree::kInfinity;

	772 break;

	773 }

	774 min = 10 * min + next;

	775 Advance();

	776 }

	777 intptr_t max = 0;

	778 if (current() == '}') {

	779 max = min;

	780 Advance();

	781 } else if (current() == ',') {

	782 Advance();

	783 if (current() == '}') {

	784 max = RegExpTree::kInfinity;

	785 Advance();

	786 } else {

	787 while (IsDecimalDigit(current())) {

	788 intptr_t next = current() - '0';

	789 if (max > (RegExpTree::kInfinity - next) / 10) {

	790 do {

	791 Advance();

	792 } while (IsDecimalDigit(current()));

	793 max = RegExpTree::kInfinity;

	794 break;

	795 }

	796 max = 10 * max + next;

	797 Advance();

	798 }

	799 if (current() != '}') {

	800 Reset(start);

	801 return false;

	802 }

	803 Advance();

	804 }

	805 } else {

	806 Reset(start);

	807 return false;

	808 }

	809 *min_out = min;

	810 *max_out = max;

	811 return true;

	812 }

	813

	814

	815 uint32_t RegExpParser::ParseOctalLiteral() {

	816 ASSERT(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);

	817 // For compatibility with some other browsers (not all), we parse

	818 // up to three octal digits with a value below 256.

	819 uint32_t value = current() - '0';

	820 Advance();

	821 if ('0' <= current() && current() <= '7') {

	822 value = value * 8 + current() - '0';

	823 Advance();

	824 if (value < 32 && '0' <= current() && current() <= '7') {

	825 value = value * 8 + current() - '0';

	826 Advance();

	827 }

	828 }

	829 return value;

	830 }

	831

	832

	833 // Returns the value (0 .. 15) of a hexadecimal character c.

	834 // If c is not a legal hexadecimal character, returns a value < 0.

	835 static inline intptr_t HexValue(uint32_t c) {

	836 c -= '0';

	837 if (static_cast<unsigned>(c) <= 9) return c;

	838 c = (c \| 0x20) - ('a' - '0'); // detect 0x11..0x16 and 0x31..0x36.

	839 if (static_cast<unsigned>(c) <= 5) return c + 10;

	840 return -1;

	841 }

	842

	843

	844 bool RegExpParser::ParseHexEscape(intptr_t length, uint32_t *value) {

	845 intptr_t start = position();

	846 uint32_t val = 0;

	847 bool done = false;

	848 for (intptr_t i = 0; !done; i++) {

	849 uint32_t c = current();

	850 intptr_t d = HexValue(c);

	851 if (d < 0) {

	852 Reset(start);

	853 return false;

	854 }

	855 val = val * 16 + d;

	856 Advance();

	857 if (i == length - 1) {

	858 done = true;

	859 }

	860 }

	861 *value = val;

	862 return true;

	863 }

	864

	865

	866 uint32_t RegExpParser::ParseClassCharacterEscape() {

	867 ASSERT(current() == '\\');

	868 DEBUG_ASSERT(has_next() && !IsSpecialClassEscape(Next()));

	869 Advance();

	870 switch (current()) {

	871 case 'b':

	872 Advance();

	873 return '\b';

	874 // ControlEscape :: one of

	875 // f n r t v

	876 case 'f':

	877 Advance();

	878 return '\f';

	879 case 'n':

	880 Advance();

	881 return '\n';

	882 case 'r':

	883 Advance();

	884 return '\r';

	885 case 't':

	886 Advance();

	887 return '\t';

	888 case 'v':

	889 Advance();

	890 return '\v';

	891 case 'c': {

	892 uint32_t controlLetter = Next();

	893 uint32_t letter = controlLetter & ~('A' ^ 'a');

	894 // For compatibility with JSC, inside a character class

	895 // we also accept digits and underscore as control characters.

	896 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

	897 controlLetter == '_' \|\|

	898 (letter >= 'A' && letter <= 'Z')) {

	899 Advance(2);

	900 // Control letters mapped to ASCII control characters in the range

	901 // 0x00-0x1f.

	902 return controlLetter & 0x1f;

	903 }

	904 // We match JSC in reading the backslash as a literal

	905 // character instead of as starting an escape.

	906 return '\\';

	907 }

	908 case '0': case '1': case '2': case '3': case '4': case '5':

	909 case '6': case '7':

	910 // For compatibility, we interpret a decimal escape that isn't

	911 // a back reference (and therefore either \0 or not valid according

	912 // to the specification) as a 1..3 digit octal character code.

	913 return ParseOctalLiteral();

	914 case 'x': {

	915 Advance();

	916 uint32_t value;

	917 if (ParseHexEscape(2, &value)) {

	918 return value;

	919 }

	920 // If \x is not followed by a two-digit hexadecimal, treat it

	921 // as an identity escape.

	922 return 'x';

	923 }

	924 case 'u': {

	925 Advance();

	926 uint32_t value;

	927 if (ParseHexEscape(4, &value)) {

	928 return value;

	929 }

	930 // If \u is not followed by a four-digit hexadecimal, treat it

	931 // as an identity escape.

	932 return 'u';

	933 }

	934 default: {

	935 // Extended identity escape. We accept any character that hasn't

	936 // been matched by a more specific case, not just the subset required

	937 // by the ECMAScript specification.

	938 uint32_t result = current();

	939 Advance();

	940 return result;

	941 }

	942 }

	943 return 0;

	944 }

	945

	946

	947 CharacterRange RegExpParser::ParseClassAtom(uint16_t* char_class) {

	948 ASSERT(0 == *char_class);

	949 uint32_t first = current();

	950 if (first == '\\') {

	951 switch (Next()) {

	952 case 'w': case 'W': case 'd': case 'D': case 's': case 'S': {

	953 *char_class = Next();

	954 Advance(2);

	955 return CharacterRange::Singleton(0); // Return dummy value.

	956 }

	957 case kEndMarker:

	958 ReportError("\\ at end of pattern");

	959 UNREACHABLE();

	960 default:

	961 uint32_t c = ParseClassCharacterEscape();

	962 return CharacterRange::Singleton(c);

	963 }

	964 } else {

	965 Advance();

	966 return CharacterRange::Singleton(first);

	967 }

	968 }

	969

	970

	971 static const uint16_t kNoCharClass = 0;

	972

	973 // Adds range or pre-defined character class to character ranges.

	974 // If char_class is not kInvalidClass, it's interpreted as a class

	975 // escape (i.e., 's' means whitespace, from '\s').

	976 static inline void AddRangeOrEscape(ZoneGrowableArray<CharacterRange>* ranges,

	977 uint16_t char_class,

	978 CharacterRange range) {

	979 if (char_class != kNoCharClass) {

	980 CharacterRange::AddClassEscape(char_class, ranges);

	981 } else {

	982 ranges->Add(range);

	983 }

	984 }

	985

	986

	987 RegExpTree* RegExpParser::ParseCharacterClass() {

	988 static const char* kUnterminated = "Unterminated character class";

	989 static const char* kRangeOutOfOrder = "Range out of order in character class";

	990

	991 ASSERT(current() == '[');

	992 Advance();

	993 bool is_negated = false;

	994 if (current() == '^') {

	995 is_negated = true;

	996 Advance();

	997 }

	998 ZoneGrowableArray<CharacterRange>* ranges =

	999 new(I) ZoneGrowableArray<CharacterRange>(2);

	1000 while (has_more() && current() != ']') {

	1001 uint16_t char_class = kNoCharClass;

	1002 CharacterRange first = ParseClassAtom(&char_class);

	1003 if (current() == '-') {

	1004 Advance();

	1005 if (current() == kEndMarker) {

	1006 // If we reach the end we break out of the loop and let the

	1007 // following code report an error.

	1008 break;

	1009 } else if (current() == ']') {

	1010 AddRangeOrEscape(ranges, char_class, first);

	1011 ranges->Add(CharacterRange::Singleton('-'));

	1012 break;

	1013 }

	1014 uint16_t char_class_2 = kNoCharClass;

	1015 CharacterRange next = ParseClassAtom(&char_class_2);

	1016 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {

	1017 // Either end is an escaped character class. Treat the '-' verbatim.

	1018 AddRangeOrEscape(ranges, char_class, first);

	1019 ranges->Add(CharacterRange::Singleton('-'));

	1020 AddRangeOrEscape(ranges, char_class_2, next);

	1021 continue;

	1022 }

	1023 if (first.from() > next.to()) {

	1024 ReportError(kRangeOutOfOrder);

	1025 UNREACHABLE();

	1026 }

	1027 ranges->Add(CharacterRange::Range(first.from(), next.to()));

	1028 } else {

	1029 AddRangeOrEscape(ranges, char_class, first);

	1030 }

	1031 }

	1032 if (!has_more()) {

	1033 ReportError(kUnterminated);

	1034 UNREACHABLE();

	1035 }

	1036 Advance();

	1037 if (ranges->length() == 0) {

	1038 ranges->Add(CharacterRange::Everything());

	1039 is_negated = !is_negated;

	1040 }

	1041 return new(I) RegExpCharacterClass(ranges, is_negated);

	1042 }

	1043

	1044

	1045 // ----------------------------------------------------------------------------

	1046 // The Parser interface.

	1047

	1048 bool RegExpParser::ParseRegExp(const String& input,

	1049 bool multiline,

	1050 RegExpCompileData* result) {

	1051 ASSERT(result != NULL);

	1052 LongJumpScope jump;

	1053 RegExpParser parser(input, &result->error, multiline);

	1054 if (setjmp(*jump.Set()) == 0) {

	1055 RegExpTree* tree = parser.ParsePattern();

	1056 ASSERT(tree != NULL);

	1057 ASSERT(result->error.IsNull());

	1058 result->tree = tree;

	1059 intptr_t capture_count = parser.captures_started();

	1060 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

	1061 result->contains_anchor = parser.contains_anchor();

	1062 result->capture_count = capture_count;

	1063 } else {

	1064 ASSERT(!result->error.IsNull());

	1065 Isolate::Current()->object_store()->clear_sticky_error();

	1066

	1067 // Throw a FormatException on parsing failures.

	1068 const String& message = String::Handle(

	1069 String::Concat(result->error, input));

	1070 const Array& args = Array::Handle(Array::New(1));

	1071 args.SetAt(0, message);

	1072

	1073 Exceptions::ThrowByType(Exceptions::kFormat, args);

	1074 }

	1075 return !parser.failed();

	1076 }

	1077

	1078 } // namespace dart

OLD	NEW

« runtime/vm/object.cc ('K') | « runtime/vm/regexp_parser.h ('k') | runtime/vm/symbols.h » ('j') | no next file with comments »