src/regexp/regexp-parser.cc - Issue 1565183002: [regexp] move regexp parser into own files.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1565183002: [regexp] move regexp parser into own files. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: fix test compile Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
(Empty)
	1 // Copyright 2016 the V8 project authors. All rights reserved.

	2 // Use of this source code is governed by a BSD-style license that can be

	3 // found in the LICENSE file.

	4

	5 #include "src/regexp/regexp-parser.h"

	6

	7 #include "src/char-predicates-inl.h"

	8 #include "src/factory.h"

	9 #include "src/isolate.h"

	10 #include "src/objects-inl.h"

	11 #include "src/regexp/jsregexp.h"

	12 #include "src/utils.h"

	13

	14 namespace v8 {

	15 namespace internal {

	16

	17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

	18 bool multiline, bool unicode, Isolate* isolate,

	19 Zone* zone)

	20 : isolate_(isolate),

	21 zone_(zone),

	22 error_(error),

	23 captures_(NULL),

	24 in_(in),

	25 current_(kEndMarker),

	26 next_pos_(0),

	27 captures_started_(0),

	28 capture_count_(0),

	29 has_more_(true),

	30 multiline_(multiline),

	31 unicode_(unicode),

	32 simple_(false),

	33 contains_anchor_(false),

	34 is_scanned_for_captures_(false),

	35 failed_(false) {

	36 Advance();

	37 }

	38

	39

	40 uc32 RegExpParser::Next() {

	41 if (has_next()) {

	42 return in()->Get(next_pos_);

	43 } else {

	44 return kEndMarker;

	45 }

	46 }

	47

	48

	49 void RegExpParser::Advance() {

	50 if (next_pos_ < in()->length()) {

	51 StackLimitCheck check(isolate());

	52 if (check.HasOverflowed()) {

	53 ReportError(CStrVector(Isolate::kStackOverflowMessage));

	54 } else if (zone()->excess_allocation()) {

	55 ReportError(CStrVector("Regular expression too large"));

	56 } else {

	57 current_ = in()->Get(next_pos_);

	58 next_pos_++;

	59 }

	60 } else {

	61 current_ = kEndMarker;

	62 // Advance so that position() points to 1-after-the-last-character. This is

	63 // important so that Reset() to this position works correctly.

	64 next_pos_ = in()->length() + 1;

	65 has_more_ = false;

	66 }

	67 }

	68

	69

	70 void RegExpParser::Reset(int pos) {

	71 next_pos_ = pos;

	72 has_more_ = (pos < in()->length());

	73 Advance();

	74 }

	75

	76

	77 void RegExpParser::Advance(int dist) {

	78 next_pos_ += dist - 1;

	79 Advance();

	80 }

	81

	82

	83 bool RegExpParser::simple() { return simple_; }

	84

	85

	86 bool RegExpParser::IsSyntaxCharacter(uc32 c) {

	87 return c == '^' \|\| c == '$' \|\| c == '\\' \|\| c == '.' \|\| c == '*' \|\|

	88 c == '+' \|\| c == '?' \|\| c == '(' \|\| c == ')' \|\| c == '[' \|\| c == ']' \|\|

	89 c == '{' \|\| c == '}' \|\| c == '\|';

	90 }

	91

	92

	93 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

	94 failed_ = true;

	95 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();

	96 // Zip to the end to make sure the no more input is read.

	97 current_ = kEndMarker;

	98 next_pos_ = in()->length();

	99 return NULL;

	100 }

	101

	102

	103 #define CHECK_FAILED /**/); \

	104 if (failed_) return NULL; \

	105 ((void)0

	106

	107

	108 // Pattern ::

	109 // Disjunction

	110 RegExpTree* RegExpParser::ParsePattern() {

	111 RegExpTree* result = ParseDisjunction(CHECK_FAILED);

	112 DCHECK(!has_more());

	113 // If the result of parsing is a literal string atom, and it has the

	114 // same length as the input, then the atom is identical to the input.

	115 if (result->IsAtom() && result->AsAtom()->length() == in()->length()) {

	116 simple_ = true;

	117 }

	118 return result;

	119 }

	120

	121

	122 // Disjunction ::

	123 // Alternative

	124 // Alternative \| Disjunction

	125 // Alternative ::

	126 // [empty]

	127 // Term Alternative

	128 // Term ::

	129 // Assertion

	130 // Atom

	131 // Atom Quantifier

	132 RegExpTree* RegExpParser::ParseDisjunction() {

	133 // Used to store current state while parsing subexpressions.

	134 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

	135 zone());

	136 RegExpParserState* state = &initial_state;

	137 // Cache the builder in a local variable for quick access.

	138 RegExpBuilder* builder = initial_state.builder();

	139 while (true) {

	140 switch (current()) {

	141 case kEndMarker:

	142 if (state->IsSubexpression()) {

	143 // Inside a parenthesized group when hitting end of input.

	144 ReportError(CStrVector("Unterminated group") CHECK_FAILED);

	145 }

	146 DCHECK_EQ(INITIAL, state->group_type());

	147 // Parsing completed successfully.

	148 return builder->ToRegExp();

	149 case ')': {

	150 if (!state->IsSubexpression()) {

	151 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);

	152 }

	153 DCHECK_NE(INITIAL, state->group_type());

	154

	155 Advance();

	156 // End disjunction parsing and convert builder content to new single

	157 // regexp atom.

	158 RegExpTree* body = builder->ToRegExp();

	159

	160 int end_capture_index = captures_started();

	161

	162 int capture_index = state->capture_index();

	163 SubexpressionType group_type = state->group_type();

	164

	165 // Build result of subexpression.

	166 if (group_type == CAPTURE) {

	167 RegExpCapture* capture = GetCapture(capture_index);

	168 capture->set_body(body);

	169 body = capture;

	170 } else if (group_type != GROUPING) {

	171 DCHECK(group_type == POSITIVE_LOOKAROUND \|\|

	172 group_type == NEGATIVE_LOOKAROUND);

	173 bool is_positive = (group_type == POSITIVE_LOOKAROUND);

	174 body = new (zone()) RegExpLookaround(

	175 body, is_positive, end_capture_index - capture_index,

	176 capture_index, state->lookaround_type());

	177 }

	178

	179 // Restore previous state.

	180 state = state->previous_state();

	181 builder = state->builder();

	182

	183 builder->AddAtom(body);

	184 // For compatability with JSC and ES3, we allow quantifiers after

	185 // lookaheads, and break in all cases.

	186 break;

	187 }

	188 case '\|': {

	189 Advance();

	190 builder->NewAlternative();

	191 continue;

	192 }

	193 case '*':

	194 case '+':

	195 case '?':

	196 return ReportError(CStrVector("Nothing to repeat"));

	197 case '^': {

	198 Advance();

	199 if (multiline_) {

	200 builder->AddAssertion(

	201 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

	202 } else {

	203 builder->AddAssertion(

	204 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

	205 set_contains_anchor();

	206 }

	207 continue;

	208 }

	209 case '$': {

	210 Advance();

	211 RegExpAssertion::AssertionType assertion_type =

	212 multiline_ ? RegExpAssertion::END_OF_LINE

	213 : RegExpAssertion::END_OF_INPUT;

	214 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));

	215 continue;

	216 }

	217 case '.': {

	218 Advance();

	219 // everything except \x0a, \x0d, \u2028 and \u2029

	220 ZoneList<CharacterRange>* ranges =

	221 new (zone()) ZoneList<CharacterRange>(2, zone());

	222 CharacterRange::AddClassEscape('.', ranges, zone());

	223 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

	224 builder->AddAtom(atom);

	225 break;

	226 }

	227 case '(': {

	228 SubexpressionType subexpr_type = CAPTURE;

	229 RegExpLookaround::Type lookaround_type = state->lookaround_type();

	230 Advance();

	231 if (current() == '?') {

	232 switch (Next()) {

	233 case ':':

	234 subexpr_type = GROUPING;

	235 break;

	236 case '=':

	237 lookaround_type = RegExpLookaround::LOOKAHEAD;

	238 subexpr_type = POSITIVE_LOOKAROUND;

	239 break;

	240 case '!':

	241 lookaround_type = RegExpLookaround::LOOKAHEAD;

	242 subexpr_type = NEGATIVE_LOOKAROUND;

	243 break;

	244 case '<':

	245 if (FLAG_harmony_regexp_lookbehind) {

	246 Advance();

	247 lookaround_type = RegExpLookaround::LOOKBEHIND;

	248 if (Next() == '=') {

	249 subexpr_type = POSITIVE_LOOKAROUND;

	250 break;

	251 } else if (Next() == '!') {

	252 subexpr_type = NEGATIVE_LOOKAROUND;

	253 break;

	254 }

	255 }

	256 // Fall through.

	257 default:

	258 ReportError(CStrVector("Invalid group") CHECK_FAILED);

	259 break;

	260 }

	261 Advance(2);

	262 } else {

	263 if (captures_started_ >= kMaxCaptures) {

	264 ReportError(CStrVector("Too many captures") CHECK_FAILED);

	265 }

	266 captures_started_++;

	267 }

	268 // Store current state and begin new disjunction parsing.

	269 state = new (zone()) RegExpParserState(

	270 state, subexpr_type, lookaround_type, captures_started_, zone());

	271 builder = state->builder();

	272 continue;

	273 }

	274 case '[': {

	275 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

	276 builder->AddAtom(atom);

	277 break;

	278 }

	279 // Atom ::

	280 // \ AtomEscape

	281 case '\\':

	282 switch (Next()) {

	283 case kEndMarker:

	284 return ReportError(CStrVector("\\ at end of pattern"));

	285 case 'b':

	286 Advance(2);

	287 builder->AddAssertion(

	288 new (zone()) RegExpAssertion(RegExpAssertion::BOUNDARY));

	289 continue;

	290 case 'B':

	291 Advance(2);

	292 builder->AddAssertion(

	293 new (zone()) RegExpAssertion(RegExpAssertion::NON_BOUNDARY));

	294 continue;

	295 // AtomEscape ::

	296 // CharacterClassEscape

	297 //

	298 // CharacterClassEscape :: one of

	299 // d D s S w W

	300 case 'd':

	301 case 'D':

	302 case 's':

	303 case 'S':

	304 case 'w':

	305 case 'W': {

	306 uc32 c = Next();

	307 Advance(2);

	308 ZoneList<CharacterRange>* ranges =

	309 new (zone()) ZoneList<CharacterRange>(2, zone());

	310 CharacterRange::AddClassEscape(c, ranges, zone());

	311 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

	312 builder->AddAtom(atom);

	313 break;

	314 }

	315 case '1':

	316 case '2':

	317 case '3':

	318 case '4':

	319 case '5':

	320 case '6':

	321 case '7':

	322 case '8':

	323 case '9': {

	324 int index = 0;

	325 if (ParseBackReferenceIndex(&index)) {

	326 if (state->IsInsideCaptureGroup(index)) {

	327 // The back reference is inside the capture group it refers to.

	328 // Nothing can possibly have been captured yet, so we use empty

	329 // instead. This ensures that, when checking a back reference,

	330 // the capture registers of the referenced capture are either

	331 // both set or both cleared.

	332 builder->AddEmpty();

	333 } else {

	334 RegExpCapture* capture = GetCapture(index);

	335 RegExpTree* atom = new (zone()) RegExpBackReference(capture);

	336 builder->AddAtom(atom);

	337 }

	338 break;

	339 }

	340 uc32 first_digit = Next();

	341 if (first_digit == '8' \|\| first_digit == '9') {

	342 // If the 'u' flag is present, only syntax characters can be

	343 // escaped,

	344 // no other identity escapes are allowed. If the 'u' flag is not

	345 // present, all identity escapes are allowed.

	346 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

	347 builder->AddCharacter(first_digit);

	348 Advance(2);

	349 } else {

	350 return ReportError(CStrVector("Invalid escape"));

	351 }

	352 break;

	353 }

	354 }

	355 // FALLTHROUGH

	356 case '0': {

	357 Advance();

	358 uc32 octal = ParseOctalLiteral();

	359 builder->AddCharacter(octal);

	360 break;

	361 }

	362 // ControlEscape :: one of

	363 // f n r t v

	364 case 'f':

	365 Advance(2);

	366 builder->AddCharacter('\f');

	367 break;

	368 case 'n':

	369 Advance(2);

	370 builder->AddCharacter('\n');

	371 break;

	372 case 'r':

	373 Advance(2);

	374 builder->AddCharacter('\r');

	375 break;

	376 case 't':

	377 Advance(2);

	378 builder->AddCharacter('\t');

	379 break;

	380 case 'v':

	381 Advance(2);

	382 builder->AddCharacter('\v');

	383 break;

	384 case 'c': {

	385 Advance();

	386 uc32 controlLetter = Next();

	387 // Special case if it is an ASCII letter.

	388 // Convert lower case letters to uppercase.

	389 uc32 letter = controlLetter & ~('a' ^ 'A');

	390 if (letter < 'A' \|\| 'Z' < letter) {

	391 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.

	392 // This is outside the specification. We match JSC in

	393 // reading the backslash as a literal character instead

	394 // of as starting an escape.

	395 builder->AddCharacter('\\');

	396 } else {

	397 Advance(2);

	398 builder->AddCharacter(controlLetter & 0x1f);

	399 }

	400 break;

	401 }

	402 case 'x': {

	403 Advance(2);

	404 uc32 value;

	405 if (ParseHexEscape(2, &value)) {

	406 builder->AddCharacter(value);

	407 } else if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

	408 builder->AddCharacter('x');

	409 } else {

	410 // If the 'u' flag is present, invalid escapes are not treated as

	411 // identity escapes.

	412 return ReportError(CStrVector("Invalid escape"));

	413 }

	414 break;

	415 }

	416 case 'u': {

	417 Advance(2);

	418 uc32 value;

	419 if (ParseUnicodeEscape(&value)) {

	420 builder->AddCharacter(value);

	421 } else if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

	422 builder->AddCharacter('u');

	423 } else {

	424 // If the 'u' flag is present, invalid escapes are not treated as

	425 // identity escapes.

	426 return ReportError(CStrVector("Invalid unicode escape"));

	427 }

	428 break;

	429 }

	430 default:

	431 Advance();

	432 // If the 'u' flag is present, only syntax characters can be

	433 // escaped, no

	434 // other identity escapes are allowed. If the 'u' flag is not

	435 // present,

	436 // all identity escapes are allowed.

	437 if (!FLAG_harmony_unicode_regexps \|\| !unicode_ \|\|

	438 IsSyntaxCharacter(current())) {

	439 builder->AddCharacter(current());

	440 Advance();

	441 } else {

	442 return ReportError(CStrVector("Invalid escape"));

	443 }

	444 break;

	445 }

	446 break;

	447 case '{': {

	448 int dummy;

	449 if (ParseIntervalQuantifier(&dummy, &dummy)) {

	450 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

	451 }

	452 // fallthrough

	453 }

	454 default:

	455 builder->AddCharacter(current());

	456 Advance();

	457 break;

	458 } // end switch(current())

	459

	460 int min;

	461 int max;

	462 switch (current()) {

	463 // QuantifierPrefix ::

	464 // *

	465 // +

	466 // ?

	467 // {

	468 case '*':

	469 min = 0;

	470 max = RegExpTree::kInfinity;

	471 Advance();

	472 break;

	473 case '+':

	474 min = 1;

	475 max = RegExpTree::kInfinity;

	476 Advance();

	477 break;

	478 case '?':

	479 min = 0;

	480 max = 1;

	481 Advance();

	482 break;

	483 case '{':

	484 if (ParseIntervalQuantifier(&min, &max)) {

	485 if (max < min) {

	486 ReportError(CStrVector("numbers out of order in {} quantifier.")

	487 CHECK_FAILED);

	488 }

	489 break;

	490 } else {

	491 continue;

	492 }

	493 default:

	494 continue;

	495 }

	496 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;

	497 if (current() == '?') {

	498 quantifier_type = RegExpQuantifier::NON_GREEDY;

	499 Advance();

	500 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {

	501 // FLAG_regexp_possessive_quantifier is a debug-only flag.

	502 quantifier_type = RegExpQuantifier::POSSESSIVE;

	503 Advance();

	504 }

	505 builder->AddQuantifierToAtom(min, max, quantifier_type);

	506 }

	507 }

	508

	509

	510 #ifdef DEBUG

	511 // Currently only used in an DCHECK.

	512 static bool IsSpecialClassEscape(uc32 c) {

	513 switch (c) {

	514 case 'd':

	515 case 'D':

	516 case 's':

	517 case 'S':

	518 case 'w':

	519 case 'W':

	520 return true;

	521 default:

	522 return false;

	523 }

	524 }

	525 #endif

	526

	527

	528 // In order to know whether an escape is a backreference or not we have to scan

	529 // the entire regexp and find the number of capturing parentheses. However we

	530 // don't want to scan the regexp twice unless it is necessary. This mini-parser

	531 // is called when needed. It can see the difference between capturing and

	532 // noncapturing parentheses and can skip character classes and backslash-escaped

	533 // characters.

	534 void RegExpParser::ScanForCaptures() {

	535 // Start with captures started previous to current position

	536 int capture_count = captures_started();

	537 // Add count of captures after this position.

	538 int n;

	539 while ((n = current()) != kEndMarker) {

	540 Advance();

	541 switch (n) {

	542 case '\\':

	543 Advance();

	544 break;

	545 case '[': {

	546 int c;

	547 while ((c = current()) != kEndMarker) {

	548 Advance();

	549 if (c == '\\') {

	550 Advance();

	551 } else {

	552 if (c == ']') break;

	553 }

	554 }

	555 break;

	556 }

	557 case '(':

	558 if (current() != '?') capture_count++;

	559 break;

	560 }

	561 }

	562 capture_count_ = capture_count;

	563 is_scanned_for_captures_ = true;

	564 }

	565

	566

	567 bool RegExpParser::ParseBackReferenceIndex(int* index_out) {

	568 DCHECK_EQ('\\', current());

	569 DCHECK('1' <= Next() && Next() <= '9');

	570 // Try to parse a decimal literal that is no greater than the total number

	571 // of left capturing parentheses in the input.

	572 int start = position();

	573 int value = Next() - '0';

	574 Advance(2);

	575 while (true) {

	576 uc32 c = current();

	577 if (IsDecimalDigit(c)) {

	578 value = 10 * value + (c - '0');

	579 if (value > kMaxCaptures) {

	580 Reset(start);

	581 return false;

	582 }

	583 Advance();

	584 } else {

	585 break;

	586 }

	587 }

	588 if (value > captures_started()) {

	589 if (!is_scanned_for_captures_) {

	590 int saved_position = position();

	591 ScanForCaptures();

	592 Reset(saved_position);

	593 }

	594 if (value > capture_count_) {

	595 Reset(start);

	596 return false;

	597 }

	598 }

	599 *index_out = value;

	600 return true;

	601 }

	602

	603

	604 RegExpCapture* RegExpParser::GetCapture(int index) {

	605 // The index for the capture groups are one-based. Its index in the list is

	606 // zero-based.

	607 int know_captures =

	608 is_scanned_for_captures_ ? capture_count_ : captures_started_;

	609 DCHECK(index <= know_captures);

	610 if (captures_ == NULL) {

	611 captures_ = new (zone()) ZoneList<RegExpCapture*>(know_captures, zone());

	612 }

	613 while (captures_->length() < know_captures) {

	614 captures_->Add(new (zone()) RegExpCapture(captures_->length() + 1), zone());

	615 }

	616 return captures_->at(index - 1);

	617 }

	618

	619

	620 bool RegExpParser::RegExpParserState::IsInsideCaptureGroup(int index) {

	621 for (RegExpParserState* s = this; s != NULL; s = s->previous_state()) {

	622 if (s->group_type() != CAPTURE) continue;

	623 // Return true if we found the matching capture index.

	624 if (index == s->capture_index()) return true;

	625 // Abort if index is larger than what has been parsed up till this state.

	626 if (index > s->capture_index()) return false;

	627 }

	628 return false;

	629 }

	630

	631

	632 // QuantifierPrefix ::

	633 // { DecimalDigits }

	634 // { DecimalDigits , }

	635 // { DecimalDigits , DecimalDigits }

	636 //

	637 // Returns true if parsing succeeds, and set the min_out and max_out

	638 // values. Values are truncated to RegExpTree::kInfinity if they overflow.

	639 bool RegExpParser::ParseIntervalQuantifier(int* min_out, int* max_out) {

	640 DCHECK_EQ(current(), '{');

	641 int start = position();

	642 Advance();

	643 int min = 0;

	644 if (!IsDecimalDigit(current())) {

	645 Reset(start);

	646 return false;

	647 }

	648 while (IsDecimalDigit(current())) {

	649 int next = current() - '0';

	650 if (min > (RegExpTree::kInfinity - next) / 10) {

	651 // Overflow. Skip past remaining decimal digits and return -1.

	652 do {

	653 Advance();

	654 } while (IsDecimalDigit(current()));

	655 min = RegExpTree::kInfinity;

	656 break;

	657 }

	658 min = 10 * min + next;

	659 Advance();

	660 }

	661 int max = 0;

	662 if (current() == '}') {

	663 max = min;

	664 Advance();

	665 } else if (current() == ',') {

	666 Advance();

	667 if (current() == '}') {

	668 max = RegExpTree::kInfinity;

	669 Advance();

	670 } else {

	671 while (IsDecimalDigit(current())) {

	672 int next = current() - '0';

	673 if (max > (RegExpTree::kInfinity - next) / 10) {

	674 do {

	675 Advance();

	676 } while (IsDecimalDigit(current()));

	677 max = RegExpTree::kInfinity;

	678 break;

	679 }

	680 max = 10 * max + next;

	681 Advance();

	682 }

	683 if (current() != '}') {

	684 Reset(start);

	685 return false;

	686 }

	687 Advance();

	688 }

	689 } else {

	690 Reset(start);

	691 return false;

	692 }

	693 *min_out = min;

	694 *max_out = max;

	695 return true;

	696 }

	697

	698

	699 uc32 RegExpParser::ParseOctalLiteral() {

	700 DCHECK(('0' <= current() && current() <= '7') \|\| current() == kEndMarker);

	701 // For compatibility with some other browsers (not all), we parse

	702 // up to three octal digits with a value below 256.

	703 uc32 value = current() - '0';

	704 Advance();

	705 if ('0' <= current() && current() <= '7') {

	706 value = value * 8 + current() - '0';

	707 Advance();

	708 if (value < 32 && '0' <= current() && current() <= '7') {

	709 value = value * 8 + current() - '0';

	710 Advance();

	711 }

	712 }

	713 return value;

	714 }

	715

	716

	717 bool RegExpParser::ParseHexEscape(int length, uc32* value) {

	718 int start = position();

	719 uc32 val = 0;

	720 for (int i = 0; i < length; ++i) {

	721 uc32 c = current();

	722 int d = HexValue(c);

	723 if (d < 0) {

	724 Reset(start);

	725 return false;

	726 }

	727 val = val * 16 + d;

	728 Advance();

	729 }

	730 *value = val;

	731 return true;

	732 }

	733

	734

	735 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

	736 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

	737 // allowed). In the latter case, the number of hex digits between { } is

	738 // arbitrary. \ and u have already been read.

	739 if (current() == '{' && FLAG_harmony_unicode_regexps && unicode_) {

	740 int start = position();

	741 Advance();

	742 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

	743 if (current() == '}') {

	744 Advance();

	745 return true;

	746 }

	747 }

	748 Reset(start);

	749 return false;

	750 }

	751 // \u but no {, or \u{...} escapes not allowed.

	752 return ParseHexEscape(4, value);

	753 }

	754

	755

	756 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

	757 uc32 x = 0;

	758 int d = HexValue(current());

	759 if (d < 0) {

	760 return false;

	761 }

	762 while (d >= 0) {

	763 x = x * 16 + d;

	764 if (x > max_value) {

	765 return false;

	766 }

	767 Advance();

	768 d = HexValue(current());

	769 }

	770 *value = x;

	771 return true;

	772 }

	773

	774

	775 uc32 RegExpParser::ParseClassCharacterEscape() {

	776 DCHECK(current() == '\\');

	777 DCHECK(has_next() && !IsSpecialClassEscape(Next()));

	778 Advance();

	779 switch (current()) {

	780 case 'b':

	781 Advance();

	782 return '\b';

	783 // ControlEscape :: one of

	784 // f n r t v

	785 case 'f':

	786 Advance();

	787 return '\f';

	788 case 'n':

	789 Advance();

	790 return '\n';

	791 case 'r':

	792 Advance();

	793 return '\r';

	794 case 't':

	795 Advance();

	796 return '\t';

	797 case 'v':

	798 Advance();

	799 return '\v';

	800 case 'c': {

	801 uc32 controlLetter = Next();

	802 uc32 letter = controlLetter & ~('A' ^ 'a');

	803 // For compatibility with JSC, inside a character class

	804 // we also accept digits and underscore as control characters.

	805 if ((controlLetter >= '0' && controlLetter <= '9') \|\|

	806 controlLetter == '_' \|\| (letter >= 'A' && letter <= 'Z')) {

	807 Advance(2);

	808 // Control letters mapped to ASCII control characters in the range

	809 // 0x00-0x1f.

	810 return controlLetter & 0x1f;

	811 }

	812 // We match JSC in reading the backslash as a literal

	813 // character instead of as starting an escape.

	814 return '\\';

	815 }

	816 case '0':

	817 case '1':

	818 case '2':

	819 case '3':

	820 case '4':

	821 case '5':

	822 case '6':

	823 case '7':

	824 // For compatibility, we interpret a decimal escape that isn't

	825 // a back reference (and therefore either \0 or not valid according

	826 // to the specification) as a 1..3 digit octal character code.

	827 return ParseOctalLiteral();

	828 case 'x': {

	829 Advance();

	830 uc32 value;

	831 if (ParseHexEscape(2, &value)) {

	832 return value;

	833 }

	834 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

	835 // If \x is not followed by a two-digit hexadecimal, treat it

	836 // as an identity escape.

	837 return 'x';

	838 }

	839 // If the 'u' flag is present, invalid escapes are not treated as

	840 // identity escapes.

	841 ReportError(CStrVector("Invalid escape"));

	842 return 0;

	843 }

	844 case 'u': {

	845 Advance();

	846 uc32 value;

	847 if (ParseUnicodeEscape(&value)) {

	848 return value;

	849 }

	850 if (!FLAG_harmony_unicode_regexps \|\| !unicode_) {

	851 return 'u';

	852 }

	853 // If the 'u' flag is present, invalid escapes are not treated as

	854 // identity escapes.

	855 ReportError(CStrVector("Invalid unicode escape"));

	856 return 0;

	857 }

	858 default: {

	859 uc32 result = current();

	860 // If the 'u' flag is present, only syntax characters can be escaped, no

	861 // other identity escapes are allowed. If the 'u' flag is not present, all

	862 // identity escapes are allowed.

	863 if (!FLAG_harmony_unicode_regexps \|\| !unicode_ \|\|

	864 IsSyntaxCharacter(result)) {

	865 Advance();

	866 return result;

	867 }

	868 ReportError(CStrVector("Invalid escape"));

	869 return 0;

	870 }

	871 }

	872 return 0;

	873 }

	874

	875

	876 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

	877 DCHECK_EQ(0, *char_class);

	878 uc32 first = current();

	879 if (first == '\\') {

	880 switch (Next()) {

	881 case 'w':

	882 case 'W':

	883 case 'd':

	884 case 'D':

	885 case 's':

	886 case 'S': {

	887 *char_class = Next();

	888 Advance(2);

	889 return CharacterRange::Singleton(0); // Return dummy value.

	890 }

	891 case kEndMarker:

	892 return ReportError(CStrVector("\\ at end of pattern"));

	893 default:

	894 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

	895 return CharacterRange::Singleton(c);

	896 }

	897 } else {

	898 Advance();

	899 return CharacterRange::Singleton(first);

	900 }

	901 }

	902

	903

	904 static const uc16 kNoCharClass = 0;

	905

	906 // Adds range or pre-defined character class to character ranges.

	907 // If char_class is not kInvalidClass, it's interpreted as a class

	908 // escape (i.e., 's' means whitespace, from '\s').

	909 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

	910 uc16 char_class, CharacterRange range,

	911 Zone* zone) {

	912 if (char_class != kNoCharClass) {

	913 CharacterRange::AddClassEscape(char_class, ranges, zone);

	914 } else {

	915 ranges->Add(range, zone);

	916 }

	917 }

	918

	919

	920 RegExpTree* RegExpParser::ParseCharacterClass() {

	921 static const char* kUnterminated = "Unterminated character class";

	922 static const char* kRangeOutOfOrder = "Range out of order in character class";

	923

	924 DCHECK_EQ(current(), '[');

	925 Advance();

	926 bool is_negated = false;

	927 if (current() == '^') {

	928 is_negated = true;

	929 Advance();

	930 }

	931 ZoneList<CharacterRange>* ranges =

	932 new (zone()) ZoneList<CharacterRange>(2, zone());

	933 while (has_more() && current() != ']') {

	934 uc16 char_class = kNoCharClass;

	935 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);

	936 if (current() == '-') {

	937 Advance();

	938 if (current() == kEndMarker) {

	939 // If we reach the end we break out of the loop and let the

	940 // following code report an error.

	941 break;

	942 } else if (current() == ']') {

	943 AddRangeOrEscape(ranges, char_class, first, zone());

	944 ranges->Add(CharacterRange::Singleton('-'), zone());

	945 break;

	946 }

	947 uc16 char_class_2 = kNoCharClass;

	948 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);

	949 if (char_class != kNoCharClass \|\| char_class_2 != kNoCharClass) {

	950 // Either end is an escaped character class. Treat the '-' verbatim.

	951 AddRangeOrEscape(ranges, char_class, first, zone());

	952 ranges->Add(CharacterRange::Singleton('-'), zone());

	953 AddRangeOrEscape(ranges, char_class_2, next, zone());

	954 continue;

	955 }

	956 if (first.from() > next.to()) {

	957 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);

	958 }

	959 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());

	960 } else {

	961 AddRangeOrEscape(ranges, char_class, first, zone());

	962 }

	963 }

	964 if (!has_more()) {

	965 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);

	966 }

	967 Advance();

	968 if (ranges->length() == 0) {

	969 ranges->Add(CharacterRange::Everything(), zone());

	970 is_negated = !is_negated;

	971 }

	972 return new (zone()) RegExpCharacterClass(ranges, is_negated);

	973 }

	974

	975

	976 #undef CHECK_FAILED

	977

	978

	979 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

	980 FlatStringReader* input, bool multiline,

	981 bool unicode, RegExpCompileData* result) {

	982 DCHECK(result != NULL);

	983 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

	984 RegExpTree* tree = parser.ParsePattern();

	985 if (parser.failed()) {

	986 DCHECK(tree == NULL);

	987 DCHECK(!result->error.is_null());

	988 } else {

	989 DCHECK(tree != NULL);

	990 DCHECK(result->error.is_null());

	991 result->tree = tree;

	992 int capture_count = parser.captures_started();

	993 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

	994 result->contains_anchor = parser.contains_anchor();

	995 result->capture_count = capture_count;

	996 }

	997 return !parser.failed();

	998 }

	999

	1000

	1001 RegExpBuilder::RegExpBuilder(Zone* zone)

	1002 : zone_(zone),

	1003 pending_empty_(false),

	1004 characters_(NULL),

	1005 terms_(),

	1006 alternatives_()

	1007 #ifdef DEBUG

	1008 ,

	1009 last_added_(ADD_NONE)

	1010 #endif

	1011 {

	1012 }

	1013

	1014

	1015 void RegExpBuilder::FlushCharacters() {

	1016 pending_empty_ = false;

	1017 if (characters_ != NULL) {

	1018 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

	1019 characters_ = NULL;

	1020 text_.Add(atom, zone());

	1021 LAST(ADD_ATOM);

	1022 }

	1023 }

	1024

	1025

	1026 void RegExpBuilder::FlushText() {

	1027 FlushCharacters();

	1028 int num_text = text_.length();

	1029 if (num_text == 0) {

	1030 return;

	1031 } else if (num_text == 1) {

	1032 terms_.Add(text_.last(), zone());

	1033 } else {

	1034 RegExpText* text = new (zone()) RegExpText(zone());

	1035 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());

	1036 terms_.Add(text, zone());

	1037 }

	1038 text_.Clear();

	1039 }

	1040

	1041

	1042 void RegExpBuilder::AddCharacter(uc16 c) {

	1043 pending_empty_ = false;

	1044 if (characters_ == NULL) {

	1045 characters_ = new (zone()) ZoneList<uc16>(4, zone());

	1046 }

	1047 characters_->Add(c, zone());

	1048 LAST(ADD_CHAR);

	1049 }

	1050

	1051

	1052 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

	1053

	1054

	1055 void RegExpBuilder::AddAtom(RegExpTree* term) {

	1056 if (term->IsEmpty()) {

	1057 AddEmpty();

	1058 return;

	1059 }

	1060 if (term->IsTextElement()) {

	1061 FlushCharacters();

	1062 text_.Add(term, zone());

	1063 } else {

	1064 FlushText();

	1065 terms_.Add(term, zone());

	1066 }

	1067 LAST(ADD_ATOM);

	1068 }

	1069

	1070

	1071 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

	1072 FlushText();

	1073 terms_.Add(assert, zone());

	1074 LAST(ADD_ASSERT);

	1075 }

	1076

	1077

	1078 void RegExpBuilder::NewAlternative() { FlushTerms(); }

	1079

	1080

	1081 void RegExpBuilder::FlushTerms() {

	1082 FlushText();

	1083 int num_terms = terms_.length();

	1084 RegExpTree* alternative;

	1085 if (num_terms == 0) {

	1086 alternative = new (zone()) RegExpEmpty();

	1087 } else if (num_terms == 1) {

	1088 alternative = terms_.last();

	1089 } else {

	1090 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));

	1091 }

	1092 alternatives_.Add(alternative, zone());

	1093 terms_.Clear();

	1094 LAST(ADD_NONE);

	1095 }

	1096

	1097

	1098 RegExpTree* RegExpBuilder::ToRegExp() {

	1099 FlushTerms();

	1100 int num_alternatives = alternatives_.length();

	1101 if (num_alternatives == 0) return new (zone()) RegExpEmpty();

	1102 if (num_alternatives == 1) return alternatives_.last();

	1103 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));

	1104 }

	1105

	1106

	1107 void RegExpBuilder::AddQuantifierToAtom(

	1108 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

	1109 if (pending_empty_) {

	1110 pending_empty_ = false;

	1111 return;

	1112 }

	1113 RegExpTree* atom;

	1114 if (characters_ != NULL) {

	1115 DCHECK(last_added_ == ADD_CHAR);

	1116 // Last atom was character.

	1117 Vector<const uc16> char_vector = characters_->ToConstVector();

	1118 int num_chars = char_vector.length();

	1119 if (num_chars > 1) {

	1120 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);

	1121 text_.Add(new (zone()) RegExpAtom(prefix), zone());

	1122 char_vector = char_vector.SubVector(num_chars - 1, num_chars);

	1123 }

	1124 characters_ = NULL;

	1125 atom = new (zone()) RegExpAtom(char_vector);

	1126 FlushText();

	1127 } else if (text_.length() > 0) {

	1128 DCHECK(last_added_ == ADD_ATOM);

	1129 atom = text_.RemoveLast();

	1130 FlushText();

	1131 } else if (terms_.length() > 0) {

	1132 DCHECK(last_added_ == ADD_ATOM);

	1133 atom = terms_.RemoveLast();

	1134 if (atom->max_match() == 0) {

	1135 // Guaranteed to only match an empty string.

	1136 LAST(ADD_TERM);

	1137 if (min == 0) {

	1138 return;

	1139 }

	1140 terms_.Add(atom, zone());

	1141 return;

	1142 }

	1143 } else {

	1144 // Only call immediately after adding an atom or character!

	1145 UNREACHABLE();

	1146 return;

	1147 }

	1148 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

	1149 zone());

	1150 LAST(ADD_TERM);

	1151 }

	1152

	1153 } // namespace internal

	1154 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | src/utils.h » ('j') | no next file with comments »