| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| 11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
| 12 #include "src/utils.h" | 12 #include "src/utils.h" |
| 13 | 13 |
| 14 namespace v8 { | 14 namespace v8 { |
| 15 namespace internal { | 15 namespace internal { |
| 16 | 16 |
| 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
| 18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) | 18 bool multiline, bool unicode, Isolate* isolate, |
| 19 Zone* zone) |
| 19 : isolate_(isolate), | 20 : isolate_(isolate), |
| 20 zone_(zone), | 21 zone_(zone), |
| 21 error_(error), | 22 error_(error), |
| 22 captures_(NULL), | 23 captures_(NULL), |
| 23 in_(in), | 24 in_(in), |
| 24 current_(kEndMarker), | 25 current_(kEndMarker), |
| 25 flags_(flags), | |
| 26 next_pos_(0), | 26 next_pos_(0), |
| 27 captures_started_(0), | 27 captures_started_(0), |
| 28 capture_count_(0), | 28 capture_count_(0), |
| 29 has_more_(true), | 29 has_more_(true), |
| 30 multiline_(multiline), |
| 31 unicode_(unicode), |
| 30 simple_(false), | 32 simple_(false), |
| 31 contains_anchor_(false), | 33 contains_anchor_(false), |
| 32 is_scanned_for_captures_(false), | 34 is_scanned_for_captures_(false), |
| 33 failed_(false) { | 35 failed_(false) { |
| 34 Advance(); | 36 Advance(); |
| 35 } | 37 } |
| 36 | 38 |
| 37 | 39 |
| 38 template <bool update_position> | |
| 39 uc32 RegExpParser::ReadNext() { | |
| 40 int position = next_pos_; | |
| 41 uc32 c0 = in()->Get(position); | |
| 42 position++; | |
| 43 // Read the whole surrogate pair in case of unicode flag, if possible. | |
| 44 if (unicode() && position < in()->length() && | |
| 45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | |
| 46 uc16 c1 = in()->Get(position); | |
| 47 if (unibrow::Utf16::IsTrailSurrogate(c1)) { | |
| 48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | |
| 49 position++; | |
| 50 } | |
| 51 } | |
| 52 if (update_position) next_pos_ = position; | |
| 53 return c0; | |
| 54 } | |
| 55 | |
| 56 | |
| 57 uc32 RegExpParser::Next() { | 40 uc32 RegExpParser::Next() { |
| 58 if (has_next()) { | 41 if (has_next()) { |
| 59 return ReadNext<false>(); | 42 return in()->Get(next_pos_); |
| 60 } else { | 43 } else { |
| 61 return kEndMarker; | 44 return kEndMarker; |
| 62 } | 45 } |
| 63 } | 46 } |
| 64 | 47 |
| 65 | 48 |
| 66 void RegExpParser::Advance() { | 49 void RegExpParser::Advance() { |
| 67 if (has_next()) { | 50 if (next_pos_ < in()->length()) { |
| 68 StackLimitCheck check(isolate()); | 51 StackLimitCheck check(isolate()); |
| 69 if (check.HasOverflowed()) { | 52 if (check.HasOverflowed()) { |
| 70 ReportError(CStrVector(Isolate::kStackOverflowMessage)); | 53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
| 71 } else if (zone()->excess_allocation()) { | 54 } else if (zone()->excess_allocation()) { |
| 72 ReportError(CStrVector("Regular expression too large")); | 55 ReportError(CStrVector("Regular expression too large")); |
| 73 } else { | 56 } else { |
| 74 current_ = ReadNext<true>(); | 57 current_ = in()->Get(next_pos_); |
| 58 next_pos_++; |
| 59 // Read the whole surrogate pair in case of unicode flag, if possible. |
| 60 if (unicode_ && next_pos_ < in()->length() && |
| 61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { |
| 62 uc16 trail = in()->Get(next_pos_); |
| 63 if (unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 64 current_ = unibrow::Utf16::CombineSurrogatePair( |
| 65 static_cast<uc16>(current_), trail); |
| 66 next_pos_++; |
| 67 } |
| 68 } |
| 75 } | 69 } |
| 76 } else { | 70 } else { |
| 77 current_ = kEndMarker; | 71 current_ = kEndMarker; |
| 78 // Advance so that position() points to 1-after-the-last-character. This is | 72 // Advance so that position() points to 1-after-the-last-character. This is |
| 79 // important so that Reset() to this position works correctly. | 73 // important so that Reset() to this position works correctly. |
| 80 next_pos_ = in()->length() + 1; | 74 next_pos_ = in()->length() + 1; |
| 81 has_more_ = false; | 75 has_more_ = false; |
| 82 } | 76 } |
| 83 } | 77 } |
| 84 | 78 |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 141 // Alternative :: | 135 // Alternative :: |
| 142 // [empty] | 136 // [empty] |
| 143 // Term Alternative | 137 // Term Alternative |
| 144 // Term :: | 138 // Term :: |
| 145 // Assertion | 139 // Assertion |
| 146 // Atom | 140 // Atom |
| 147 // Atom Quantifier | 141 // Atom Quantifier |
| 148 RegExpTree* RegExpParser::ParseDisjunction() { | 142 RegExpTree* RegExpParser::ParseDisjunction() { |
| 149 // Used to store current state while parsing subexpressions. | 143 // Used to store current state while parsing subexpressions. |
| 150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
| 151 flags_, zone()); | 145 zone()); |
| 152 RegExpParserState* state = &initial_state; | 146 RegExpParserState* state = &initial_state; |
| 153 // Cache the builder in a local variable for quick access. | 147 // Cache the builder in a local variable for quick access. |
| 154 RegExpBuilder* builder = initial_state.builder(); | 148 RegExpBuilder* builder = initial_state.builder(); |
| 155 while (true) { | 149 while (true) { |
| 156 switch (current()) { | 150 switch (current()) { |
| 157 case kEndMarker: | 151 case kEndMarker: |
| 158 if (state->IsSubexpression()) { | 152 if (state->IsSubexpression()) { |
| 159 // Inside a parenthesized group when hitting end of input. | 153 // Inside a parenthesized group when hitting end of input. |
| 160 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | 154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
| 161 } | 155 } |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 205 Advance(); | 199 Advance(); |
| 206 builder->NewAlternative(); | 200 builder->NewAlternative(); |
| 207 continue; | 201 continue; |
| 208 } | 202 } |
| 209 case '*': | 203 case '*': |
| 210 case '+': | 204 case '+': |
| 211 case '?': | 205 case '?': |
| 212 return ReportError(CStrVector("Nothing to repeat")); | 206 return ReportError(CStrVector("Nothing to repeat")); |
| 213 case '^': { | 207 case '^': { |
| 214 Advance(); | 208 Advance(); |
| 215 if (multiline()) { | 209 if (multiline_) { |
| 216 builder->AddAssertion( | 210 builder->AddAssertion( |
| 217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | 211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
| 218 } else { | 212 } else { |
| 219 builder->AddAssertion( | 213 builder->AddAssertion( |
| 220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); | 214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
| 221 set_contains_anchor(); | 215 set_contains_anchor(); |
| 222 } | 216 } |
| 223 continue; | 217 continue; |
| 224 } | 218 } |
| 225 case '$': { | 219 case '$': { |
| 226 Advance(); | 220 Advance(); |
| 227 RegExpAssertion::AssertionType assertion_type = | 221 RegExpAssertion::AssertionType assertion_type = |
| 228 multiline() ? RegExpAssertion::END_OF_LINE | 222 multiline_ ? RegExpAssertion::END_OF_LINE |
| 229 : RegExpAssertion::END_OF_INPUT; | 223 : RegExpAssertion::END_OF_INPUT; |
| 230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); | 224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
| 231 continue; | 225 continue; |
| 232 } | 226 } |
| 233 case '.': { | 227 case '.': { |
| 234 Advance(); | 228 Advance(); |
| 235 // everything except \x0a, \x0d, \u2028 and \u2029 | 229 // everything except \x0a, \x0d, \u2028 and \u2029 |
| 236 ZoneList<CharacterRange>* ranges = | 230 ZoneList<CharacterRange>* ranges = |
| 237 new (zone()) ZoneList<CharacterRange>(2, zone()); | 231 new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 238 CharacterRange::AddClassEscape('.', ranges, zone()); | 232 CharacterRange::AddClassEscape('.', ranges, zone()); |
| 239 RegExpCharacterClass* cc = | 233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
| 240 new (zone()) RegExpCharacterClass(ranges, false); | 234 builder->AddAtom(atom); |
| 241 builder->AddCharacterClass(cc); | |
| 242 break; | 235 break; |
| 243 } | 236 } |
| 244 case '(': { | 237 case '(': { |
| 245 SubexpressionType subexpr_type = CAPTURE; | 238 SubexpressionType subexpr_type = CAPTURE; |
| 246 RegExpLookaround::Type lookaround_type = state->lookaround_type(); | 239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| 247 Advance(); | 240 Advance(); |
| 248 if (current() == '?') { | 241 if (current() == '?') { |
| 249 switch (Next()) { | 242 switch (Next()) { |
| 250 case ':': | 243 case ':': |
| 251 subexpr_type = GROUPING; | 244 subexpr_type = GROUPING; |
| (...skipping 24 matching lines...) Expand all Loading... |
| 276 break; | 269 break; |
| 277 } | 270 } |
| 278 Advance(2); | 271 Advance(2); |
| 279 } else { | 272 } else { |
| 280 if (captures_started_ >= kMaxCaptures) { | 273 if (captures_started_ >= kMaxCaptures) { |
| 281 ReportError(CStrVector("Too many captures") CHECK_FAILED); | 274 ReportError(CStrVector("Too many captures") CHECK_FAILED); |
| 282 } | 275 } |
| 283 captures_started_++; | 276 captures_started_++; |
| 284 } | 277 } |
| 285 // Store current state and begin new disjunction parsing. | 278 // Store current state and begin new disjunction parsing. |
| 286 state = | 279 state = new (zone()) RegExpParserState( |
| 287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, | 280 state, subexpr_type, lookaround_type, captures_started_, zone()); |
| 288 captures_started_, flags_, zone()); | |
| 289 builder = state->builder(); | 281 builder = state->builder(); |
| 290 continue; | 282 continue; |
| 291 } | 283 } |
| 292 case '[': { | 284 case '[': { |
| 293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); | 285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); |
| 294 builder->AddCharacterClass(cc->AsCharacterClass()); | 286 builder->AddAtom(atom); |
| 295 break; | 287 break; |
| 296 } | 288 } |
| 297 // Atom :: | 289 // Atom :: |
| 298 // \ AtomEscape | 290 // \ AtomEscape |
| 299 case '\\': | 291 case '\\': |
| 300 switch (Next()) { | 292 switch (Next()) { |
| 301 case kEndMarker: | 293 case kEndMarker: |
| 302 return ReportError(CStrVector("\\ at end of pattern")); | 294 return ReportError(CStrVector("\\ at end of pattern")); |
| 303 case 'b': | 295 case 'b': |
| 304 Advance(2); | 296 Advance(2); |
| (...skipping 14 matching lines...) Expand all Loading... |
| 319 case 'D': | 311 case 'D': |
| 320 case 's': | 312 case 's': |
| 321 case 'S': | 313 case 'S': |
| 322 case 'w': | 314 case 'w': |
| 323 case 'W': { | 315 case 'W': { |
| 324 uc32 c = Next(); | 316 uc32 c = Next(); |
| 325 Advance(2); | 317 Advance(2); |
| 326 ZoneList<CharacterRange>* ranges = | 318 ZoneList<CharacterRange>* ranges = |
| 327 new (zone()) ZoneList<CharacterRange>(2, zone()); | 319 new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 328 CharacterRange::AddClassEscape(c, ranges, zone()); | 320 CharacterRange::AddClassEscape(c, ranges, zone()); |
| 329 RegExpCharacterClass* cc = | 321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
| 330 new (zone()) RegExpCharacterClass(ranges, false); | 322 builder->AddAtom(atom); |
| 331 builder->AddCharacterClass(cc); | |
| 332 break; | 323 break; |
| 333 } | 324 } |
| 334 case '1': | 325 case '1': |
| 335 case '2': | 326 case '2': |
| 336 case '3': | 327 case '3': |
| 337 case '4': | 328 case '4': |
| 338 case '5': | 329 case '5': |
| 339 case '6': | 330 case '6': |
| 340 case '7': | 331 case '7': |
| 341 case '8': | 332 case '8': |
| (...skipping 13 matching lines...) Expand all Loading... |
| 355 builder->AddAtom(atom); | 346 builder->AddAtom(atom); |
| 356 } | 347 } |
| 357 break; | 348 break; |
| 358 } | 349 } |
| 359 uc32 first_digit = Next(); | 350 uc32 first_digit = Next(); |
| 360 if (first_digit == '8' || first_digit == '9') { | 351 if (first_digit == '8' || first_digit == '9') { |
| 361 // If the 'u' flag is present, only syntax characters can be | 352 // If the 'u' flag is present, only syntax characters can be |
| 362 // escaped, | 353 // escaped, |
| 363 // no other identity escapes are allowed. If the 'u' flag is not | 354 // no other identity escapes are allowed. If the 'u' flag is not |
| 364 // present, all identity escapes are allowed. | 355 // present, all identity escapes are allowed. |
| 365 if (!unicode()) { | 356 if (!unicode_) { |
| 366 builder->AddCharacter(first_digit); | 357 builder->AddCharacter(first_digit); |
| 367 Advance(2); | 358 Advance(2); |
| 368 } else { | 359 } else { |
| 369 return ReportError(CStrVector("Invalid escape")); | 360 return ReportError(CStrVector("Invalid escape")); |
| 370 } | 361 } |
| 371 break; | 362 break; |
| 372 } | 363 } |
| 373 } | 364 } |
| 374 // FALLTHROUGH | 365 // FALLTHROUGH |
| 375 case '0': { | 366 case '0': { |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 416 Advance(2); | 407 Advance(2); |
| 417 builder->AddCharacter(controlLetter & 0x1f); | 408 builder->AddCharacter(controlLetter & 0x1f); |
| 418 } | 409 } |
| 419 break; | 410 break; |
| 420 } | 411 } |
| 421 case 'x': { | 412 case 'x': { |
| 422 Advance(2); | 413 Advance(2); |
| 423 uc32 value; | 414 uc32 value; |
| 424 if (ParseHexEscape(2, &value)) { | 415 if (ParseHexEscape(2, &value)) { |
| 425 builder->AddCharacter(value); | 416 builder->AddCharacter(value); |
| 426 } else if (!unicode()) { | 417 } else if (!unicode_) { |
| 427 builder->AddCharacter('x'); | 418 builder->AddCharacter('x'); |
| 428 } else { | 419 } else { |
| 429 // If the 'u' flag is present, invalid escapes are not treated as | 420 // If the 'u' flag is present, invalid escapes are not treated as |
| 430 // identity escapes. | 421 // identity escapes. |
| 431 return ReportError(CStrVector("Invalid escape")); | 422 return ReportError(CStrVector("Invalid escape")); |
| 432 } | 423 } |
| 433 break; | 424 break; |
| 434 } | 425 } |
| 435 case 'u': { | 426 case 'u': { |
| 436 Advance(2); | 427 Advance(2); |
| 437 uc32 value; | 428 uc32 value; |
| 438 if (ParseUnicodeEscape(&value)) { | 429 if (ParseUnicodeEscape(&value)) { |
| 439 builder->AddUnicodeCharacter(value); | 430 builder->AddUnicodeCharacter(value); |
| 440 } else if (!unicode()) { | 431 } else if (!unicode_) { |
| 441 builder->AddCharacter('u'); | 432 builder->AddCharacter('u'); |
| 442 } else { | 433 } else { |
| 443 // If the 'u' flag is present, invalid escapes are not treated as | 434 // If the 'u' flag is present, invalid escapes are not treated as |
| 444 // identity escapes. | 435 // identity escapes. |
| 445 return ReportError(CStrVector("Invalid unicode escape")); | 436 return ReportError(CStrVector("Invalid unicode escape")); |
| 446 } | 437 } |
| 447 break; | 438 break; |
| 448 } | 439 } |
| 449 default: | 440 default: |
| 450 Advance(); | 441 Advance(); |
| 451 // If the 'u' flag is present, only syntax characters can be | 442 // If the 'u' flag is present, only syntax characters can be |
| 452 // escaped, no | 443 // escaped, no |
| 453 // other identity escapes are allowed. If the 'u' flag is not | 444 // other identity escapes are allowed. If the 'u' flag is not |
| 454 // present, | 445 // present, |
| 455 // all identity escapes are allowed. | 446 // all identity escapes are allowed. |
| 456 if (!unicode() || IsSyntaxCharacter(current())) { | 447 if (!unicode_ || IsSyntaxCharacter(current())) { |
| 457 builder->AddCharacter(current()); | 448 builder->AddCharacter(current()); |
| 458 Advance(); | 449 Advance(); |
| 459 } else { | 450 } else { |
| 460 return ReportError(CStrVector("Invalid escape")); | 451 return ReportError(CStrVector("Invalid escape")); |
| 461 } | 452 } |
| 462 break; | 453 break; |
| 463 } | 454 } |
| 464 break; | 455 break; |
| 465 case '{': { | 456 case '{': { |
| 466 int dummy; | 457 int dummy; |
| (...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 747 } | 738 } |
| 748 *value = val; | 739 *value = val; |
| 749 return true; | 740 return true; |
| 750 } | 741 } |
| 751 | 742 |
| 752 | 743 |
| 753 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 744 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 755 // allowed). In the latter case, the number of hex digits between { } is | 746 // allowed). In the latter case, the number of hex digits between { } is |
| 756 // arbitrary. \ and u have already been read. | 747 // arbitrary. \ and u have already been read. |
| 757 if (current() == '{' && unicode()) { | 748 if (current() == '{' && unicode_) { |
| 758 int start = position(); | 749 int start = position(); |
| 759 Advance(); | 750 Advance(); |
| 760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 761 if (current() == '}') { | 752 if (current() == '}') { |
| 762 Advance(); | 753 Advance(); |
| 763 return true; | 754 return true; |
| 764 } | 755 } |
| 765 } | 756 } |
| 766 Reset(start); | 757 Reset(start); |
| 767 return false; | 758 return false; |
| (...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 842 // For compatibility, we interpret a decimal escape that isn't | 833 // For compatibility, we interpret a decimal escape that isn't |
| 843 // a back reference (and therefore either \0 or not valid according | 834 // a back reference (and therefore either \0 or not valid according |
| 844 // to the specification) as a 1..3 digit octal character code. | 835 // to the specification) as a 1..3 digit octal character code. |
| 845 return ParseOctalLiteral(); | 836 return ParseOctalLiteral(); |
| 846 case 'x': { | 837 case 'x': { |
| 847 Advance(); | 838 Advance(); |
| 848 uc32 value; | 839 uc32 value; |
| 849 if (ParseHexEscape(2, &value)) { | 840 if (ParseHexEscape(2, &value)) { |
| 850 return value; | 841 return value; |
| 851 } | 842 } |
| 852 if (!unicode()) { | 843 if (!unicode_) { |
| 853 // If \x is not followed by a two-digit hexadecimal, treat it | 844 // If \x is not followed by a two-digit hexadecimal, treat it |
| 854 // as an identity escape. | 845 // as an identity escape. |
| 855 return 'x'; | 846 return 'x'; |
| 856 } | 847 } |
| 857 // If the 'u' flag is present, invalid escapes are not treated as | 848 // If the 'u' flag is present, invalid escapes are not treated as |
| 858 // identity escapes. | 849 // identity escapes. |
| 859 ReportError(CStrVector("Invalid escape")); | 850 ReportError(CStrVector("Invalid escape")); |
| 860 return 0; | 851 return 0; |
| 861 } | 852 } |
| 862 case 'u': { | 853 case 'u': { |
| 863 Advance(); | 854 Advance(); |
| 864 uc32 value; | 855 uc32 value; |
| 865 if (ParseUnicodeEscape(&value)) { | 856 if (ParseUnicodeEscape(&value)) { |
| 866 return value; | 857 return value; |
| 867 } | 858 } |
| 868 if (!unicode()) { | 859 if (!unicode_) { |
| 869 return 'u'; | 860 return 'u'; |
| 870 } | 861 } |
| 871 // If the 'u' flag is present, invalid escapes are not treated as | 862 // If the 'u' flag is present, invalid escapes are not treated as |
| 872 // identity escapes. | 863 // identity escapes. |
| 873 ReportError(CStrVector("Invalid unicode escape")); | 864 ReportError(CStrVector("Invalid unicode escape")); |
| 874 return 0; | 865 return 0; |
| 875 } | 866 } |
| 876 default: { | 867 default: { |
| 877 uc32 result = current(); | 868 uc32 result = current(); |
| 878 // If the 'u' flag is present, only syntax characters can be escaped, no | 869 // If the 'u' flag is present, only syntax characters can be escaped, no |
| 879 // other identity escapes are allowed. If the 'u' flag is not present, all | 870 // other identity escapes are allowed. If the 'u' flag is not present, all |
| 880 // identity escapes are allowed. | 871 // identity escapes are allowed. |
| 881 if (!unicode() || IsSyntaxCharacter(result)) { | 872 if (!unicode_ || IsSyntaxCharacter(result)) { |
| 882 Advance(); | 873 Advance(); |
| 883 return result; | 874 return result; |
| 884 } | 875 } |
| 885 ReportError(CStrVector("Invalid escape")); | 876 ReportError(CStrVector("Invalid escape")); |
| 886 return 0; | 877 return 0; |
| 887 } | 878 } |
| 888 } | 879 } |
| 889 return 0; | 880 return 0; |
| 890 } | 881 } |
| 891 | 882 |
| 892 | 883 |
| 893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | 884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
| 894 DCHECK_EQ(0, *char_class); | 885 DCHECK_EQ(0, *char_class); |
| 895 uc32 first = current(); | 886 uc32 first = current(); |
| 896 if (first == '\\') { | 887 if (first == '\\') { |
| 897 switch (Next()) { | 888 switch (Next()) { |
| 898 case 'w': | 889 case 'w': |
| 899 case 'W': | 890 case 'W': |
| 900 case 'd': | 891 case 'd': |
| 901 case 'D': | 892 case 'D': |
| 902 case 's': | 893 case 's': |
| 903 case 'S': { | 894 case 'S': { |
| 904 *char_class = Next(); | 895 *char_class = Next(); |
| 905 Advance(2); | 896 Advance(2); |
| 906 return CharacterRange::Singleton(0); // Return dummy value. | 897 return CharacterRange::Singleton(0); // Return dummy value. |
| 907 } | 898 } |
| 908 case kEndMarker: | 899 case kEndMarker: |
| 909 return ReportError(CStrVector("\\ at end of pattern")); | 900 return ReportError(CStrVector("\\ at end of pattern")); |
| 910 default: | 901 default: |
| 911 first = ParseClassCharacterEscape(CHECK_FAILED); | 902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); |
| 903 return CharacterRange::Singleton(c); |
| 912 } | 904 } |
| 913 } else { | 905 } else { |
| 914 Advance(); | 906 Advance(); |
| 907 return CharacterRange::Singleton(first); |
| 915 } | 908 } |
| 916 | |
| 917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
| 918 // Combine with possibly following trail surrogate. | |
| 919 int start = position(); | |
| 920 uc32 second = current(); | |
| 921 if (second == '\\') { | |
| 922 second = ParseClassCharacterEscape(CHECK_FAILED); | |
| 923 } else { | |
| 924 Advance(); | |
| 925 } | |
| 926 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
| 927 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
| 928 } else { | |
| 929 Reset(start); | |
| 930 } | |
| 931 } | |
| 932 | |
| 933 return CharacterRange::Singleton(first); | |
| 934 } | 909 } |
| 935 | 910 |
| 936 | 911 |
| 937 static const uc16 kNoCharClass = 0; | 912 static const uc16 kNoCharClass = 0; |
| 938 | 913 |
| 939 // Adds range or pre-defined character class to character ranges. | 914 // Adds range or pre-defined character class to character ranges. |
| 940 // If char_class is not kInvalidClass, it's interpreted as a class | 915 // If char_class is not kInvalidClass, it's interpreted as a class |
| 941 // escape (i.e., 's' means whitespace, from '\s'). | 916 // escape (i.e., 's' means whitespace, from '\s'). |
| 942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| 943 uc16 char_class, CharacterRange range, | 918 uc16 char_class, CharacterRange range, |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 1003 is_negated = !is_negated; | 978 is_negated = !is_negated; |
| 1004 } | 979 } |
| 1005 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 980 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
| 1006 } | 981 } |
| 1007 | 982 |
| 1008 | 983 |
| 1009 #undef CHECK_FAILED | 984 #undef CHECK_FAILED |
| 1010 | 985 |
| 1011 | 986 |
| 1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
| 1013 FlatStringReader* input, JSRegExp::Flags flags, | 988 FlatStringReader* input, bool multiline, |
| 1014 RegExpCompileData* result) { | 989 bool unicode, RegExpCompileData* result) { |
| 1015 DCHECK(result != NULL); | 990 DCHECK(result != NULL); |
| 1016 RegExpParser parser(input, &result->error, flags, isolate, zone); | 991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); |
| 1017 RegExpTree* tree = parser.ParsePattern(); | 992 RegExpTree* tree = parser.ParsePattern(); |
| 1018 if (parser.failed()) { | 993 if (parser.failed()) { |
| 1019 DCHECK(tree == NULL); | 994 DCHECK(tree == NULL); |
| 1020 DCHECK(!result->error.is_null()); | 995 DCHECK(!result->error.is_null()); |
| 1021 } else { | 996 } else { |
| 1022 DCHECK(tree != NULL); | 997 DCHECK(tree != NULL); |
| 1023 DCHECK(result->error.is_null()); | 998 DCHECK(result->error.is_null()); |
| 1024 if (FLAG_trace_regexp_parser) { | 999 if (FLAG_trace_regexp_parser) { |
| 1025 OFStream os(stdout); | 1000 OFStream os(stdout); |
| 1026 tree->Print(os, zone); | 1001 tree->Print(os, zone); |
| 1027 os << "\n"; | 1002 os << "\n"; |
| 1028 } | 1003 } |
| 1029 result->tree = tree; | 1004 result->tree = tree; |
| 1030 int capture_count = parser.captures_started(); | 1005 int capture_count = parser.captures_started(); |
| 1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; | 1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
| 1032 result->contains_anchor = parser.contains_anchor(); | 1007 result->contains_anchor = parser.contains_anchor(); |
| 1033 result->capture_count = capture_count; | 1008 result->capture_count = capture_count; |
| 1034 } | 1009 } |
| 1035 return !parser.failed(); | 1010 return !parser.failed(); |
| 1036 } | 1011 } |
| 1037 | 1012 |
| 1038 | 1013 |
| 1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) | 1014 RegExpBuilder::RegExpBuilder(Zone* zone) |
| 1040 : zone_(zone), | 1015 : zone_(zone), |
| 1041 pending_empty_(false), | 1016 pending_empty_(false), |
| 1042 flags_(flags), | |
| 1043 characters_(NULL), | 1017 characters_(NULL), |
| 1044 pending_surrogate_(kNoPendingSurrogate), | |
| 1045 terms_(), | 1018 terms_(), |
| 1046 alternatives_() | 1019 alternatives_() |
| 1047 #ifdef DEBUG | 1020 #ifdef DEBUG |
| 1048 , | 1021 , |
| 1049 last_added_(ADD_NONE) | 1022 last_added_(ADD_NONE) |
| 1050 #endif | 1023 #endif |
| 1051 { | 1024 { |
| 1052 } | 1025 } |
| 1053 | 1026 |
| 1054 | 1027 |
| 1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { | |
| 1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | |
| 1057 FlushPendingSurrogate(); | |
| 1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. | |
| 1059 pending_surrogate_ = lead_surrogate; | |
| 1060 } | |
| 1061 | |
| 1062 | |
| 1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { | |
| 1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); | |
| 1065 if (pending_surrogate_ != kNoPendingSurrogate) { | |
| 1066 uc16 lead_surrogate = pending_surrogate_; | |
| 1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | |
| 1068 ZoneList<uc16> surrogate_pair(2, zone()); | |
| 1069 surrogate_pair.Add(lead_surrogate, zone()); | |
| 1070 surrogate_pair.Add(trail_surrogate, zone()); | |
| 1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | |
| 1072 pending_surrogate_ = kNoPendingSurrogate; | |
| 1073 AddAtom(atom); | |
| 1074 } else { | |
| 1075 pending_surrogate_ = trail_surrogate; | |
| 1076 FlushPendingSurrogate(); | |
| 1077 } | |
| 1078 } | |
| 1079 | |
| 1080 | |
| 1081 void RegExpBuilder::FlushPendingSurrogate() { | |
| 1082 if (pending_surrogate_ != kNoPendingSurrogate) { | |
| 1083 // Use character class to desugar lone surrogate matching. | |
| 1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( | |
| 1085 CharacterRange::List(zone(), | |
| 1086 CharacterRange::Singleton(pending_surrogate_)), | |
| 1087 false); | |
| 1088 pending_surrogate_ = kNoPendingSurrogate; | |
| 1089 DCHECK(unicode()); | |
| 1090 AddCharacterClass(cc); | |
| 1091 } | |
| 1092 } | |
| 1093 | |
| 1094 | |
| 1095 void RegExpBuilder::FlushCharacters() { | 1028 void RegExpBuilder::FlushCharacters() { |
| 1096 FlushPendingSurrogate(); | |
| 1097 pending_empty_ = false; | 1029 pending_empty_ = false; |
| 1098 if (characters_ != NULL) { | 1030 if (characters_ != NULL) { |
| 1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
| 1100 characters_ = NULL; | 1032 characters_ = NULL; |
| 1101 text_.Add(atom, zone()); | 1033 text_.Add(atom, zone()); |
| 1102 LAST(ADD_ATOM); | 1034 LAST(ADD_ATOM); |
| 1103 } | 1035 } |
| 1104 } | 1036 } |
| 1105 | 1037 |
| 1106 | 1038 |
| 1107 void RegExpBuilder::FlushText() { | 1039 void RegExpBuilder::FlushText() { |
| 1108 FlushCharacters(); | 1040 FlushCharacters(); |
| 1109 int num_text = text_.length(); | 1041 int num_text = text_.length(); |
| 1110 if (num_text == 0) { | 1042 if (num_text == 0) { |
| 1111 return; | 1043 return; |
| 1112 } else if (num_text == 1) { | 1044 } else if (num_text == 1) { |
| 1113 terms_.Add(text_.last(), zone()); | 1045 terms_.Add(text_.last(), zone()); |
| 1114 } else { | 1046 } else { |
| 1115 RegExpText* text = new (zone()) RegExpText(zone()); | 1047 RegExpText* text = new (zone()) RegExpText(zone()); |
| 1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
| 1117 terms_.Add(text, zone()); | 1049 terms_.Add(text, zone()); |
| 1118 } | 1050 } |
| 1119 text_.Clear(); | 1051 text_.Clear(); |
| 1120 } | 1052 } |
| 1121 | 1053 |
| 1122 | 1054 |
| 1123 void RegExpBuilder::AddCharacter(uc16 c) { | 1055 void RegExpBuilder::AddCharacter(uc16 c) { |
| 1124 FlushPendingSurrogate(); | |
| 1125 pending_empty_ = false; | 1056 pending_empty_ = false; |
| 1126 if (characters_ == NULL) { | 1057 if (characters_ == NULL) { |
| 1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| 1128 } | 1059 } |
| 1129 characters_->Add(c, zone()); | 1060 characters_->Add(c, zone()); |
| 1130 LAST(ADD_CHAR); | 1061 LAST(ADD_CHAR); |
| 1131 } | 1062 } |
| 1132 | 1063 |
| 1133 | 1064 |
| 1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
| 1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| 1136 DCHECK(unicode()); | 1067 ZoneList<uc16> surrogate_pair(2, zone()); |
| 1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); | 1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); |
| 1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); |
| 1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| 1140 AddLeadSurrogate(c); | 1071 AddAtom(atom); |
| 1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | |
| 1142 AddTrailSurrogate(c); | |
| 1143 } else { | 1072 } else { |
| 1144 AddCharacter(static_cast<uc16>(c)); | 1073 AddCharacter(static_cast<uc16>(c)); |
| 1145 } | 1074 } |
| 1146 } | 1075 } |
| 1147 | 1076 |
| 1148 | 1077 |
| 1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1150 | 1079 |
| 1151 | 1080 |
| 1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | |
| 1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { | |
| 1154 // In unicode mode, character class needs to be desugared, so it | |
| 1155 // must be a standalone term instead of being part of a RegExpText. | |
| 1156 AddTerm(cc); | |
| 1157 } else { | |
| 1158 AddAtom(cc); | |
| 1159 } | |
| 1160 } | |
| 1161 | |
| 1162 | |
| 1163 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1081 void RegExpBuilder::AddAtom(RegExpTree* term) { |
| 1164 if (term->IsEmpty()) { | 1082 if (term->IsEmpty()) { |
| 1165 AddEmpty(); | 1083 AddEmpty(); |
| 1166 return; | 1084 return; |
| 1167 } | 1085 } |
| 1168 if (term->IsTextElement()) { | 1086 if (term->IsTextElement()) { |
| 1169 FlushCharacters(); | 1087 FlushCharacters(); |
| 1170 text_.Add(term, zone()); | 1088 text_.Add(term, zone()); |
| 1171 } else { | 1089 } else { |
| 1172 FlushText(); | 1090 FlushText(); |
| 1173 terms_.Add(term, zone()); | 1091 terms_.Add(term, zone()); |
| 1174 } | 1092 } |
| 1175 LAST(ADD_ATOM); | 1093 LAST(ADD_ATOM); |
| 1176 } | 1094 } |
| 1177 | 1095 |
| 1178 | |
| 1179 void RegExpBuilder::AddTerm(RegExpTree* term) { | |
| 1180 FlushText(); | |
| 1181 terms_.Add(term, zone()); | |
| 1182 LAST(ADD_ATOM); | |
| 1183 } | |
| 1184 | |
| 1185 | 1096 |
| 1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | 1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| 1187 FlushText(); | 1098 FlushText(); |
| 1188 terms_.Add(assert, zone()); | 1099 terms_.Add(assert, zone()); |
| 1189 LAST(ADD_ASSERT); | 1100 LAST(ADD_ASSERT); |
| 1190 } | 1101 } |
| 1191 | 1102 |
| 1192 | 1103 |
| 1193 void RegExpBuilder::NewAlternative() { FlushTerms(); } | 1104 void RegExpBuilder::NewAlternative() { FlushTerms(); } |
| 1194 | 1105 |
| (...skipping 19 matching lines...) Expand all Loading... |
| 1214 FlushTerms(); | 1125 FlushTerms(); |
| 1215 int num_alternatives = alternatives_.length(); | 1126 int num_alternatives = alternatives_.length(); |
| 1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
| 1217 if (num_alternatives == 1) return alternatives_.last(); | 1128 if (num_alternatives == 1) return alternatives_.last(); |
| 1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
| 1219 } | 1130 } |
| 1220 | 1131 |
| 1221 | 1132 |
| 1222 void RegExpBuilder::AddQuantifierToAtom( | 1133 void RegExpBuilder::AddQuantifierToAtom( |
| 1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| 1224 FlushPendingSurrogate(); | |
| 1225 if (pending_empty_) { | 1135 if (pending_empty_) { |
| 1226 pending_empty_ = false; | 1136 pending_empty_ = false; |
| 1227 return; | 1137 return; |
| 1228 } | 1138 } |
| 1229 RegExpTree* atom; | 1139 RegExpTree* atom; |
| 1230 if (characters_ != NULL) { | 1140 if (characters_ != NULL) { |
| 1231 DCHECK(last_added_ == ADD_CHAR); | 1141 DCHECK(last_added_ == ADD_CHAR); |
| 1232 // Last atom was character. | 1142 // Last atom was character. |
| 1233 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1143 Vector<const uc16> char_vector = characters_->ToConstVector(); |
| 1234 int num_chars = char_vector.length(); | 1144 int num_chars = char_vector.length(); |
| (...skipping 26 matching lines...) Expand all Loading... |
| 1261 UNREACHABLE(); | 1171 UNREACHABLE(); |
| 1262 return; | 1172 return; |
| 1263 } | 1173 } |
| 1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1265 zone()); | 1175 zone()); |
| 1266 LAST(ADD_TERM); | 1176 LAST(ADD_TERM); |
| 1267 } | 1177 } |
| 1268 | 1178 |
| 1269 } // namespace internal | 1179 } // namespace internal |
| 1270 } // namespace v8 | 1180 } // namespace v8 |
| OLD | NEW |