| OLD | NEW |
| 1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
| 6 | 6 |
| 7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
| 8 #include "src/factory.h" | 8 #include "src/factory.h" |
| 9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
| 10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
| 11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
| 12 #include "src/utils.h" | 12 #include "src/utils.h" |
| 13 | 13 |
| 14 namespace v8 { | 14 namespace v8 { |
| 15 namespace internal { | 15 namespace internal { |
| 16 | 16 |
| 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
| 18 bool multiline, bool unicode, Isolate* isolate, | 18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
| 19 Zone* zone) | |
| 20 : isolate_(isolate), | 19 : isolate_(isolate), |
| 21 zone_(zone), | 20 zone_(zone), |
| 22 error_(error), | 21 error_(error), |
| 23 captures_(NULL), | 22 captures_(NULL), |
| 24 in_(in), | 23 in_(in), |
| 25 current_(kEndMarker), | 24 current_(kEndMarker), |
| 25 flags_(flags), |
| 26 next_pos_(0), | 26 next_pos_(0), |
| 27 captures_started_(0), | 27 captures_started_(0), |
| 28 capture_count_(0), | 28 capture_count_(0), |
| 29 has_more_(true), | 29 has_more_(true), |
| 30 multiline_(multiline), | |
| 31 unicode_(unicode), | |
| 32 simple_(false), | 30 simple_(false), |
| 33 contains_anchor_(false), | 31 contains_anchor_(false), |
| 34 is_scanned_for_captures_(false), | 32 is_scanned_for_captures_(false), |
| 35 failed_(false) { | 33 failed_(false) { |
| 36 Advance(); | 34 Advance(); |
| 37 } | 35 } |
| 38 | 36 |
| 39 | 37 |
| 38 template <bool update_position> |
| 39 uc32 RegExpParser::ReadNext() { |
| 40 int position = next_pos_; |
| 41 uc32 c0 = in()->Get(position); |
| 42 position++; |
| 43 // Read the whole surrogate pair in case of unicode flag, if possible. |
| 44 if (unicode() && position < in()->length() && |
| 45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
| 46 uc16 c1 = in()->Get(position); |
| 47 if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| 48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
| 49 position++; |
| 50 } |
| 51 } |
| 52 if (update_position) next_pos_ = position; |
| 53 return c0; |
| 54 } |
| 55 |
| 56 |
| 40 uc32 RegExpParser::Next() { | 57 uc32 RegExpParser::Next() { |
| 41 if (has_next()) { | 58 if (has_next()) { |
| 42 return in()->Get(next_pos_); | 59 return ReadNext<false>(); |
| 43 } else { | 60 } else { |
| 44 return kEndMarker; | 61 return kEndMarker; |
| 45 } | 62 } |
| 46 } | 63 } |
| 47 | 64 |
| 48 | 65 |
| 49 void RegExpParser::Advance() { | 66 void RegExpParser::Advance() { |
| 50 if (next_pos_ < in()->length()) { | 67 if (has_next()) { |
| 51 StackLimitCheck check(isolate()); | 68 StackLimitCheck check(isolate()); |
| 52 if (check.HasOverflowed()) { | 69 if (check.HasOverflowed()) { |
| 53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); | 70 ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
| 54 } else if (zone()->excess_allocation()) { | 71 } else if (zone()->excess_allocation()) { |
| 55 ReportError(CStrVector("Regular expression too large")); | 72 ReportError(CStrVector("Regular expression too large")); |
| 56 } else { | 73 } else { |
| 57 current_ = in()->Get(next_pos_); | 74 current_ = ReadNext<true>(); |
| 58 next_pos_++; | |
| 59 // Read the whole surrogate pair in case of unicode flag, if possible. | |
| 60 if (unicode_ && next_pos_ < in()->length() && | |
| 61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { | |
| 62 uc16 trail = in()->Get(next_pos_); | |
| 63 if (unibrow::Utf16::IsTrailSurrogate(trail)) { | |
| 64 current_ = unibrow::Utf16::CombineSurrogatePair( | |
| 65 static_cast<uc16>(current_), trail); | |
| 66 next_pos_++; | |
| 67 } | |
| 68 } | |
| 69 } | 75 } |
| 70 } else { | 76 } else { |
| 71 current_ = kEndMarker; | 77 current_ = kEndMarker; |
| 72 // Advance so that position() points to 1-after-the-last-character. This is | 78 // Advance so that position() points to 1-after-the-last-character. This is |
| 73 // important so that Reset() to this position works correctly. | 79 // important so that Reset() to this position works correctly. |
| 74 next_pos_ = in()->length() + 1; | 80 next_pos_ = in()->length() + 1; |
| 75 has_more_ = false; | 81 has_more_ = false; |
| 76 } | 82 } |
| 77 } | 83 } |
| 78 | 84 |
| (...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 135 // Alternative :: | 141 // Alternative :: |
| 136 // [empty] | 142 // [empty] |
| 137 // Term Alternative | 143 // Term Alternative |
| 138 // Term :: | 144 // Term :: |
| 139 // Assertion | 145 // Assertion |
| 140 // Atom | 146 // Atom |
| 141 // Atom Quantifier | 147 // Atom Quantifier |
| 142 RegExpTree* RegExpParser::ParseDisjunction() { | 148 RegExpTree* RegExpParser::ParseDisjunction() { |
| 143 // Used to store current state while parsing subexpressions. | 149 // Used to store current state while parsing subexpressions. |
| 144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
| 145 zone()); | 151 flags_, zone()); |
| 146 RegExpParserState* state = &initial_state; | 152 RegExpParserState* state = &initial_state; |
| 147 // Cache the builder in a local variable for quick access. | 153 // Cache the builder in a local variable for quick access. |
| 148 RegExpBuilder* builder = initial_state.builder(); | 154 RegExpBuilder* builder = initial_state.builder(); |
| 149 while (true) { | 155 while (true) { |
| 150 switch (current()) { | 156 switch (current()) { |
| 151 case kEndMarker: | 157 case kEndMarker: |
| 152 if (state->IsSubexpression()) { | 158 if (state->IsSubexpression()) { |
| 153 // Inside a parenthesized group when hitting end of input. | 159 // Inside a parenthesized group when hitting end of input. |
| 154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | 160 ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
| 155 } | 161 } |
| (...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 199 Advance(); | 205 Advance(); |
| 200 builder->NewAlternative(); | 206 builder->NewAlternative(); |
| 201 continue; | 207 continue; |
| 202 } | 208 } |
| 203 case '*': | 209 case '*': |
| 204 case '+': | 210 case '+': |
| 205 case '?': | 211 case '?': |
| 206 return ReportError(CStrVector("Nothing to repeat")); | 212 return ReportError(CStrVector("Nothing to repeat")); |
| 207 case '^': { | 213 case '^': { |
| 208 Advance(); | 214 Advance(); |
| 209 if (multiline_) { | 215 if (multiline()) { |
| 210 builder->AddAssertion( | 216 builder->AddAssertion( |
| 211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | 217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
| 212 } else { | 218 } else { |
| 213 builder->AddAssertion( | 219 builder->AddAssertion( |
| 214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); | 220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
| 215 set_contains_anchor(); | 221 set_contains_anchor(); |
| 216 } | 222 } |
| 217 continue; | 223 continue; |
| 218 } | 224 } |
| 219 case '$': { | 225 case '$': { |
| 220 Advance(); | 226 Advance(); |
| 221 RegExpAssertion::AssertionType assertion_type = | 227 RegExpAssertion::AssertionType assertion_type = |
| 222 multiline_ ? RegExpAssertion::END_OF_LINE | 228 multiline() ? RegExpAssertion::END_OF_LINE |
| 223 : RegExpAssertion::END_OF_INPUT; | 229 : RegExpAssertion::END_OF_INPUT; |
| 224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); | 230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
| 225 continue; | 231 continue; |
| 226 } | 232 } |
| 227 case '.': { | 233 case '.': { |
| 228 Advance(); | 234 Advance(); |
| 229 // everything except \x0a, \x0d, \u2028 and \u2029 | 235 // everything except \x0a, \x0d, \u2028 and \u2029 |
| 230 ZoneList<CharacterRange>* ranges = | 236 ZoneList<CharacterRange>* ranges = |
| 231 new (zone()) ZoneList<CharacterRange>(2, zone()); | 237 new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 232 CharacterRange::AddClassEscape('.', ranges, zone()); | 238 CharacterRange::AddClassEscape('.', ranges, zone()); |
| 233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 239 RegExpCharacterClass* cc = |
| 234 builder->AddAtom(atom); | 240 new (zone()) RegExpCharacterClass(ranges, false); |
| 241 builder->AddCharacterClass(cc); |
| 235 break; | 242 break; |
| 236 } | 243 } |
| 237 case '(': { | 244 case '(': { |
| 238 SubexpressionType subexpr_type = CAPTURE; | 245 SubexpressionType subexpr_type = CAPTURE; |
| 239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); | 246 RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
| 240 Advance(); | 247 Advance(); |
| 241 if (current() == '?') { | 248 if (current() == '?') { |
| 242 switch (Next()) { | 249 switch (Next()) { |
| 243 case ':': | 250 case ':': |
| 244 subexpr_type = GROUPING; | 251 subexpr_type = GROUPING; |
| (...skipping 24 matching lines...) Expand all Loading... |
| 269 break; | 276 break; |
| 270 } | 277 } |
| 271 Advance(2); | 278 Advance(2); |
| 272 } else { | 279 } else { |
| 273 if (captures_started_ >= kMaxCaptures) { | 280 if (captures_started_ >= kMaxCaptures) { |
| 274 ReportError(CStrVector("Too many captures") CHECK_FAILED); | 281 ReportError(CStrVector("Too many captures") CHECK_FAILED); |
| 275 } | 282 } |
| 276 captures_started_++; | 283 captures_started_++; |
| 277 } | 284 } |
| 278 // Store current state and begin new disjunction parsing. | 285 // Store current state and begin new disjunction parsing. |
| 279 state = new (zone()) RegExpParserState( | 286 state = |
| 280 state, subexpr_type, lookaround_type, captures_started_, zone()); | 287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
| 288 captures_started_, flags_, zone()); |
| 281 builder = state->builder(); | 289 builder = state->builder(); |
| 282 continue; | 290 continue; |
| 283 } | 291 } |
| 284 case '[': { | 292 case '[': { |
| 285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); | 293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); |
| 286 builder->AddAtom(atom); | 294 builder->AddCharacterClass(cc->AsCharacterClass()); |
| 287 break; | 295 break; |
| 288 } | 296 } |
| 289 // Atom :: | 297 // Atom :: |
| 290 // \ AtomEscape | 298 // \ AtomEscape |
| 291 case '\\': | 299 case '\\': |
| 292 switch (Next()) { | 300 switch (Next()) { |
| 293 case kEndMarker: | 301 case kEndMarker: |
| 294 return ReportError(CStrVector("\\ at end of pattern")); | 302 return ReportError(CStrVector("\\ at end of pattern")); |
| 295 case 'b': | 303 case 'b': |
| 296 Advance(2); | 304 Advance(2); |
| (...skipping 14 matching lines...) Expand all Loading... |
| 311 case 'D': | 319 case 'D': |
| 312 case 's': | 320 case 's': |
| 313 case 'S': | 321 case 'S': |
| 314 case 'w': | 322 case 'w': |
| 315 case 'W': { | 323 case 'W': { |
| 316 uc32 c = Next(); | 324 uc32 c = Next(); |
| 317 Advance(2); | 325 Advance(2); |
| 318 ZoneList<CharacterRange>* ranges = | 326 ZoneList<CharacterRange>* ranges = |
| 319 new (zone()) ZoneList<CharacterRange>(2, zone()); | 327 new (zone()) ZoneList<CharacterRange>(2, zone()); |
| 320 CharacterRange::AddClassEscape(c, ranges, zone()); | 328 CharacterRange::AddClassEscape(c, ranges, zone()); |
| 321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 329 RegExpCharacterClass* cc = |
| 322 builder->AddAtom(atom); | 330 new (zone()) RegExpCharacterClass(ranges, false); |
| 331 builder->AddCharacterClass(cc); |
| 323 break; | 332 break; |
| 324 } | 333 } |
| 325 case '1': | 334 case '1': |
| 326 case '2': | 335 case '2': |
| 327 case '3': | 336 case '3': |
| 328 case '4': | 337 case '4': |
| 329 case '5': | 338 case '5': |
| 330 case '6': | 339 case '6': |
| 331 case '7': | 340 case '7': |
| 332 case '8': | 341 case '8': |
| (...skipping 13 matching lines...) Expand all Loading... |
| 346 builder->AddAtom(atom); | 355 builder->AddAtom(atom); |
| 347 } | 356 } |
| 348 break; | 357 break; |
| 349 } | 358 } |
| 350 uc32 first_digit = Next(); | 359 uc32 first_digit = Next(); |
| 351 if (first_digit == '8' || first_digit == '9') { | 360 if (first_digit == '8' || first_digit == '9') { |
| 352 // If the 'u' flag is present, only syntax characters can be | 361 // If the 'u' flag is present, only syntax characters can be |
| 353 // escaped, | 362 // escaped, |
| 354 // no other identity escapes are allowed. If the 'u' flag is not | 363 // no other identity escapes are allowed. If the 'u' flag is not |
| 355 // present, all identity escapes are allowed. | 364 // present, all identity escapes are allowed. |
| 356 if (!unicode_) { | 365 if (!unicode()) { |
| 357 builder->AddCharacter(first_digit); | 366 builder->AddCharacter(first_digit); |
| 358 Advance(2); | 367 Advance(2); |
| 359 } else { | 368 } else { |
| 360 return ReportError(CStrVector("Invalid escape")); | 369 return ReportError(CStrVector("Invalid escape")); |
| 361 } | 370 } |
| 362 break; | 371 break; |
| 363 } | 372 } |
| 364 } | 373 } |
| 365 // FALLTHROUGH | 374 // FALLTHROUGH |
| 366 case '0': { | 375 case '0': { |
| (...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 407 Advance(2); | 416 Advance(2); |
| 408 builder->AddCharacter(controlLetter & 0x1f); | 417 builder->AddCharacter(controlLetter & 0x1f); |
| 409 } | 418 } |
| 410 break; | 419 break; |
| 411 } | 420 } |
| 412 case 'x': { | 421 case 'x': { |
| 413 Advance(2); | 422 Advance(2); |
| 414 uc32 value; | 423 uc32 value; |
| 415 if (ParseHexEscape(2, &value)) { | 424 if (ParseHexEscape(2, &value)) { |
| 416 builder->AddCharacter(value); | 425 builder->AddCharacter(value); |
| 417 } else if (!unicode_) { | 426 } else if (!unicode()) { |
| 418 builder->AddCharacter('x'); | 427 builder->AddCharacter('x'); |
| 419 } else { | 428 } else { |
| 420 // If the 'u' flag is present, invalid escapes are not treated as | 429 // If the 'u' flag is present, invalid escapes are not treated as |
| 421 // identity escapes. | 430 // identity escapes. |
| 422 return ReportError(CStrVector("Invalid escape")); | 431 return ReportError(CStrVector("Invalid escape")); |
| 423 } | 432 } |
| 424 break; | 433 break; |
| 425 } | 434 } |
| 426 case 'u': { | 435 case 'u': { |
| 427 Advance(2); | 436 Advance(2); |
| 428 uc32 value; | 437 uc32 value; |
| 429 if (ParseUnicodeEscape(&value)) { | 438 if (ParseUnicodeEscape(&value)) { |
| 430 builder->AddUnicodeCharacter(value); | 439 builder->AddUnicodeCharacter(value); |
| 431 } else if (!unicode_) { | 440 } else if (!unicode()) { |
| 432 builder->AddCharacter('u'); | 441 builder->AddCharacter('u'); |
| 433 } else { | 442 } else { |
| 434 // If the 'u' flag is present, invalid escapes are not treated as | 443 // If the 'u' flag is present, invalid escapes are not treated as |
| 435 // identity escapes. | 444 // identity escapes. |
| 436 return ReportError(CStrVector("Invalid unicode escape")); | 445 return ReportError(CStrVector("Invalid unicode escape")); |
| 437 } | 446 } |
| 438 break; | 447 break; |
| 439 } | 448 } |
| 440 default: | 449 default: |
| 441 Advance(); | 450 Advance(); |
| 442 // If the 'u' flag is present, only syntax characters can be | 451 // If the 'u' flag is present, only syntax characters can be |
| 443 // escaped, no | 452 // escaped, no |
| 444 // other identity escapes are allowed. If the 'u' flag is not | 453 // other identity escapes are allowed. If the 'u' flag is not |
| 445 // present, | 454 // present, |
| 446 // all identity escapes are allowed. | 455 // all identity escapes are allowed. |
| 447 if (!unicode_ || IsSyntaxCharacter(current())) { | 456 if (!unicode() || IsSyntaxCharacter(current())) { |
| 448 builder->AddCharacter(current()); | 457 builder->AddCharacter(current()); |
| 449 Advance(); | 458 Advance(); |
| 450 } else { | 459 } else { |
| 451 return ReportError(CStrVector("Invalid escape")); | 460 return ReportError(CStrVector("Invalid escape")); |
| 452 } | 461 } |
| 453 break; | 462 break; |
| 454 } | 463 } |
| 455 break; | 464 break; |
| 456 case '{': { | 465 case '{': { |
| 457 int dummy; | 466 int dummy; |
| (...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 738 } | 747 } |
| 739 *value = val; | 748 *value = val; |
| 740 return true; | 749 return true; |
| 741 } | 750 } |
| 742 | 751 |
| 743 | 752 |
| 744 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 753 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
| 745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
| 746 // allowed). In the latter case, the number of hex digits between { } is | 755 // allowed). In the latter case, the number of hex digits between { } is |
| 747 // arbitrary. \ and u have already been read. | 756 // arbitrary. \ and u have already been read. |
| 748 if (current() == '{' && unicode_) { | 757 if (current() == '{' && unicode()) { |
| 749 int start = position(); | 758 int start = position(); |
| 750 Advance(); | 759 Advance(); |
| 751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
| 752 if (current() == '}') { | 761 if (current() == '}') { |
| 753 Advance(); | 762 Advance(); |
| 754 return true; | 763 return true; |
| 755 } | 764 } |
| 756 } | 765 } |
| 757 Reset(start); | 766 Reset(start); |
| 758 return false; | 767 return false; |
| (...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 833 // For compatibility, we interpret a decimal escape that isn't | 842 // For compatibility, we interpret a decimal escape that isn't |
| 834 // a back reference (and therefore either \0 or not valid according | 843 // a back reference (and therefore either \0 or not valid according |
| 835 // to the specification) as a 1..3 digit octal character code. | 844 // to the specification) as a 1..3 digit octal character code. |
| 836 return ParseOctalLiteral(); | 845 return ParseOctalLiteral(); |
| 837 case 'x': { | 846 case 'x': { |
| 838 Advance(); | 847 Advance(); |
| 839 uc32 value; | 848 uc32 value; |
| 840 if (ParseHexEscape(2, &value)) { | 849 if (ParseHexEscape(2, &value)) { |
| 841 return value; | 850 return value; |
| 842 } | 851 } |
| 843 if (!unicode_) { | 852 if (!unicode()) { |
| 844 // If \x is not followed by a two-digit hexadecimal, treat it | 853 // If \x is not followed by a two-digit hexadecimal, treat it |
| 845 // as an identity escape. | 854 // as an identity escape. |
| 846 return 'x'; | 855 return 'x'; |
| 847 } | 856 } |
| 848 // If the 'u' flag is present, invalid escapes are not treated as | 857 // If the 'u' flag is present, invalid escapes are not treated as |
| 849 // identity escapes. | 858 // identity escapes. |
| 850 ReportError(CStrVector("Invalid escape")); | 859 ReportError(CStrVector("Invalid escape")); |
| 851 return 0; | 860 return 0; |
| 852 } | 861 } |
| 853 case 'u': { | 862 case 'u': { |
| 854 Advance(); | 863 Advance(); |
| 855 uc32 value; | 864 uc32 value; |
| 856 if (ParseUnicodeEscape(&value)) { | 865 if (ParseUnicodeEscape(&value)) { |
| 857 return value; | 866 return value; |
| 858 } | 867 } |
| 859 if (!unicode_) { | 868 if (!unicode()) { |
| 860 return 'u'; | 869 return 'u'; |
| 861 } | 870 } |
| 862 // If the 'u' flag is present, invalid escapes are not treated as | 871 // If the 'u' flag is present, invalid escapes are not treated as |
| 863 // identity escapes. | 872 // identity escapes. |
| 864 ReportError(CStrVector("Invalid unicode escape")); | 873 ReportError(CStrVector("Invalid unicode escape")); |
| 865 return 0; | 874 return 0; |
| 866 } | 875 } |
| 867 default: { | 876 default: { |
| 868 uc32 result = current(); | 877 uc32 result = current(); |
| 869 // If the 'u' flag is present, only syntax characters can be escaped, no | 878 // If the 'u' flag is present, only syntax characters can be escaped, no |
| 870 // other identity escapes are allowed. If the 'u' flag is not present, all | 879 // other identity escapes are allowed. If the 'u' flag is not present, all |
| 871 // identity escapes are allowed. | 880 // identity escapes are allowed. |
| 872 if (!unicode_ || IsSyntaxCharacter(result)) { | 881 if (!unicode() || IsSyntaxCharacter(result)) { |
| 873 Advance(); | 882 Advance(); |
| 874 return result; | 883 return result; |
| 875 } | 884 } |
| 876 ReportError(CStrVector("Invalid escape")); | 885 ReportError(CStrVector("Invalid escape")); |
| 877 return 0; | 886 return 0; |
| 878 } | 887 } |
| 879 } | 888 } |
| 880 return 0; | 889 return 0; |
| 881 } | 890 } |
| 882 | 891 |
| 883 | 892 |
| 884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | 893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
| 885 DCHECK_EQ(0, *char_class); | 894 DCHECK_EQ(0, *char_class); |
| 886 uc32 first = current(); | 895 uc32 first = current(); |
| 887 if (first == '\\') { | 896 if (first == '\\') { |
| 888 switch (Next()) { | 897 switch (Next()) { |
| 889 case 'w': | 898 case 'w': |
| 890 case 'W': | 899 case 'W': |
| 891 case 'd': | 900 case 'd': |
| 892 case 'D': | 901 case 'D': |
| 893 case 's': | 902 case 's': |
| 894 case 'S': { | 903 case 'S': { |
| 895 *char_class = Next(); | 904 *char_class = Next(); |
| 896 Advance(2); | 905 Advance(2); |
| 897 return CharacterRange::Singleton(0); // Return dummy value. | 906 return CharacterRange::Singleton(0); // Return dummy value. |
| 898 } | 907 } |
| 899 case kEndMarker: | 908 case kEndMarker: |
| 900 return ReportError(CStrVector("\\ at end of pattern")); | 909 return ReportError(CStrVector("\\ at end of pattern")); |
| 901 default: | 910 default: |
| 902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); | 911 first = ParseClassCharacterEscape(CHECK_FAILED); |
| 903 return CharacterRange::Singleton(c); | |
| 904 } | 912 } |
| 905 } else { | 913 } else { |
| 906 Advance(); | 914 Advance(); |
| 907 return CharacterRange::Singleton(first); | |
| 908 } | 915 } |
| 916 |
| 917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
| 918 // Combine with possibly following trail surrogate. |
| 919 int start = position(); |
| 920 uc32 second = current(); |
| 921 if (second == '\\') { |
| 922 second = ParseClassCharacterEscape(CHECK_FAILED); |
| 923 } else { |
| 924 Advance(); |
| 925 } |
| 926 if (unibrow::Utf16::IsTrailSurrogate(second)) { |
| 927 first = unibrow::Utf16::CombineSurrogatePair(first, second); |
| 928 } else { |
| 929 Reset(start); |
| 930 } |
| 931 } |
| 932 |
| 933 return CharacterRange::Singleton(first); |
| 909 } | 934 } |
| 910 | 935 |
| 911 | 936 |
| 912 static const uc16 kNoCharClass = 0; | 937 static const uc16 kNoCharClass = 0; |
| 913 | 938 |
| 914 // Adds range or pre-defined character class to character ranges. | 939 // Adds range or pre-defined character class to character ranges. |
| 915 // If char_class is not kInvalidClass, it's interpreted as a class | 940 // If char_class is not kInvalidClass, it's interpreted as a class |
| 916 // escape (i.e., 's' means whitespace, from '\s'). | 941 // escape (i.e., 's' means whitespace, from '\s'). |
| 917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
| 918 uc16 char_class, CharacterRange range, | 943 uc16 char_class, CharacterRange range, |
| (...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
| 978 is_negated = !is_negated; | 1003 is_negated = !is_negated; |
| 979 } | 1004 } |
| 980 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 1005 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
| 981 } | 1006 } |
| 982 | 1007 |
| 983 | 1008 |
| 984 #undef CHECK_FAILED | 1009 #undef CHECK_FAILED |
| 985 | 1010 |
| 986 | 1011 |
| 987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
| 988 FlatStringReader* input, bool multiline, | 1013 FlatStringReader* input, JSRegExp::Flags flags, |
| 989 bool unicode, RegExpCompileData* result) { | 1014 RegExpCompileData* result) { |
| 990 DCHECK(result != NULL); | 1015 DCHECK(result != NULL); |
| 991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); | 1016 RegExpParser parser(input, &result->error, flags, isolate, zone); |
| 992 RegExpTree* tree = parser.ParsePattern(); | 1017 RegExpTree* tree = parser.ParsePattern(); |
| 993 if (parser.failed()) { | 1018 if (parser.failed()) { |
| 994 DCHECK(tree == NULL); | 1019 DCHECK(tree == NULL); |
| 995 DCHECK(!result->error.is_null()); | 1020 DCHECK(!result->error.is_null()); |
| 996 } else { | 1021 } else { |
| 997 DCHECK(tree != NULL); | 1022 DCHECK(tree != NULL); |
| 998 DCHECK(result->error.is_null()); | 1023 DCHECK(result->error.is_null()); |
| 999 if (FLAG_trace_regexp_parser) { | 1024 if (FLAG_trace_regexp_parser) { |
| 1000 OFStream os(stdout); | 1025 OFStream os(stdout); |
| 1001 tree->Print(os, zone); | 1026 tree->Print(os, zone); |
| 1002 os << "\n"; | 1027 os << "\n"; |
| 1003 } | 1028 } |
| 1004 result->tree = tree; | 1029 result->tree = tree; |
| 1005 int capture_count = parser.captures_started(); | 1030 int capture_count = parser.captures_started(); |
| 1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; | 1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
| 1007 result->contains_anchor = parser.contains_anchor(); | 1032 result->contains_anchor = parser.contains_anchor(); |
| 1008 result->capture_count = capture_count; | 1033 result->capture_count = capture_count; |
| 1009 } | 1034 } |
| 1010 return !parser.failed(); | 1035 return !parser.failed(); |
| 1011 } | 1036 } |
| 1012 | 1037 |
| 1013 | 1038 |
| 1014 RegExpBuilder::RegExpBuilder(Zone* zone) | 1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) |
| 1015 : zone_(zone), | 1040 : zone_(zone), |
| 1016 pending_empty_(false), | 1041 pending_empty_(false), |
| 1042 flags_(flags), |
| 1017 characters_(NULL), | 1043 characters_(NULL), |
| 1044 pending_surrogate_(kNoPendingSurrogate), |
| 1018 terms_(), | 1045 terms_(), |
| 1019 alternatives_() | 1046 alternatives_() |
| 1020 #ifdef DEBUG | 1047 #ifdef DEBUG |
| 1021 , | 1048 , |
| 1022 last_added_(ADD_NONE) | 1049 last_added_(ADD_NONE) |
| 1023 #endif | 1050 #endif |
| 1024 { | 1051 { |
| 1025 } | 1052 } |
| 1026 | 1053 |
| 1027 | 1054 |
| 1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
| 1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1057 FlushPendingSurrogate(); |
| 1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| 1059 pending_surrogate_ = lead_surrogate; |
| 1060 } |
| 1061 |
| 1062 |
| 1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| 1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| 1065 if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1066 uc16 lead_surrogate = pending_surrogate_; |
| 1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1068 ZoneList<uc16> surrogate_pair(2, zone()); |
| 1069 surrogate_pair.Add(lead_surrogate, zone()); |
| 1070 surrogate_pair.Add(trail_surrogate, zone()); |
| 1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| 1072 pending_surrogate_ = kNoPendingSurrogate; |
| 1073 AddAtom(atom); |
| 1074 } else { |
| 1075 pending_surrogate_ = trail_surrogate; |
| 1076 FlushPendingSurrogate(); |
| 1077 } |
| 1078 } |
| 1079 |
| 1080 |
| 1081 void RegExpBuilder::FlushPendingSurrogate() { |
| 1082 if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1083 // Use character class to desugar lone surrogate matching. |
| 1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
| 1085 CharacterRange::List(zone(), |
| 1086 CharacterRange::Singleton(pending_surrogate_)), |
| 1087 false); |
| 1088 pending_surrogate_ = kNoPendingSurrogate; |
| 1089 DCHECK(unicode()); |
| 1090 AddCharacterClass(cc); |
| 1091 } |
| 1092 } |
| 1093 |
| 1094 |
| 1028 void RegExpBuilder::FlushCharacters() { | 1095 void RegExpBuilder::FlushCharacters() { |
| 1096 FlushPendingSurrogate(); |
| 1029 pending_empty_ = false; | 1097 pending_empty_ = false; |
| 1030 if (characters_ != NULL) { | 1098 if (characters_ != NULL) { |
| 1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
| 1032 characters_ = NULL; | 1100 characters_ = NULL; |
| 1033 text_.Add(atom, zone()); | 1101 text_.Add(atom, zone()); |
| 1034 LAST(ADD_ATOM); | 1102 LAST(ADD_ATOM); |
| 1035 } | 1103 } |
| 1036 } | 1104 } |
| 1037 | 1105 |
| 1038 | 1106 |
| 1039 void RegExpBuilder::FlushText() { | 1107 void RegExpBuilder::FlushText() { |
| 1040 FlushCharacters(); | 1108 FlushCharacters(); |
| 1041 int num_text = text_.length(); | 1109 int num_text = text_.length(); |
| 1042 if (num_text == 0) { | 1110 if (num_text == 0) { |
| 1043 return; | 1111 return; |
| 1044 } else if (num_text == 1) { | 1112 } else if (num_text == 1) { |
| 1045 terms_.Add(text_.last(), zone()); | 1113 terms_.Add(text_.last(), zone()); |
| 1046 } else { | 1114 } else { |
| 1047 RegExpText* text = new (zone()) RegExpText(zone()); | 1115 RegExpText* text = new (zone()) RegExpText(zone()); |
| 1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
| 1049 terms_.Add(text, zone()); | 1117 terms_.Add(text, zone()); |
| 1050 } | 1118 } |
| 1051 text_.Clear(); | 1119 text_.Clear(); |
| 1052 } | 1120 } |
| 1053 | 1121 |
| 1054 | 1122 |
| 1055 void RegExpBuilder::AddCharacter(uc16 c) { | 1123 void RegExpBuilder::AddCharacter(uc16 c) { |
| 1124 FlushPendingSurrogate(); |
| 1056 pending_empty_ = false; | 1125 pending_empty_ = false; |
| 1057 if (characters_ == NULL) { | 1126 if (characters_ == NULL) { |
| 1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| 1059 } | 1128 } |
| 1060 characters_->Add(c, zone()); | 1129 characters_->Add(c, zone()); |
| 1061 LAST(ADD_CHAR); | 1130 LAST(ADD_CHAR); |
| 1062 } | 1131 } |
| 1063 | 1132 |
| 1064 | 1133 |
| 1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
| 1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
| 1067 ZoneList<uc16> surrogate_pair(2, zone()); | 1136 DCHECK(unicode()); |
| 1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); | 1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
| 1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); | 1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
| 1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
| 1071 AddAtom(atom); | 1140 AddLeadSurrogate(c); |
| 1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1142 AddTrailSurrogate(c); |
| 1072 } else { | 1143 } else { |
| 1073 AddCharacter(static_cast<uc16>(c)); | 1144 AddCharacter(static_cast<uc16>(c)); |
| 1074 } | 1145 } |
| 1075 } | 1146 } |
| 1076 | 1147 |
| 1077 | 1148 |
| 1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| 1079 | 1150 |
| 1080 | 1151 |
| 1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
| 1154 // In unicode mode, character class needs to be desugared, so it |
| 1155 // must be a standalone term instead of being part of a RegExpText. |
| 1156 AddTerm(cc); |
| 1157 } else { |
| 1158 AddAtom(cc); |
| 1159 } |
| 1160 } |
| 1161 |
| 1162 |
| 1081 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1163 void RegExpBuilder::AddAtom(RegExpTree* term) { |
| 1082 if (term->IsEmpty()) { | 1164 if (term->IsEmpty()) { |
| 1083 AddEmpty(); | 1165 AddEmpty(); |
| 1084 return; | 1166 return; |
| 1085 } | 1167 } |
| 1086 if (term->IsTextElement()) { | 1168 if (term->IsTextElement()) { |
| 1087 FlushCharacters(); | 1169 FlushCharacters(); |
| 1088 text_.Add(term, zone()); | 1170 text_.Add(term, zone()); |
| 1089 } else { | 1171 } else { |
| 1090 FlushText(); | 1172 FlushText(); |
| 1091 terms_.Add(term, zone()); | 1173 terms_.Add(term, zone()); |
| 1092 } | 1174 } |
| 1093 LAST(ADD_ATOM); | 1175 LAST(ADD_ATOM); |
| 1094 } | 1176 } |
| 1095 | 1177 |
| 1096 | 1178 |
| 1179 void RegExpBuilder::AddTerm(RegExpTree* term) { |
| 1180 FlushText(); |
| 1181 terms_.Add(term, zone()); |
| 1182 LAST(ADD_ATOM); |
| 1183 } |
| 1184 |
| 1185 |
| 1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | 1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
| 1098 FlushText(); | 1187 FlushText(); |
| 1099 terms_.Add(assert, zone()); | 1188 terms_.Add(assert, zone()); |
| 1100 LAST(ADD_ASSERT); | 1189 LAST(ADD_ASSERT); |
| 1101 } | 1190 } |
| 1102 | 1191 |
| 1103 | 1192 |
| 1104 void RegExpBuilder::NewAlternative() { FlushTerms(); } | 1193 void RegExpBuilder::NewAlternative() { FlushTerms(); } |
| 1105 | 1194 |
| 1106 | 1195 |
| (...skipping 18 matching lines...) |
| 1125 FlushTerms(); | 1214 FlushTerms(); |
| 1126 int num_alternatives = alternatives_.length(); | 1215 int num_alternatives = alternatives_.length(); |
| 1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
| 1128 if (num_alternatives == 1) return alternatives_.last(); | 1217 if (num_alternatives == 1) return alternatives_.last(); |
| 1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
| 1130 } | 1219 } |
| 1131 | 1220 |
| 1132 | 1221 |
| 1133 void RegExpBuilder::AddQuantifierToAtom( | 1222 void RegExpBuilder::AddQuantifierToAtom( |
| 1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| 1224 FlushPendingSurrogate(); |
| 1135 if (pending_empty_) { | 1225 if (pending_empty_) { |
| 1136 pending_empty_ = false; | 1226 pending_empty_ = false; |
| 1137 return; | 1227 return; |
| 1138 } | 1228 } |
| 1139 RegExpTree* atom; | 1229 RegExpTree* atom; |
| 1140 if (characters_ != NULL) { | 1230 if (characters_ != NULL) { |
| 1141 DCHECK(last_added_ == ADD_CHAR); | 1231 DCHECK(last_added_ == ADD_CHAR); |
| 1142 // Last atom was character. | 1232 // Last atom was character. |
| 1143 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1233 Vector<const uc16> char_vector = characters_->ToConstVector(); |
| 1144 int num_chars = char_vector.length(); | 1234 int num_chars = char_vector.length(); |
| (...skipping 26 matching lines...) |
| 1171 UNREACHABLE(); | 1261 UNREACHABLE(); |
| 1172 return; | 1262 return; |
| 1173 } | 1263 } |
| 1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
| 1175 zone()); | 1265 zone()); |
| 1176 LAST(ADD_TERM); | 1266 LAST(ADD_TERM); |
| 1177 } | 1267 } |
| 1178 | 1268 |
| 1179 } // namespace internal | 1269 } // namespace internal |
| 1180 } // namespace v8 | 1270 } // namespace v8 |
| OLD | NEW |