OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 | 95 |
96 | 96 |
97 void RegExpParser::Advance(int dist) { | 97 void RegExpParser::Advance(int dist) { |
98 next_pos_ += dist - 1; | 98 next_pos_ += dist - 1; |
99 Advance(); | 99 Advance(); |
100 } | 100 } |
101 | 101 |
102 | 102 |
103 bool RegExpParser::simple() { return simple_; } | 103 bool RegExpParser::simple() { return simple_; } |
104 | 104 |
105 | 105 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { |
106 bool RegExpParser::IsSyntaxCharacter(uc32 c) { | 106 switch (c) { |
107 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || | 107 case '^': |
108 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || | 108 case '$': |
109 c == '{' || c == '}' || c == '|'; | 109 case '\\': |
| 110 case '.': |
| 111 case '*': |
| 112 case '+': |
| 113 case '?': |
| 114 case '(': |
| 115 case ')': |
| 116 case '[': |
| 117 case ']': |
| 118 case '{': |
| 119 case '}': |
| 120 case '|': |
| 121 case '/': |
| 122 return true; |
| 123 default: |
| 124 break; |
| 125 } |
| 126 return false; |
110 } | 127 } |
111 | 128 |
112 | 129 |
113 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { | 130 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { |
114 failed_ = true; | 131 failed_ = true; |
115 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); | 132 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); |
116 // Zip to the end to make sure that no more input is read. | 133 // Zip to the end to make sure that no more input is read. |
117 current_ = kEndMarker; | 134 current_ = kEndMarker; |
118 next_pos_ = in()->length(); | 135 next_pos_ = in()->length(); |
119 return NULL; | 136 return NULL; |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
154 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 171 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
155 flags_, zone()); | 172 flags_, zone()); |
156 RegExpParserState* state = &initial_state; | 173 RegExpParserState* state = &initial_state; |
157 // Cache the builder in a local variable for quick access. | 174 // Cache the builder in a local variable for quick access. |
158 RegExpBuilder* builder = initial_state.builder(); | 175 RegExpBuilder* builder = initial_state.builder(); |
159 while (true) { | 176 while (true) { |
160 switch (current()) { | 177 switch (current()) { |
161 case kEndMarker: | 178 case kEndMarker: |
162 if (state->IsSubexpression()) { | 179 if (state->IsSubexpression()) { |
163 // Inside a parenthesized group when hitting end of input. | 180 // Inside a parenthesized group when hitting end of input. |
164 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | 181 return ReportError(CStrVector("Unterminated group")); |
165 } | 182 } |
166 DCHECK_EQ(INITIAL, state->group_type()); | 183 DCHECK_EQ(INITIAL, state->group_type()); |
167 // Parsing completed successfully. | 184 // Parsing completed successfully. |
168 return builder->ToRegExp(); | 185 return builder->ToRegExp(); |
169 case ')': { | 186 case ')': { |
170 if (!state->IsSubexpression()) { | 187 if (!state->IsSubexpression()) { |
171 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); | 188 return ReportError(CStrVector("Unmatched ')'")); |
172 } | 189 } |
173 DCHECK_NE(INITIAL, state->group_type()); | 190 DCHECK_NE(INITIAL, state->group_type()); |
174 | 191 |
175 Advance(); | 192 Advance(); |
176 // End disjunction parsing and convert builder content to new single | 193 // End disjunction parsing and convert builder content to new single |
177 // regexp atom. | 194 // regexp atom. |
178 RegExpTree* body = builder->ToRegExp(); | 195 RegExpTree* body = builder->ToRegExp(); |
179 | 196 |
180 int end_capture_index = captures_started(); | 197 int end_capture_index = captures_started(); |
181 | 198 |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
269 if (Next() == '=') { | 286 if (Next() == '=') { |
270 subexpr_type = POSITIVE_LOOKAROUND; | 287 subexpr_type = POSITIVE_LOOKAROUND; |
271 break; | 288 break; |
272 } else if (Next() == '!') { | 289 } else if (Next() == '!') { |
273 subexpr_type = NEGATIVE_LOOKAROUND; | 290 subexpr_type = NEGATIVE_LOOKAROUND; |
274 break; | 291 break; |
275 } | 292 } |
276 } | 293 } |
277 // Fall through. | 294 // Fall through. |
278 default: | 295 default: |
279 ReportError(CStrVector("Invalid group") CHECK_FAILED); | 296 return ReportError(CStrVector("Invalid group")); |
280 break; | |
281 } | 297 } |
282 Advance(2); | 298 Advance(2); |
283 } else { | 299 } else { |
284 if (captures_started_ >= kMaxCaptures) { | 300 if (captures_started_ >= kMaxCaptures) { |
285 ReportError(CStrVector("Too many captures") CHECK_FAILED); | 301 return ReportError(CStrVector("Too many captures")); |
286 } | 302 } |
287 captures_started_++; | 303 captures_started_++; |
288 } | 304 } |
289 // Store current state and begin new disjunction parsing. | 305 // Store current state and begin new disjunction parsing. |
290 state = | 306 state = |
291 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, | 307 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
292 captures_started_, flags_, zone()); | 308 captures_started_, flags_, zone()); |
293 builder = state->builder(); | 309 builder = state->builder(); |
294 continue; | 310 continue; |
295 } | 311 } |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
353 // the capture registers of the referenced capture are either | 369 // the capture registers of the referenced capture are either |
354 // both set or both cleared. | 370 // both set or both cleared. |
355 builder->AddEmpty(); | 371 builder->AddEmpty(); |
356 } else { | 372 } else { |
357 RegExpCapture* capture = GetCapture(index); | 373 RegExpCapture* capture = GetCapture(index); |
358 RegExpTree* atom = new (zone()) RegExpBackReference(capture); | 374 RegExpTree* atom = new (zone()) RegExpBackReference(capture); |
359 builder->AddAtom(atom); | 375 builder->AddAtom(atom); |
360 } | 376 } |
361 break; | 377 break; |
362 } | 378 } |
| 379 // With /u, no identity escapes except for syntax characters |
| 380 // are allowed. Otherwise, all identity escapes are allowed. |
| 381 if (unicode()) { |
| 382 return ReportError(CStrVector("Invalid escape")); |
| 383 } |
363 uc32 first_digit = Next(); | 384 uc32 first_digit = Next(); |
364 if (first_digit == '8' || first_digit == '9') { | 385 if (first_digit == '8' || first_digit == '9') { |
365 // If the 'u' flag is present, only syntax characters can be | 386 builder->AddCharacter(first_digit); |
366 // escaped, | 387 Advance(2); |
367 // no other identity escapes are allowed. If the 'u' flag is not | |
368 // present, all identity escapes are allowed. | |
369 if (!unicode()) { | |
370 builder->AddCharacter(first_digit); | |
371 Advance(2); | |
372 } else { | |
373 return ReportError(CStrVector("Invalid escape")); | |
374 } | |
375 break; | 388 break; |
376 } | 389 } |
377 } | 390 } |
378 // FALLTHROUGH | 391 // FALLTHROUGH |
379 case '0': { | 392 case '0': { |
380 Advance(); | 393 Advance(); |
| 394 if (unicode() && Next() >= '0' && Next() <= '9') { |
| 395 // With /u, decimal escapes with a leading 0 are not parsed as octal. |
| 396 return ReportError(CStrVector("Invalid decimal escape")); |
| 397 } |
381 uc32 octal = ParseOctalLiteral(); | 398 uc32 octal = ParseOctalLiteral(); |
382 builder->AddCharacter(octal); | 399 builder->AddCharacter(octal); |
383 break; | 400 break; |
384 } | 401 } |
385 // ControlEscape :: one of | 402 // ControlEscape :: one of |
386 // f n r t v | 403 // f n r t v |
387 case 'f': | 404 case 'f': |
388 Advance(2); | 405 Advance(2); |
389 builder->AddCharacter('\f'); | 406 builder->AddCharacter('\f'); |
390 break; | 407 break; |
(...skipping 17 matching lines...) Expand all Loading... |
408 Advance(); | 425 Advance(); |
409 uc32 controlLetter = Next(); | 426 uc32 controlLetter = Next(); |
410 // Special case if it is an ASCII letter. | 427 // Special case if it is an ASCII letter. |
411 // Convert lower case letters to uppercase. | 428 // Convert lower case letters to uppercase. |
412 uc32 letter = controlLetter & ~('a' ^ 'A'); | 429 uc32 letter = controlLetter & ~('a' ^ 'A'); |
413 if (letter < 'A' || 'Z' < letter) { | 430 if (letter < 'A' || 'Z' < letter) { |
414 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. | 431 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. |
415 // This is outside the specification. We match JSC in | 432 // This is outside the specification. We match JSC in |
416 // reading the backslash as a literal character instead | 433 // reading the backslash as a literal character instead |
417 // of as starting an escape. | 434 // of as starting an escape. |
| 435 if (unicode()) { |
| 436 // With /u, invalid escapes are not treated as identity escapes. |
| 437 return ReportError(CStrVector("Invalid unicode escape")); |
| 438 } |
418 builder->AddCharacter('\\'); | 439 builder->AddCharacter('\\'); |
419 } else { | 440 } else { |
420 Advance(2); | 441 Advance(2); |
421 builder->AddCharacter(controlLetter & 0x1f); | 442 builder->AddCharacter(controlLetter & 0x1f); |
422 } | 443 } |
423 break; | 444 break; |
424 } | 445 } |
425 case 'x': { | 446 case 'x': { |
426 Advance(2); | 447 Advance(2); |
427 uc32 value; | 448 uc32 value; |
428 if (ParseHexEscape(2, &value)) { | 449 if (ParseHexEscape(2, &value)) { |
429 builder->AddCharacter(value); | 450 builder->AddCharacter(value); |
430 } else if (!unicode()) { | 451 } else if (!unicode()) { |
431 builder->AddCharacter('x'); | 452 builder->AddCharacter('x'); |
432 } else { | 453 } else { |
433 // If the 'u' flag is present, invalid escapes are not treated as | 454 // With /u, invalid escapes are not treated as identity escapes. |
434 // identity escapes. | |
435 return ReportError(CStrVector("Invalid escape")); | 455 return ReportError(CStrVector("Invalid escape")); |
436 } | 456 } |
437 break; | 457 break; |
438 } | 458 } |
439 case 'u': { | 459 case 'u': { |
440 Advance(2); | 460 Advance(2); |
441 uc32 value; | 461 uc32 value; |
442 if (ParseUnicodeEscape(&value)) { | 462 if (ParseUnicodeEscape(&value)) { |
443 builder->AddUnicodeCharacter(value); | 463 builder->AddUnicodeCharacter(value); |
444 } else if (!unicode()) { | 464 } else if (!unicode()) { |
445 builder->AddCharacter('u'); | 465 builder->AddCharacter('u'); |
446 } else { | 466 } else { |
447 // If the 'u' flag is present, invalid escapes are not treated as | 467 // With /u, invalid escapes are not treated as identity escapes. |
448 // identity escapes. | |
449 return ReportError(CStrVector("Invalid unicode escape")); | 468 return ReportError(CStrVector("Invalid unicode escape")); |
450 } | 469 } |
451 break; | 470 break; |
452 } | 471 } |
453 default: | 472 default: |
454 Advance(); | 473 Advance(); |
455 // If the 'u' flag is present, only syntax characters can be | 474 // With /u, no identity escapes except for syntax characters |
456 // escaped, no | 475 // are allowed. Otherwise, all identity escapes are allowed. |
457 // other identity escapes are allowed. If the 'u' flag is not | 476 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { |
458 // present, | |
459 // all identity escapes are allowed. | |
460 if (!unicode() || IsSyntaxCharacter(current())) { | |
461 builder->AddCharacter(current()); | 477 builder->AddCharacter(current()); |
462 Advance(); | 478 Advance(); |
463 } else { | 479 } else { |
464 return ReportError(CStrVector("Invalid escape")); | 480 return ReportError(CStrVector("Invalid escape")); |
465 } | 481 } |
466 break; | 482 break; |
467 } | 483 } |
468 break; | 484 break; |
469 case '{': { | 485 case '{': { |
470 int dummy; | 486 int dummy; |
471 if (ParseIntervalQuantifier(&dummy, &dummy)) { | 487 if (ParseIntervalQuantifier(&dummy, &dummy)) { |
472 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); | 488 return ReportError(CStrVector("Nothing to repeat")); |
473 } | 489 } |
474 // fallthrough | 490 // fallthrough |
475 } | 491 } |
| 492 case '}': |
| 493 case ']': |
| 494 if (unicode()) { |
| 495 return ReportError(CStrVector("Lone quantifier brackets")); |
| 496 } |
| 497 // fallthrough |
476 default: | 498 default: |
477 builder->AddUnicodeCharacter(current()); | 499 builder->AddUnicodeCharacter(current()); |
478 Advance(); | 500 Advance(); |
479 break; | 501 break; |
480 } // end switch(current()) | 502 } // end switch(current()) |
481 | 503 |
482 int min; | 504 int min; |
483 int max; | 505 int max; |
484 switch (current()) { | 506 switch (current()) { |
485 // QuantifierPrefix :: | 507 // QuantifierPrefix :: |
(...skipping 12 matching lines...) Expand all Loading... |
498 Advance(); | 520 Advance(); |
499 break; | 521 break; |
500 case '?': | 522 case '?': |
501 min = 0; | 523 min = 0; |
502 max = 1; | 524 max = 1; |
503 Advance(); | 525 Advance(); |
504 break; | 526 break; |
505 case '{': | 527 case '{': |
506 if (ParseIntervalQuantifier(&min, &max)) { | 528 if (ParseIntervalQuantifier(&min, &max)) { |
507 if (max < min) { | 529 if (max < min) { |
508 ReportError(CStrVector("numbers out of order in {} quantifier.") | 530 return ReportError( |
509 CHECK_FAILED); | 531 CStrVector("numbers out of order in {} quantifier")); |
510 } | 532 } |
511 break; | 533 break; |
512 } else { | 534 } else if (unicode()) { |
513 continue; | 535 // With /u, incomplete quantifiers are not allowed. |
| 536 return ReportError(CStrVector("Incomplete quantifier")); |
514 } | 537 } |
| 538 continue; |
515 default: | 539 default: |
516 continue; | 540 continue; |
517 } | 541 } |
518 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; | 542 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
519 if (current() == '?') { | 543 if (current() == '?') { |
520 quantifier_type = RegExpQuantifier::NON_GREEDY; | 544 quantifier_type = RegExpQuantifier::NON_GREEDY; |
521 Advance(); | 545 Advance(); |
522 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { | 546 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
523 // FLAG_regexp_possessive_quantifier is a debug-only flag. | 547 // FLAG_regexp_possessive_quantifier is a debug-only flag. |
524 quantifier_type = RegExpQuantifier::POSSESSIVE; | 548 quantifier_type = RegExpQuantifier::POSSESSIVE; |
525 Advance(); | 549 Advance(); |
526 } | 550 } |
527 builder->AddQuantifierToAtom(min, max, quantifier_type); | 551 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { |
| 552 return ReportError(CStrVector("Invalid quantifier")); |
| 553 } |
528 } | 554 } |
529 } | 555 } |
530 | 556 |
531 | 557 |
532 #ifdef DEBUG | 558 #ifdef DEBUG |
533 // Currently only used in a DCHECK. | 559 // Currently only used in a DCHECK. |
534 static bool IsSpecialClassEscape(uc32 c) { | 560 static bool IsSpecialClassEscape(uc32 c) { |
535 switch (c) { | 561 switch (c) { |
536 case 'd': | 562 case 'd': |
537 case 'D': | 563 case 'D': |
(...skipping 277 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
815 return '\r'; | 841 return '\r'; |
816 case 't': | 842 case 't': |
817 Advance(); | 843 Advance(); |
818 return '\t'; | 844 return '\t'; |
819 case 'v': | 845 case 'v': |
820 Advance(); | 846 Advance(); |
821 return '\v'; | 847 return '\v'; |
822 case 'c': { | 848 case 'c': { |
823 uc32 controlLetter = Next(); | 849 uc32 controlLetter = Next(); |
824 uc32 letter = controlLetter & ~('A' ^ 'a'); | 850 uc32 letter = controlLetter & ~('A' ^ 'a'); |
825 // For compatibility with JSC, inside a character class | 851 // For compatibility with JSC, inside a character class we also accept |
826 // we also accept digits and underscore as control characters. | 852 // digits and underscore as control characters, unless with /u. |
827 if ((controlLetter >= '0' && controlLetter <= '9') || | 853 if (letter >= 'A' && letter <= 'Z') { |
828 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { | |
829 Advance(2); | 854 Advance(2); |
830 // Control letters mapped to ASCII control characters in the range | 855 // Control letters mapped to ASCII control characters in the range |
831 // 0x00-0x1f. | 856 // 0x00-0x1f. |
832 return controlLetter & 0x1f; | 857 return controlLetter & 0x1f; |
833 } | 858 } |
| 859 if (unicode()) { |
| 860 // With /u, invalid escapes are not treated as identity escapes. |
| 861 ReportError(CStrVector("Invalid class escape")); |
| 862 return 0; |
| 863 } |
| 864 if ((controlLetter >= '0' && controlLetter <= '9') || |
| 865 controlLetter == '_') { |
| 866 Advance(2); |
| 867 return controlLetter & 0x1f; |
| 868 } |
834 // We match JSC in reading the backslash as a literal | 869 // We match JSC in reading the backslash as a literal |
835 // character instead of as starting an escape. | 870 // character instead of as starting an escape. |
836 return '\\'; | 871 return '\\'; |
837 } | 872 } |
838 case '0': | 873 case '0': |
839 case '1': | 874 case '1': |
840 case '2': | 875 case '2': |
841 case '3': | 876 case '3': |
842 case '4': | 877 case '4': |
843 case '5': | 878 case '5': |
844 case '6': | 879 case '6': |
845 case '7': | 880 case '7': |
846 // For compatibility, we interpret a decimal escape that isn't | 881 // For compatibility, we interpret a decimal escape that isn't |
847 // a back reference (and therefore either \0 or not valid according | 882 // a back reference (and therefore either \0 or not valid according |
848 // to the specification) as a 1..3 digit octal character code. | 883 // to the specification) as a 1..3 digit octal character code. |
| 884 if (unicode()) { |
| 885 // With /u, decimal escape is not interpreted as octal character code. |
| 886 ReportError(CStrVector("Invalid class escape")); |
| 887 return 0; |
| 888 } |
849 return ParseOctalLiteral(); | 889 return ParseOctalLiteral(); |
850 case 'x': { | 890 case 'x': { |
851 Advance(); | 891 Advance(); |
852 uc32 value; | 892 uc32 value; |
853 if (ParseHexEscape(2, &value)) { | 893 if (ParseHexEscape(2, &value)) return value; |
854 return value; | 894 if (unicode()) { |
| 895 // With /u, invalid escapes are not treated as identity escapes. |
| 896 ReportError(CStrVector("Invalid escape")); |
| 897 return 0; |
855 } | 898 } |
856 if (!unicode()) { | 899 // If \x is not followed by a two-digit hexadecimal, treat it |
857 // If \x is not followed by a two-digit hexadecimal, treat it | 900 // as an identity escape. |
858 // as an identity escape. | 901 return 'x'; |
859 return 'x'; | |
860 } | |
861 // If the 'u' flag is present, invalid escapes are not treated as | |
862 // identity escapes. | |
863 ReportError(CStrVector("Invalid escape")); | |
864 return 0; | |
865 } | 902 } |
866 case 'u': { | 903 case 'u': { |
867 Advance(); | 904 Advance(); |
868 uc32 value; | 905 uc32 value; |
869 if (ParseUnicodeEscape(&value)) { | 906 if (ParseUnicodeEscape(&value)) return value; |
870 return value; | 907 if (unicode()) { |
| 908 // With /u, invalid escapes are not treated as identity escapes. |
| 909 ReportError(CStrVector("Invalid unicode escape")); |
| 910 return 0; |
871 } | 911 } |
872 if (!unicode()) { | 912 // If \u is not followed by a valid unicode escape, treat it |
873 return 'u'; | 913 // as an identity escape. |
874 } | 914 return 'u'; |
875 // If the 'u' flag is present, invalid escapes are not treated as | |
876 // identity escapes. | |
877 ReportError(CStrVector("Invalid unicode escape")); | |
878 return 0; | |
879 } | 915 } |
880 default: { | 916 default: { |
881 uc32 result = current(); | 917 uc32 result = current(); |
882 // If the 'u' flag is present, only syntax characters can be escaped, no | 918 // With /u, no identity escapes except for syntax characters are |
883 // other identity escapes are allowed. If the 'u' flag is not present, all | 919 // allowed. Otherwise, all identity escapes are allowed. |
884 // identity escapes are allowed. | 920 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { |
885 if (!unicode() || IsSyntaxCharacter(result)) { | |
886 Advance(); | 921 Advance(); |
887 return result; | 922 return result; |
888 } | 923 } |
889 ReportError(CStrVector("Invalid escape")); | 924 ReportError(CStrVector("Invalid escape")); |
890 return 0; | 925 return 0; |
891 } | 926 } |
892 } | 927 } |
893 return 0; | 928 return 0; |
894 } | 929 } |
895 | 930 |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
949 if (char_class != kNoCharClass) { | 984 if (char_class != kNoCharClass) { |
950 CharacterRange::AddClassEscape(char_class, ranges, zone); | 985 CharacterRange::AddClassEscape(char_class, ranges, zone); |
951 } else { | 986 } else { |
952 ranges->Add(range, zone); | 987 ranges->Add(range, zone); |
953 } | 988 } |
954 } | 989 } |
955 | 990 |
956 | 991 |
957 RegExpTree* RegExpParser::ParseCharacterClass() { | 992 RegExpTree* RegExpParser::ParseCharacterClass() { |
958 static const char* kUnterminated = "Unterminated character class"; | 993 static const char* kUnterminated = "Unterminated character class"; |
| 994 static const char* kRangeInvalid = "Invalid character class"; |
959 static const char* kRangeOutOfOrder = "Range out of order in character class"; | 995 static const char* kRangeOutOfOrder = "Range out of order in character class"; |
960 | 996 |
961 DCHECK_EQ(current(), '['); | 997 DCHECK_EQ(current(), '['); |
962 Advance(); | 998 Advance(); |
963 bool is_negated = false; | 999 bool is_negated = false; |
964 if (current() == '^') { | 1000 if (current() == '^') { |
965 is_negated = true; | 1001 is_negated = true; |
966 Advance(); | 1002 Advance(); |
967 } | 1003 } |
968 ZoneList<CharacterRange>* ranges = | 1004 ZoneList<CharacterRange>* ranges = |
969 new (zone()) ZoneList<CharacterRange>(2, zone()); | 1005 new (zone()) ZoneList<CharacterRange>(2, zone()); |
970 while (has_more() && current() != ']') { | 1006 while (has_more() && current() != ']') { |
971 uc16 char_class = kNoCharClass; | 1007 uc16 char_class = kNoCharClass; |
972 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); | 1008 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); |
973 if (current() == '-') { | 1009 if (current() == '-') { |
974 Advance(); | 1010 Advance(); |
975 if (current() == kEndMarker) { | 1011 if (current() == kEndMarker) { |
976 // If we reach the end we break out of the loop and let the | 1012 // If we reach the end we break out of the loop and let the |
977 // following code report an error. | 1013 // following code report an error. |
978 break; | 1014 break; |
979 } else if (current() == ']') { | 1015 } else if (current() == ']') { |
980 AddRangeOrEscape(ranges, char_class, first, zone()); | 1016 AddRangeOrEscape(ranges, char_class, first, zone()); |
981 ranges->Add(CharacterRange::Singleton('-'), zone()); | 1017 ranges->Add(CharacterRange::Singleton('-'), zone()); |
982 break; | 1018 break; |
983 } | 1019 } |
984 uc16 char_class_2 = kNoCharClass; | 1020 uc16 char_class_2 = kNoCharClass; |
985 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); | 1021 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
986 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { | 1022 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
987 // Either end is an escaped character class. Treat the '-' verbatim. | 1023 // Either end is an escaped character class. Treat the '-' verbatim. |
| 1024 if (unicode()) { |
| 1025 // ES2015 21.2.2.15.1 step 1. |
| 1026 return ReportError(CStrVector(kRangeInvalid)); |
| 1027 } |
988 AddRangeOrEscape(ranges, char_class, first, zone()); | 1028 AddRangeOrEscape(ranges, char_class, first, zone()); |
989 ranges->Add(CharacterRange::Singleton('-'), zone()); | 1029 ranges->Add(CharacterRange::Singleton('-'), zone()); |
990 AddRangeOrEscape(ranges, char_class_2, next, zone()); | 1030 AddRangeOrEscape(ranges, char_class_2, next, zone()); |
991 continue; | 1031 continue; |
992 } | 1032 } |
| 1033 // ES2015 21.2.2.15.1 step 6. |
993 if (first.from() > next.to()) { | 1034 if (first.from() > next.to()) { |
994 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); | 1035 return ReportError(CStrVector(kRangeOutOfOrder)); |
995 } | 1036 } |
996 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); | 1037 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); |
997 } else { | 1038 } else { |
998 AddRangeOrEscape(ranges, char_class, first, zone()); | 1039 AddRangeOrEscape(ranges, char_class, first, zone()); |
999 } | 1040 } |
1000 } | 1041 } |
1001 if (!has_more()) { | 1042 if (!has_more()) { |
1002 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); | 1043 return ReportError(CStrVector(kUnterminated)); |
1003 } | 1044 } |
1004 Advance(); | 1045 Advance(); |
1005 if (ranges->length() == 0) { | 1046 if (ranges->length() == 0) { |
1006 ranges->Add(CharacterRange::Everything(), zone()); | 1047 ranges->Add(CharacterRange::Everything(), zone()); |
1007 is_negated = !is_negated; | 1048 is_negated = !is_negated; |
1008 } | 1049 } |
1009 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 1050 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
1010 } | 1051 } |
1011 | 1052 |
1012 | 1053 |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1155 AddCharacter(static_cast<uc16>(c)); | 1196 AddCharacter(static_cast<uc16>(c)); |
1156 } | 1197 } |
1157 } | 1198 } |
1158 | 1199 |
1159 | 1200 |
1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1201 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1161 | 1202 |
1162 | 1203 |
1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1204 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1164 if (NeedsDesugaringForUnicode(cc)) { | 1205 if (NeedsDesugaringForUnicode(cc)) { |
1165 // In unicode mode, character class needs to be desugared, so it | 1206 // With /u, character class needs to be desugared, so it |
1166 // must be a standalone term instead of being part of a RegExpText. | 1207 // must be a standalone term instead of being part of a RegExpText. |
1167 AddTerm(cc); | 1208 AddTerm(cc); |
1168 } else { | 1209 } else { |
1169 AddAtom(cc); | 1210 AddAtom(cc); |
1170 } | 1211 } |
1171 } | 1212 } |
1172 | 1213 |
1173 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { | 1214 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { |
1174 AddTerm(new (zone()) RegExpCharacterClass( | 1215 AddTerm(new (zone()) RegExpCharacterClass( |
1175 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); | 1216 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1268 | 1309 |
1269 | 1310 |
1270 RegExpTree* RegExpBuilder::ToRegExp() { | 1311 RegExpTree* RegExpBuilder::ToRegExp() { |
1271 FlushTerms(); | 1312 FlushTerms(); |
1272 int num_alternatives = alternatives_.length(); | 1313 int num_alternatives = alternatives_.length(); |
1273 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1314 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1274 if (num_alternatives == 1) return alternatives_.last(); | 1315 if (num_alternatives == 1) return alternatives_.last(); |
1275 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1316 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1276 } | 1317 } |
1277 | 1318 |
1278 | 1319 bool RegExpBuilder::AddQuantifierToAtom( |
1279 void RegExpBuilder::AddQuantifierToAtom( | |
1280 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1320 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
1281 FlushPendingSurrogate(); | 1321 FlushPendingSurrogate(); |
1282 if (pending_empty_) { | 1322 if (pending_empty_) { |
1283 pending_empty_ = false; | 1323 pending_empty_ = false; |
1284 return; | 1324 return true; |
1285 } | 1325 } |
1286 RegExpTree* atom; | 1326 RegExpTree* atom; |
1287 if (characters_ != NULL) { | 1327 if (characters_ != NULL) { |
1288 DCHECK(last_added_ == ADD_CHAR); | 1328 DCHECK(last_added_ == ADD_CHAR); |
1289 // Last atom was character. | 1329 // Last atom was character. |
1290 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1330 Vector<const uc16> char_vector = characters_->ToConstVector(); |
1291 int num_chars = char_vector.length(); | 1331 int num_chars = char_vector.length(); |
1292 if (num_chars > 1) { | 1332 if (num_chars > 1) { |
1293 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); | 1333 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); |
1294 text_.Add(new (zone()) RegExpAtom(prefix), zone()); | 1334 text_.Add(new (zone()) RegExpAtom(prefix), zone()); |
1295 char_vector = char_vector.SubVector(num_chars - 1, num_chars); | 1335 char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
1296 } | 1336 } |
1297 characters_ = NULL; | 1337 characters_ = NULL; |
1298 atom = new (zone()) RegExpAtom(char_vector); | 1338 atom = new (zone()) RegExpAtom(char_vector); |
1299 FlushText(); | 1339 FlushText(); |
1300 } else if (text_.length() > 0) { | 1340 } else if (text_.length() > 0) { |
1301 DCHECK(last_added_ == ADD_ATOM); | 1341 DCHECK(last_added_ == ADD_ATOM); |
1302 atom = text_.RemoveLast(); | 1342 atom = text_.RemoveLast(); |
1303 FlushText(); | 1343 FlushText(); |
1304 } else if (terms_.length() > 0) { | 1344 } else if (terms_.length() > 0) { |
1305 DCHECK(last_added_ == ADD_ATOM); | 1345 DCHECK(last_added_ == ADD_ATOM); |
1306 atom = terms_.RemoveLast(); | 1346 atom = terms_.RemoveLast(); |
| 1347 // With /u, lookarounds are not quantifiable. |
| 1348 if (unicode() && atom->IsLookaround()) return false; |
1307 if (atom->max_match() == 0) { | 1349 if (atom->max_match() == 0) { |
1308 // Guaranteed to only match an empty string. | 1350 // Guaranteed to only match an empty string. |
1309 LAST(ADD_TERM); | 1351 LAST(ADD_TERM); |
1310 if (min == 0) { | 1352 if (min == 0) { |
1311 return; | 1353 return true; |
1312 } | 1354 } |
1313 terms_.Add(atom, zone()); | 1355 terms_.Add(atom, zone()); |
1314 return; | 1356 return true; |
1315 } | 1357 } |
1316 } else { | 1358 } else { |
1317 // Only call immediately after adding an atom or character! | 1359 // Only call immediately after adding an atom or character! |
1318 UNREACHABLE(); | 1360 UNREACHABLE(); |
1319 return; | 1361 return false; |
1320 } | 1362 } |
1321 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1363 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1322 zone()); | 1364 zone()); |
1323 LAST(ADD_TERM); | 1365 LAST(ADD_TERM); |
| 1366 return true; |
1324 } | 1367 } |
1325 | 1368 |
1326 } // namespace internal | 1369 } // namespace internal |
1327 } // namespace v8 | 1370 } // namespace v8 |
OLD | NEW |