OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
95 | 95 |
96 | 96 |
97 void RegExpParser::Advance(int dist) { | 97 void RegExpParser::Advance(int dist) { |
98 next_pos_ += dist - 1; | 98 next_pos_ += dist - 1; |
99 Advance(); | 99 Advance(); |
100 } | 100 } |
101 | 101 |
102 | 102 |
103 bool RegExpParser::simple() { return simple_; } | 103 bool RegExpParser::simple() { return simple_; } |
104 | 104 |
105 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { | 105 |
106 switch (c) { | 106 bool RegExpParser::IsSyntaxCharacter(uc32 c) { |
107 case '^': | 107 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || |
108 case '$': | 108 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || |
109 case '\\': | 109 c == '{' || c == '}' || c == '|'; |
110 case '.': | |
111 case '*': | |
112 case '+': | |
113 case '?': | |
114 case '(': | |
115 case ')': | |
116 case '[': | |
117 case ']': | |
118 case '{': | |
119 case '}': | |
120 case '|': | |
121 case '/': | |
122 return true; | |
123 default: | |
124 break; | |
125 } | |
126 return false; | |
127 } | 110 } |
128 | 111 |
129 | 112 |
130 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { | 113 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { |
131 failed_ = true; | 114 failed_ = true; |
132 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); | 115 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); |
133 // Zip to the end to make sure the no more input is read. | 116 // Zip to the end to make sure the no more input is read. |
134 current_ = kEndMarker; | 117 current_ = kEndMarker; |
135 next_pos_ = in()->length(); | 118 next_pos_ = in()->length(); |
136 return NULL; | 119 return NULL; |
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
171 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 154 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
172 flags_, zone()); | 155 flags_, zone()); |
173 RegExpParserState* state = &initial_state; | 156 RegExpParserState* state = &initial_state; |
174 // Cache the builder in a local variable for quick access. | 157 // Cache the builder in a local variable for quick access. |
175 RegExpBuilder* builder = initial_state.builder(); | 158 RegExpBuilder* builder = initial_state.builder(); |
176 while (true) { | 159 while (true) { |
177 switch (current()) { | 160 switch (current()) { |
178 case kEndMarker: | 161 case kEndMarker: |
179 if (state->IsSubexpression()) { | 162 if (state->IsSubexpression()) { |
180 // Inside a parenthesized group when hitting end of input. | 163 // Inside a parenthesized group when hitting end of input. |
181 return ReportError(CStrVector("Unterminated group")); | 164 ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
182 } | 165 } |
183 DCHECK_EQ(INITIAL, state->group_type()); | 166 DCHECK_EQ(INITIAL, state->group_type()); |
184 // Parsing completed successfully. | 167 // Parsing completed successfully. |
185 return builder->ToRegExp(); | 168 return builder->ToRegExp(); |
186 case ')': { | 169 case ')': { |
187 if (!state->IsSubexpression()) { | 170 if (!state->IsSubexpression()) { |
188 return ReportError(CStrVector("Unmatched ')'")); | 171 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); |
189 } | 172 } |
190 DCHECK_NE(INITIAL, state->group_type()); | 173 DCHECK_NE(INITIAL, state->group_type()); |
191 | 174 |
192 Advance(); | 175 Advance(); |
193 // End disjunction parsing and convert builder content to new single | 176 // End disjunction parsing and convert builder content to new single |
194 // regexp atom. | 177 // regexp atom. |
195 RegExpTree* body = builder->ToRegExp(); | 178 RegExpTree* body = builder->ToRegExp(); |
196 | 179 |
197 int end_capture_index = captures_started(); | 180 int end_capture_index = captures_started(); |
198 | 181 |
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
286 if (Next() == '=') { | 269 if (Next() == '=') { |
287 subexpr_type = POSITIVE_LOOKAROUND; | 270 subexpr_type = POSITIVE_LOOKAROUND; |
288 break; | 271 break; |
289 } else if (Next() == '!') { | 272 } else if (Next() == '!') { |
290 subexpr_type = NEGATIVE_LOOKAROUND; | 273 subexpr_type = NEGATIVE_LOOKAROUND; |
291 break; | 274 break; |
292 } | 275 } |
293 } | 276 } |
294 // Fall through. | 277 // Fall through. |
295 default: | 278 default: |
296 return ReportError(CStrVector("Invalid group")); | 279 ReportError(CStrVector("Invalid group") CHECK_FAILED); |
| 280 break; |
297 } | 281 } |
298 Advance(2); | 282 Advance(2); |
299 } else { | 283 } else { |
300 if (captures_started_ >= kMaxCaptures) { | 284 if (captures_started_ >= kMaxCaptures) { |
301 return ReportError(CStrVector("Too many captures")); | 285 ReportError(CStrVector("Too many captures") CHECK_FAILED); |
302 } | 286 } |
303 captures_started_++; | 287 captures_started_++; |
304 } | 288 } |
305 // Store current state and begin new disjunction parsing. | 289 // Store current state and begin new disjunction parsing. |
306 state = | 290 state = |
307 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, | 291 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
308 captures_started_, flags_, zone()); | 292 captures_started_, flags_, zone()); |
309 builder = state->builder(); | 293 builder = state->builder(); |
310 continue; | 294 continue; |
311 } | 295 } |
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
369 // the capture registers of the referenced capture are either | 353 // the capture registers of the referenced capture are either |
370 // both set or both cleared. | 354 // both set or both cleared. |
371 builder->AddEmpty(); | 355 builder->AddEmpty(); |
372 } else { | 356 } else { |
373 RegExpCapture* capture = GetCapture(index); | 357 RegExpCapture* capture = GetCapture(index); |
374 RegExpTree* atom = new (zone()) RegExpBackReference(capture); | 358 RegExpTree* atom = new (zone()) RegExpBackReference(capture); |
375 builder->AddAtom(atom); | 359 builder->AddAtom(atom); |
376 } | 360 } |
377 break; | 361 break; |
378 } | 362 } |
379 // With /u, no identity escapes except for syntax characters | |
380 // are allowed. Otherwise, all identity escapes are allowed. | |
381 if (unicode()) { | |
382 return ReportError(CStrVector("Invalid escape")); | |
383 } | |
384 uc32 first_digit = Next(); | 363 uc32 first_digit = Next(); |
385 if (first_digit == '8' || first_digit == '9') { | 364 if (first_digit == '8' || first_digit == '9') { |
386 builder->AddCharacter(first_digit); | 365 // If the 'u' flag is present, only syntax characters can be |
387 Advance(2); | 366 // escaped, |
| 367 // no other identity escapes are allowed. If the 'u' flag is not |
| 368 // present, all identity escapes are allowed. |
| 369 if (!unicode()) { |
| 370 builder->AddCharacter(first_digit); |
| 371 Advance(2); |
| 372 } else { |
| 373 return ReportError(CStrVector("Invalid escape")); |
| 374 } |
388 break; | 375 break; |
389 } | 376 } |
390 } | 377 } |
391 // FALLTHROUGH | 378 // FALLTHROUGH |
392 case '0': { | 379 case '0': { |
393 Advance(); | 380 Advance(); |
394 if (unicode() && Next() >= '0' && Next() <= '9') { | |
395 // With /u, decimal escape with leading 0 are not parsed as octal. | |
396 return ReportError(CStrVector("Invalid decimal escape")); | |
397 } | |
398 uc32 octal = ParseOctalLiteral(); | 381 uc32 octal = ParseOctalLiteral(); |
399 builder->AddCharacter(octal); | 382 builder->AddCharacter(octal); |
400 break; | 383 break; |
401 } | 384 } |
402 // ControlEscape :: one of | 385 // ControlEscape :: one of |
403 // f n r t v | 386 // f n r t v |
404 case 'f': | 387 case 'f': |
405 Advance(2); | 388 Advance(2); |
406 builder->AddCharacter('\f'); | 389 builder->AddCharacter('\f'); |
407 break; | 390 break; |
(...skipping 17 matching lines...) Expand all Loading... |
425 Advance(); | 408 Advance(); |
426 uc32 controlLetter = Next(); | 409 uc32 controlLetter = Next(); |
427 // Special case if it is an ASCII letter. | 410 // Special case if it is an ASCII letter. |
428 // Convert lower case letters to uppercase. | 411 // Convert lower case letters to uppercase. |
429 uc32 letter = controlLetter & ~('a' ^ 'A'); | 412 uc32 letter = controlLetter & ~('a' ^ 'A'); |
430 if (letter < 'A' || 'Z' < letter) { | 413 if (letter < 'A' || 'Z' < letter) { |
431 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. | 414 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. |
432 // This is outside the specification. We match JSC in | 415 // This is outside the specification. We match JSC in |
433 // reading the backslash as a literal character instead | 416 // reading the backslash as a literal character instead |
434 // of as starting an escape. | 417 // of as starting an escape. |
435 if (unicode()) { | |
436 // With /u, invalid escapes are not treated as identity escapes. | |
437 return ReportError(CStrVector("Invalid unicode escape")); | |
438 } | |
439 builder->AddCharacter('\\'); | 418 builder->AddCharacter('\\'); |
440 } else { | 419 } else { |
441 Advance(2); | 420 Advance(2); |
442 builder->AddCharacter(controlLetter & 0x1f); | 421 builder->AddCharacter(controlLetter & 0x1f); |
443 } | 422 } |
444 break; | 423 break; |
445 } | 424 } |
446 case 'x': { | 425 case 'x': { |
447 Advance(2); | 426 Advance(2); |
448 uc32 value; | 427 uc32 value; |
449 if (ParseHexEscape(2, &value)) { | 428 if (ParseHexEscape(2, &value)) { |
450 builder->AddCharacter(value); | 429 builder->AddCharacter(value); |
451 } else if (!unicode()) { | 430 } else if (!unicode()) { |
452 builder->AddCharacter('x'); | 431 builder->AddCharacter('x'); |
453 } else { | 432 } else { |
454 // With /u, invalid escapes are not treated as identity escapes. | 433 // If the 'u' flag is present, invalid escapes are not treated as |
| 434 // identity escapes. |
455 return ReportError(CStrVector("Invalid escape")); | 435 return ReportError(CStrVector("Invalid escape")); |
456 } | 436 } |
457 break; | 437 break; |
458 } | 438 } |
459 case 'u': { | 439 case 'u': { |
460 Advance(2); | 440 Advance(2); |
461 uc32 value; | 441 uc32 value; |
462 if (ParseUnicodeEscape(&value)) { | 442 if (ParseUnicodeEscape(&value)) { |
463 builder->AddUnicodeCharacter(value); | 443 builder->AddUnicodeCharacter(value); |
464 } else if (!unicode()) { | 444 } else if (!unicode()) { |
465 builder->AddCharacter('u'); | 445 builder->AddCharacter('u'); |
466 } else { | 446 } else { |
467 // With /u, invalid escapes are not treated as identity escapes. | 447 // If the 'u' flag is present, invalid escapes are not treated as |
| 448 // identity escapes. |
468 return ReportError(CStrVector("Invalid unicode escape")); | 449 return ReportError(CStrVector("Invalid unicode escape")); |
469 } | 450 } |
470 break; | 451 break; |
471 } | 452 } |
472 default: | 453 default: |
473 Advance(); | 454 Advance(); |
474 // With /u, no identity escapes except for syntax characters | 455 // If the 'u' flag is present, only syntax characters can be |
475 // are allowed. Otherwise, all identity escapes are allowed. | 456 // escaped, no |
476 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { | 457 // other identity escapes are allowed. If the 'u' flag is not |
| 458 // present, |
| 459 // all identity escapes are allowed. |
| 460 if (!unicode() || IsSyntaxCharacter(current())) { |
477 builder->AddCharacter(current()); | 461 builder->AddCharacter(current()); |
478 Advance(); | 462 Advance(); |
479 } else { | 463 } else { |
480 return ReportError(CStrVector("Invalid escape")); | 464 return ReportError(CStrVector("Invalid escape")); |
481 } | 465 } |
482 break; | 466 break; |
483 } | 467 } |
484 break; | 468 break; |
485 case '{': { | 469 case '{': { |
486 int dummy; | 470 int dummy; |
487 if (ParseIntervalQuantifier(&dummy, &dummy)) { | 471 if (ParseIntervalQuantifier(&dummy, &dummy)) { |
488 return ReportError(CStrVector("Nothing to repeat")); | 472 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); |
489 } | 473 } |
490 // fallthrough | 474 // fallthrough |
491 } | 475 } |
492 case '}': | |
493 case ']': | |
494 if (unicode()) { | |
495 return ReportError(CStrVector("Lone quantifier brackets")); | |
496 } | |
497 // fallthrough | |
498 default: | 476 default: |
499 builder->AddUnicodeCharacter(current()); | 477 builder->AddUnicodeCharacter(current()); |
500 Advance(); | 478 Advance(); |
501 break; | 479 break; |
502 } // end switch(current()) | 480 } // end switch(current()) |
503 | 481 |
504 int min; | 482 int min; |
505 int max; | 483 int max; |
506 switch (current()) { | 484 switch (current()) { |
507 // QuantifierPrefix :: | 485 // QuantifierPrefix :: |
(...skipping 12 matching lines...) Expand all Loading... |
520 Advance(); | 498 Advance(); |
521 break; | 499 break; |
522 case '?': | 500 case '?': |
523 min = 0; | 501 min = 0; |
524 max = 1; | 502 max = 1; |
525 Advance(); | 503 Advance(); |
526 break; | 504 break; |
527 case '{': | 505 case '{': |
528 if (ParseIntervalQuantifier(&min, &max)) { | 506 if (ParseIntervalQuantifier(&min, &max)) { |
529 if (max < min) { | 507 if (max < min) { |
530 return ReportError( | 508 ReportError(CStrVector("numbers out of order in {} quantifier.") |
531 CStrVector("numbers out of order in {} quantifier")); | 509 CHECK_FAILED); |
532 } | 510 } |
533 break; | 511 break; |
534 } else if (unicode()) { | 512 } else { |
535 // With /u, incomplete quantifiers are not allowed. | 513 continue; |
536 return ReportError(CStrVector("Incomplete quantifier")); | |
537 } | 514 } |
538 continue; | |
539 default: | 515 default: |
540 continue; | 516 continue; |
541 } | 517 } |
542 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; | 518 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; |
543 if (current() == '?') { | 519 if (current() == '?') { |
544 quantifier_type = RegExpQuantifier::NON_GREEDY; | 520 quantifier_type = RegExpQuantifier::NON_GREEDY; |
545 Advance(); | 521 Advance(); |
546 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { | 522 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { |
547 // FLAG_regexp_possessive_quantifier is a debug-only flag. | 523 // FLAG_regexp_possessive_quantifier is a debug-only flag. |
548 quantifier_type = RegExpQuantifier::POSSESSIVE; | 524 quantifier_type = RegExpQuantifier::POSSESSIVE; |
549 Advance(); | 525 Advance(); |
550 } | 526 } |
551 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { | 527 builder->AddQuantifierToAtom(min, max, quantifier_type); |
552 return ReportError(CStrVector("Invalid quantifier")); | |
553 } | |
554 } | 528 } |
555 } | 529 } |
556 | 530 |
557 | 531 |
558 #ifdef DEBUG | 532 #ifdef DEBUG |
559 // Currently only used in an DCHECK. | 533 // Currently only used in an DCHECK. |
560 static bool IsSpecialClassEscape(uc32 c) { | 534 static bool IsSpecialClassEscape(uc32 c) { |
561 switch (c) { | 535 switch (c) { |
562 case 'd': | 536 case 'd': |
563 case 'D': | 537 case 'D': |
(...skipping 277 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
841 return '\r'; | 815 return '\r'; |
842 case 't': | 816 case 't': |
843 Advance(); | 817 Advance(); |
844 return '\t'; | 818 return '\t'; |
845 case 'v': | 819 case 'v': |
846 Advance(); | 820 Advance(); |
847 return '\v'; | 821 return '\v'; |
848 case 'c': { | 822 case 'c': { |
849 uc32 controlLetter = Next(); | 823 uc32 controlLetter = Next(); |
850 uc32 letter = controlLetter & ~('A' ^ 'a'); | 824 uc32 letter = controlLetter & ~('A' ^ 'a'); |
851 // For compatibility with JSC, inside a character class. We also accept | 825 // For compatibility with JSC, inside a character class |
852 // digits and underscore as control characters, unless with /u. | 826 // we also accept digits and underscore as control characters. |
853 if (letter >= 'A' && letter <= 'Z') { | 827 if ((controlLetter >= '0' && controlLetter <= '9') || |
| 828 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) { |
854 Advance(2); | 829 Advance(2); |
855 // Control letters mapped to ASCII control characters in the range | 830 // Control letters mapped to ASCII control characters in the range |
856 // 0x00-0x1f. | 831 // 0x00-0x1f. |
857 return controlLetter & 0x1f; | 832 return controlLetter & 0x1f; |
858 } | 833 } |
859 if (unicode()) { | |
860 // With /u, invalid escapes are not treated as identity escapes. | |
861 ReportError(CStrVector("Invalid class escape")); | |
862 return 0; | |
863 } | |
864 if ((controlLetter >= '0' && controlLetter <= '9') || | |
865 controlLetter == '_') { | |
866 Advance(2); | |
867 return controlLetter & 0x1f; | |
868 } | |
869 // We match JSC in reading the backslash as a literal | 834 // We match JSC in reading the backslash as a literal |
870 // character instead of as starting an escape. | 835 // character instead of as starting an escape. |
871 return '\\'; | 836 return '\\'; |
872 } | 837 } |
873 case '0': | 838 case '0': |
874 case '1': | 839 case '1': |
875 case '2': | 840 case '2': |
876 case '3': | 841 case '3': |
877 case '4': | 842 case '4': |
878 case '5': | 843 case '5': |
879 case '6': | 844 case '6': |
880 case '7': | 845 case '7': |
881 // For compatibility, we interpret a decimal escape that isn't | 846 // For compatibility, we interpret a decimal escape that isn't |
882 // a back reference (and therefore either \0 or not valid according | 847 // a back reference (and therefore either \0 or not valid according |
883 // to the specification) as a 1..3 digit octal character code. | 848 // to the specification) as a 1..3 digit octal character code. |
884 if (unicode()) { | |
885 // With /u, decimal escape is not interpreted as octal character code. | |
886 ReportError(CStrVector("Invalid class escape")); | |
887 return 0; | |
888 } | |
889 return ParseOctalLiteral(); | 849 return ParseOctalLiteral(); |
890 case 'x': { | 850 case 'x': { |
891 Advance(); | 851 Advance(); |
892 uc32 value; | 852 uc32 value; |
893 if (ParseHexEscape(2, &value)) return value; | 853 if (ParseHexEscape(2, &value)) { |
894 if (unicode()) { | 854 return value; |
895 // With /u, invalid escapes are not treated as identity escapes. | |
896 ReportError(CStrVector("Invalid escape")); | |
897 return 0; | |
898 } | 855 } |
899 // If \x is not followed by a two-digit hexadecimal, treat it | 856 if (!unicode()) { |
900 // as an identity escape. | 857 // If \x is not followed by a two-digit hexadecimal, treat it |
901 return 'x'; | 858 // as an identity escape. |
| 859 return 'x'; |
| 860 } |
| 861 // If the 'u' flag is present, invalid escapes are not treated as |
| 862 // identity escapes. |
| 863 ReportError(CStrVector("Invalid escape")); |
| 864 return 0; |
902 } | 865 } |
903 case 'u': { | 866 case 'u': { |
904 Advance(); | 867 Advance(); |
905 uc32 value; | 868 uc32 value; |
906 if (ParseUnicodeEscape(&value)) return value; | 869 if (ParseUnicodeEscape(&value)) { |
907 if (unicode()) { | 870 return value; |
908 // With /u, invalid escapes are not treated as identity escapes. | |
909 ReportError(CStrVector("Invalid unicode escape")); | |
910 return 0; | |
911 } | 871 } |
912 // If \u is not followed by a two-digit hexadecimal, treat it | 872 if (!unicode()) { |
913 // as an identity escape. | 873 return 'u'; |
914 return 'u'; | 874 } |
| 875 // If the 'u' flag is present, invalid escapes are not treated as |
| 876 // identity escapes. |
| 877 ReportError(CStrVector("Invalid unicode escape")); |
| 878 return 0; |
915 } | 879 } |
916 default: { | 880 default: { |
917 uc32 result = current(); | 881 uc32 result = current(); |
918 // With /u, no identity escapes except for syntax characters are | 882 // If the 'u' flag is present, only syntax characters can be escaped, no |
919 // allowed. Otherwise, all identity escapes are allowed. | 883 // other identity escapes are allowed. If the 'u' flag is not present, all |
920 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { | 884 // identity escapes are allowed. |
| 885 if (!unicode() || IsSyntaxCharacter(result)) { |
921 Advance(); | 886 Advance(); |
922 return result; | 887 return result; |
923 } | 888 } |
924 ReportError(CStrVector("Invalid escape")); | 889 ReportError(CStrVector("Invalid escape")); |
925 return 0; | 890 return 0; |
926 } | 891 } |
927 } | 892 } |
928 return 0; | 893 return 0; |
929 } | 894 } |
930 | 895 |
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
984 if (char_class != kNoCharClass) { | 949 if (char_class != kNoCharClass) { |
985 CharacterRange::AddClassEscape(char_class, ranges, zone); | 950 CharacterRange::AddClassEscape(char_class, ranges, zone); |
986 } else { | 951 } else { |
987 ranges->Add(range, zone); | 952 ranges->Add(range, zone); |
988 } | 953 } |
989 } | 954 } |
990 | 955 |
991 | 956 |
992 RegExpTree* RegExpParser::ParseCharacterClass() { | 957 RegExpTree* RegExpParser::ParseCharacterClass() { |
993 static const char* kUnterminated = "Unterminated character class"; | 958 static const char* kUnterminated = "Unterminated character class"; |
994 static const char* kRangeInvalid = "Invalid character class"; | |
995 static const char* kRangeOutOfOrder = "Range out of order in character class"; | 959 static const char* kRangeOutOfOrder = "Range out of order in character class"; |
996 | 960 |
997 DCHECK_EQ(current(), '['); | 961 DCHECK_EQ(current(), '['); |
998 Advance(); | 962 Advance(); |
999 bool is_negated = false; | 963 bool is_negated = false; |
1000 if (current() == '^') { | 964 if (current() == '^') { |
1001 is_negated = true; | 965 is_negated = true; |
1002 Advance(); | 966 Advance(); |
1003 } | 967 } |
1004 ZoneList<CharacterRange>* ranges = | 968 ZoneList<CharacterRange>* ranges = |
1005 new (zone()) ZoneList<CharacterRange>(2, zone()); | 969 new (zone()) ZoneList<CharacterRange>(2, zone()); |
1006 while (has_more() && current() != ']') { | 970 while (has_more() && current() != ']') { |
1007 uc16 char_class = kNoCharClass; | 971 uc16 char_class = kNoCharClass; |
1008 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); | 972 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); |
1009 if (current() == '-') { | 973 if (current() == '-') { |
1010 Advance(); | 974 Advance(); |
1011 if (current() == kEndMarker) { | 975 if (current() == kEndMarker) { |
1012 // If we reach the end we break out of the loop and let the | 976 // If we reach the end we break out of the loop and let the |
1013 // following code report an error. | 977 // following code report an error. |
1014 break; | 978 break; |
1015 } else if (current() == ']') { | 979 } else if (current() == ']') { |
1016 AddRangeOrEscape(ranges, char_class, first, zone()); | 980 AddRangeOrEscape(ranges, char_class, first, zone()); |
1017 ranges->Add(CharacterRange::Singleton('-'), zone()); | 981 ranges->Add(CharacterRange::Singleton('-'), zone()); |
1018 break; | 982 break; |
1019 } | 983 } |
1020 uc16 char_class_2 = kNoCharClass; | 984 uc16 char_class_2 = kNoCharClass; |
1021 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); | 985 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); |
1022 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { | 986 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { |
1023 // Either end is an escaped character class. Treat the '-' verbatim. | 987 // Either end is an escaped character class. Treat the '-' verbatim. |
1024 if (unicode()) { | |
1025 // ES2015 21.2.2.15.1 step 1. | |
1026 return ReportError(CStrVector(kRangeInvalid)); | |
1027 } | |
1028 AddRangeOrEscape(ranges, char_class, first, zone()); | 988 AddRangeOrEscape(ranges, char_class, first, zone()); |
1029 ranges->Add(CharacterRange::Singleton('-'), zone()); | 989 ranges->Add(CharacterRange::Singleton('-'), zone()); |
1030 AddRangeOrEscape(ranges, char_class_2, next, zone()); | 990 AddRangeOrEscape(ranges, char_class_2, next, zone()); |
1031 continue; | 991 continue; |
1032 } | 992 } |
1033 // ES2015 21.2.2.15.1 step 6. | |
1034 if (first.from() > next.to()) { | 993 if (first.from() > next.to()) { |
1035 return ReportError(CStrVector(kRangeOutOfOrder)); | 994 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); |
1036 } | 995 } |
1037 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); | 996 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); |
1038 } else { | 997 } else { |
1039 AddRangeOrEscape(ranges, char_class, first, zone()); | 998 AddRangeOrEscape(ranges, char_class, first, zone()); |
1040 } | 999 } |
1041 } | 1000 } |
1042 if (!has_more()) { | 1001 if (!has_more()) { |
1043 return ReportError(CStrVector(kUnterminated)); | 1002 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); |
1044 } | 1003 } |
1045 Advance(); | 1004 Advance(); |
1046 if (ranges->length() == 0) { | 1005 if (ranges->length() == 0) { |
1047 ranges->Add(CharacterRange::Everything(), zone()); | 1006 ranges->Add(CharacterRange::Everything(), zone()); |
1048 is_negated = !is_negated; | 1007 is_negated = !is_negated; |
1049 } | 1008 } |
1050 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 1009 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
1051 } | 1010 } |
1052 | 1011 |
1053 | 1012 |
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1196 AddCharacter(static_cast<uc16>(c)); | 1155 AddCharacter(static_cast<uc16>(c)); |
1197 } | 1156 } |
1198 } | 1157 } |
1199 | 1158 |
1200 | 1159 |
1201 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1202 | 1161 |
1203 | 1162 |
1204 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1205 if (NeedsDesugaringForUnicode(cc)) { | 1164 if (NeedsDesugaringForUnicode(cc)) { |
1206 // With /u, character class needs to be desugared, so it | 1165 // In unicode mode, character class needs to be desugared, so it |
1207 // must be a standalone term instead of being part of a RegExpText. | 1166 // must be a standalone term instead of being part of a RegExpText. |
1208 AddTerm(cc); | 1167 AddTerm(cc); |
1209 } else { | 1168 } else { |
1210 AddAtom(cc); | 1169 AddAtom(cc); |
1211 } | 1170 } |
1212 } | 1171 } |
1213 | 1172 |
1214 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { | 1173 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { |
1215 AddTerm(new (zone()) RegExpCharacterClass( | 1174 AddTerm(new (zone()) RegExpCharacterClass( |
1216 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); | 1175 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1309 | 1268 |
1310 | 1269 |
1311 RegExpTree* RegExpBuilder::ToRegExp() { | 1270 RegExpTree* RegExpBuilder::ToRegExp() { |
1312 FlushTerms(); | 1271 FlushTerms(); |
1313 int num_alternatives = alternatives_.length(); | 1272 int num_alternatives = alternatives_.length(); |
1314 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1273 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1315 if (num_alternatives == 1) return alternatives_.last(); | 1274 if (num_alternatives == 1) return alternatives_.last(); |
1316 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1275 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1317 } | 1276 } |
1318 | 1277 |
1319 bool RegExpBuilder::AddQuantifierToAtom( | 1278 |
| 1279 void RegExpBuilder::AddQuantifierToAtom( |
1320 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1280 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
1321 FlushPendingSurrogate(); | 1281 FlushPendingSurrogate(); |
1322 if (pending_empty_) { | 1282 if (pending_empty_) { |
1323 pending_empty_ = false; | 1283 pending_empty_ = false; |
1324 return true; | 1284 return; |
1325 } | 1285 } |
1326 RegExpTree* atom; | 1286 RegExpTree* atom; |
1327 if (characters_ != NULL) { | 1287 if (characters_ != NULL) { |
1328 DCHECK(last_added_ == ADD_CHAR); | 1288 DCHECK(last_added_ == ADD_CHAR); |
1329 // Last atom was character. | 1289 // Last atom was character. |
1330 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1290 Vector<const uc16> char_vector = characters_->ToConstVector(); |
1331 int num_chars = char_vector.length(); | 1291 int num_chars = char_vector.length(); |
1332 if (num_chars > 1) { | 1292 if (num_chars > 1) { |
1333 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); | 1293 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); |
1334 text_.Add(new (zone()) RegExpAtom(prefix), zone()); | 1294 text_.Add(new (zone()) RegExpAtom(prefix), zone()); |
1335 char_vector = char_vector.SubVector(num_chars - 1, num_chars); | 1295 char_vector = char_vector.SubVector(num_chars - 1, num_chars); |
1336 } | 1296 } |
1337 characters_ = NULL; | 1297 characters_ = NULL; |
1338 atom = new (zone()) RegExpAtom(char_vector); | 1298 atom = new (zone()) RegExpAtom(char_vector); |
1339 FlushText(); | 1299 FlushText(); |
1340 } else if (text_.length() > 0) { | 1300 } else if (text_.length() > 0) { |
1341 DCHECK(last_added_ == ADD_ATOM); | 1301 DCHECK(last_added_ == ADD_ATOM); |
1342 atom = text_.RemoveLast(); | 1302 atom = text_.RemoveLast(); |
1343 FlushText(); | 1303 FlushText(); |
1344 } else if (terms_.length() > 0) { | 1304 } else if (terms_.length() > 0) { |
1345 DCHECK(last_added_ == ADD_ATOM); | 1305 DCHECK(last_added_ == ADD_ATOM); |
1346 atom = terms_.RemoveLast(); | 1306 atom = terms_.RemoveLast(); |
1347 // With /u, lookarounds are not quantifiable. | |
1348 if (unicode() && atom->IsLookaround()) return false; | |
1349 if (atom->max_match() == 0) { | 1307 if (atom->max_match() == 0) { |
1350 // Guaranteed to only match an empty string. | 1308 // Guaranteed to only match an empty string. |
1351 LAST(ADD_TERM); | 1309 LAST(ADD_TERM); |
1352 if (min == 0) { | 1310 if (min == 0) { |
1353 return true; | 1311 return; |
1354 } | 1312 } |
1355 terms_.Add(atom, zone()); | 1313 terms_.Add(atom, zone()); |
1356 return true; | 1314 return; |
1357 } | 1315 } |
1358 } else { | 1316 } else { |
1359 // Only call immediately after adding an atom or character! | 1317 // Only call immediately after adding an atom or character! |
1360 UNREACHABLE(); | 1318 UNREACHABLE(); |
1361 return false; | 1319 return; |
1362 } | 1320 } |
1363 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1321 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1364 zone()); | 1322 zone()); |
1365 LAST(ADD_TERM); | 1323 LAST(ADD_TERM); |
1366 return true; | |
1367 } | 1324 } |
1368 | 1325 |
1369 } // namespace internal | 1326 } // namespace internal |
1370 } // namespace v8 | 1327 } // namespace v8 |
OLD | NEW |