Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(93)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1648673002: Revert of [regexp] restrict pattern syntax for unicode mode. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@stage
Patch Set: Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 95
96 96
97 void RegExpParser::Advance(int dist) { 97 void RegExpParser::Advance(int dist) {
98 next_pos_ += dist - 1; 98 next_pos_ += dist - 1;
99 Advance(); 99 Advance();
100 } 100 }
101 101
102 102
103 bool RegExpParser::simple() { return simple_; } 103 bool RegExpParser::simple() { return simple_; }
104 104
105 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) { 105
106 switch (c) { 106 bool RegExpParser::IsSyntaxCharacter(uc32 c) {
107 case '^': 107 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
108 case '$': 108 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
109 case '\\': 109 c == '{' || c == '}' || c == '|';
110 case '.':
111 case '*':
112 case '+':
113 case '?':
114 case '(':
115 case ')':
116 case '[':
117 case ']':
118 case '{':
119 case '}':
120 case '|':
121 case '/':
122 return true;
123 default:
124 break;
125 }
126 return false;
127 } 110 }
128 111
129 112
130 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 113 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
131 failed_ = true; 114 failed_ = true;
132 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 115 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
133 // Zip to the end to make sure that no more input is read. 116 // Zip to the end to make sure that no more input is read.
134 current_ = kEndMarker; 117 current_ = kEndMarker;
135 next_pos_ = in()->length(); 118 next_pos_ = in()->length();
136 return NULL; 119 return NULL;
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
171 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 154 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
172 flags_, zone()); 155 flags_, zone());
173 RegExpParserState* state = &initial_state; 156 RegExpParserState* state = &initial_state;
174 // Cache the builder in a local variable for quick access. 157 // Cache the builder in a local variable for quick access.
175 RegExpBuilder* builder = initial_state.builder(); 158 RegExpBuilder* builder = initial_state.builder();
176 while (true) { 159 while (true) {
177 switch (current()) { 160 switch (current()) {
178 case kEndMarker: 161 case kEndMarker:
179 if (state->IsSubexpression()) { 162 if (state->IsSubexpression()) {
180 // Inside a parenthesized group when hitting end of input. 163 // Inside a parenthesized group when hitting end of input.
181 return ReportError(CStrVector("Unterminated group")); 164 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
182 } 165 }
183 DCHECK_EQ(INITIAL, state->group_type()); 166 DCHECK_EQ(INITIAL, state->group_type());
184 // Parsing completed successfully. 167 // Parsing completed successfully.
185 return builder->ToRegExp(); 168 return builder->ToRegExp();
186 case ')': { 169 case ')': {
187 if (!state->IsSubexpression()) { 170 if (!state->IsSubexpression()) {
188 return ReportError(CStrVector("Unmatched ')'")); 171 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED);
189 } 172 }
190 DCHECK_NE(INITIAL, state->group_type()); 173 DCHECK_NE(INITIAL, state->group_type());
191 174
192 Advance(); 175 Advance();
193 // End disjunction parsing and convert builder content to new single 176 // End disjunction parsing and convert builder content to new single
194 // regexp atom. 177 // regexp atom.
195 RegExpTree* body = builder->ToRegExp(); 178 RegExpTree* body = builder->ToRegExp();
196 179
197 int end_capture_index = captures_started(); 180 int end_capture_index = captures_started();
198 181
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
286 if (Next() == '=') { 269 if (Next() == '=') {
287 subexpr_type = POSITIVE_LOOKAROUND; 270 subexpr_type = POSITIVE_LOOKAROUND;
288 break; 271 break;
289 } else if (Next() == '!') { 272 } else if (Next() == '!') {
290 subexpr_type = NEGATIVE_LOOKAROUND; 273 subexpr_type = NEGATIVE_LOOKAROUND;
291 break; 274 break;
292 } 275 }
293 } 276 }
294 // Fall through. 277 // Fall through.
295 default: 278 default:
296 return ReportError(CStrVector("Invalid group")); 279 ReportError(CStrVector("Invalid group") CHECK_FAILED);
280 break;
297 } 281 }
298 Advance(2); 282 Advance(2);
299 } else { 283 } else {
300 if (captures_started_ >= kMaxCaptures) { 284 if (captures_started_ >= kMaxCaptures) {
301 return ReportError(CStrVector("Too many captures")); 285 ReportError(CStrVector("Too many captures") CHECK_FAILED);
302 } 286 }
303 captures_started_++; 287 captures_started_++;
304 } 288 }
305 // Store current state and begin new disjunction parsing. 289 // Store current state and begin new disjunction parsing.
306 state = 290 state =
307 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, 291 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,
308 captures_started_, flags_, zone()); 292 captures_started_, flags_, zone());
309 builder = state->builder(); 293 builder = state->builder();
310 continue; 294 continue;
311 } 295 }
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
369 // the capture registers of the referenced capture are either 353 // the capture registers of the referenced capture are either
370 // both set or both cleared. 354 // both set or both cleared.
371 builder->AddEmpty(); 355 builder->AddEmpty();
372 } else { 356 } else {
373 RegExpCapture* capture = GetCapture(index); 357 RegExpCapture* capture = GetCapture(index);
374 RegExpTree* atom = new (zone()) RegExpBackReference(capture); 358 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
375 builder->AddAtom(atom); 359 builder->AddAtom(atom);
376 } 360 }
377 break; 361 break;
378 } 362 }
379 // With /u, no identity escapes except for syntax characters
380 // are allowed. Otherwise, all identity escapes are allowed.
381 if (unicode()) {
382 return ReportError(CStrVector("Invalid escape"));
383 }
384 uc32 first_digit = Next(); 363 uc32 first_digit = Next();
385 if (first_digit == '8' || first_digit == '9') { 364 if (first_digit == '8' || first_digit == '9') {
386 builder->AddCharacter(first_digit); 365 // If the 'u' flag is present, only syntax characters can be
387 Advance(2); 366 // escaped,
367 // no other identity escapes are allowed. If the 'u' flag is not
368 // present, all identity escapes are allowed.
369 if (!unicode()) {
370 builder->AddCharacter(first_digit);
371 Advance(2);
372 } else {
373 return ReportError(CStrVector("Invalid escape"));
374 }
388 break; 375 break;
389 } 376 }
390 } 377 }
391 // FALLTHROUGH 378 // FALLTHROUGH
392 case '0': { 379 case '0': {
393 Advance(); 380 Advance();
394 if (unicode() && Next() >= '0' && Next() <= '9') {
395 // With /u, decimal escape with leading 0 are not parsed as octal.
396 return ReportError(CStrVector("Invalid decimal escape"));
397 }
398 uc32 octal = ParseOctalLiteral(); 381 uc32 octal = ParseOctalLiteral();
399 builder->AddCharacter(octal); 382 builder->AddCharacter(octal);
400 break; 383 break;
401 } 384 }
402 // ControlEscape :: one of 385 // ControlEscape :: one of
403 // f n r t v 386 // f n r t v
404 case 'f': 387 case 'f':
405 Advance(2); 388 Advance(2);
406 builder->AddCharacter('\f'); 389 builder->AddCharacter('\f');
407 break; 390 break;
(...skipping 17 matching lines...) Expand all
425 Advance(); 408 Advance();
426 uc32 controlLetter = Next(); 409 uc32 controlLetter = Next();
427 // Special case if it is an ASCII letter. 410 // Special case if it is an ASCII letter.
428 // Convert lower case letters to uppercase. 411 // Convert lower case letters to uppercase.
429 uc32 letter = controlLetter & ~('a' ^ 'A'); 412 uc32 letter = controlLetter & ~('a' ^ 'A');
430 if (letter < 'A' || 'Z' < letter) { 413 if (letter < 'A' || 'Z' < letter) {
431 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 414 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
432 // This is outside the specification. We match JSC in 415 // This is outside the specification. We match JSC in
433 // reading the backslash as a literal character instead 416 // reading the backslash as a literal character instead
434 // of as starting an escape. 417 // of as starting an escape.
435 if (unicode()) {
436 // With /u, invalid escapes are not treated as identity escapes.
437 return ReportError(CStrVector("Invalid unicode escape"));
438 }
439 builder->AddCharacter('\\'); 418 builder->AddCharacter('\\');
440 } else { 419 } else {
441 Advance(2); 420 Advance(2);
442 builder->AddCharacter(controlLetter & 0x1f); 421 builder->AddCharacter(controlLetter & 0x1f);
443 } 422 }
444 break; 423 break;
445 } 424 }
446 case 'x': { 425 case 'x': {
447 Advance(2); 426 Advance(2);
448 uc32 value; 427 uc32 value;
449 if (ParseHexEscape(2, &value)) { 428 if (ParseHexEscape(2, &value)) {
450 builder->AddCharacter(value); 429 builder->AddCharacter(value);
451 } else if (!unicode()) { 430 } else if (!unicode()) {
452 builder->AddCharacter('x'); 431 builder->AddCharacter('x');
453 } else { 432 } else {
454 // With /u, invalid escapes are not treated as identity escapes. 433 // If the 'u' flag is present, invalid escapes are not treated as
434 // identity escapes.
455 return ReportError(CStrVector("Invalid escape")); 435 return ReportError(CStrVector("Invalid escape"));
456 } 436 }
457 break; 437 break;
458 } 438 }
459 case 'u': { 439 case 'u': {
460 Advance(2); 440 Advance(2);
461 uc32 value; 441 uc32 value;
462 if (ParseUnicodeEscape(&value)) { 442 if (ParseUnicodeEscape(&value)) {
463 builder->AddUnicodeCharacter(value); 443 builder->AddUnicodeCharacter(value);
464 } else if (!unicode()) { 444 } else if (!unicode()) {
465 builder->AddCharacter('u'); 445 builder->AddCharacter('u');
466 } else { 446 } else {
467 // With /u, invalid escapes are not treated as identity escapes. 447 // If the 'u' flag is present, invalid escapes are not treated as
448 // identity escapes.
468 return ReportError(CStrVector("Invalid unicode escape")); 449 return ReportError(CStrVector("Invalid unicode escape"));
469 } 450 }
470 break; 451 break;
471 } 452 }
472 default: 453 default:
473 Advance(); 454 Advance();
474 // With /u, no identity escapes except for syntax characters 455 // If the 'u' flag is present, only syntax characters can be
475 // are allowed. Otherwise, all identity escapes are allowed. 456 // escaped, no
476 if (!unicode() || IsSyntaxCharacterOrSlash(current())) { 457 // other identity escapes are allowed. If the 'u' flag is not
458 // present,
459 // all identity escapes are allowed.
460 if (!unicode() || IsSyntaxCharacter(current())) {
477 builder->AddCharacter(current()); 461 builder->AddCharacter(current());
478 Advance(); 462 Advance();
479 } else { 463 } else {
480 return ReportError(CStrVector("Invalid escape")); 464 return ReportError(CStrVector("Invalid escape"));
481 } 465 }
482 break; 466 break;
483 } 467 }
484 break; 468 break;
485 case '{': { 469 case '{': {
486 int dummy; 470 int dummy;
487 if (ParseIntervalQuantifier(&dummy, &dummy)) { 471 if (ParseIntervalQuantifier(&dummy, &dummy)) {
488 return ReportError(CStrVector("Nothing to repeat")); 472 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
489 } 473 }
490 // fallthrough 474 // fallthrough
491 } 475 }
492 case '}':
493 case ']':
494 if (unicode()) {
495 return ReportError(CStrVector("Lone quantifier brackets"));
496 }
497 // fallthrough
498 default: 476 default:
499 builder->AddUnicodeCharacter(current()); 477 builder->AddUnicodeCharacter(current());
500 Advance(); 478 Advance();
501 break; 479 break;
502 } // end switch(current()) 480 } // end switch(current())
503 481
504 int min; 482 int min;
505 int max; 483 int max;
506 switch (current()) { 484 switch (current()) {
507 // QuantifierPrefix :: 485 // QuantifierPrefix ::
(...skipping 12 matching lines...) Expand all
520 Advance(); 498 Advance();
521 break; 499 break;
522 case '?': 500 case '?':
523 min = 0; 501 min = 0;
524 max = 1; 502 max = 1;
525 Advance(); 503 Advance();
526 break; 504 break;
527 case '{': 505 case '{':
528 if (ParseIntervalQuantifier(&min, &max)) { 506 if (ParseIntervalQuantifier(&min, &max)) {
529 if (max < min) { 507 if (max < min) {
530 return ReportError( 508 ReportError(CStrVector("numbers out of order in {} quantifier.")
531 CStrVector("numbers out of order in {} quantifier")); 509 CHECK_FAILED);
532 } 510 }
533 break; 511 break;
534 } else if (unicode()) { 512 } else {
535 // With /u, incomplete quantifiers are not allowed. 513 continue;
536 return ReportError(CStrVector("Incomplete quantifier"));
537 } 514 }
538 continue;
539 default: 515 default:
540 continue; 516 continue;
541 } 517 }
542 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 518 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
543 if (current() == '?') { 519 if (current() == '?') {
544 quantifier_type = RegExpQuantifier::NON_GREEDY; 520 quantifier_type = RegExpQuantifier::NON_GREEDY;
545 Advance(); 521 Advance();
546 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 522 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
547 // FLAG_regexp_possessive_quantifier is a debug-only flag. 523 // FLAG_regexp_possessive_quantifier is a debug-only flag.
548 quantifier_type = RegExpQuantifier::POSSESSIVE; 524 quantifier_type = RegExpQuantifier::POSSESSIVE;
549 Advance(); 525 Advance();
550 } 526 }
551 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) { 527 builder->AddQuantifierToAtom(min, max, quantifier_type);
552 return ReportError(CStrVector("Invalid quantifier"));
553 }
554 } 528 }
555 } 529 }
556 530
557 531
558 #ifdef DEBUG 532 #ifdef DEBUG
559 // Currently only used in a DCHECK. 533 // Currently only used in a DCHECK.
560 static bool IsSpecialClassEscape(uc32 c) { 534 static bool IsSpecialClassEscape(uc32 c) {
561 switch (c) { 535 switch (c) {
562 case 'd': 536 case 'd':
563 case 'D': 537 case 'D':
(...skipping 277 matching lines...) Expand 10 before | Expand all | Expand 10 after
841 return '\r'; 815 return '\r';
842 case 't': 816 case 't':
843 Advance(); 817 Advance();
844 return '\t'; 818 return '\t';
845 case 'v': 819 case 'v':
846 Advance(); 820 Advance();
847 return '\v'; 821 return '\v';
848 case 'c': { 822 case 'c': {
849 uc32 controlLetter = Next(); 823 uc32 controlLetter = Next();
850 uc32 letter = controlLetter & ~('A' ^ 'a'); 824 uc32 letter = controlLetter & ~('A' ^ 'a');
851 // For compatibility with JSC, inside a character class. We also accept 825 // For compatibility with JSC, inside a character class
852 // digits and underscore as control characters, unless with /u. 826 // we also accept digits and underscore as control characters.
853 if (letter >= 'A' && letter <= 'Z') { 827 if ((controlLetter >= '0' && controlLetter <= '9') ||
828 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
854 Advance(2); 829 Advance(2);
855 // Control letters mapped to ASCII control characters in the range 830 // Control letters mapped to ASCII control characters in the range
856 // 0x00-0x1f. 831 // 0x00-0x1f.
857 return controlLetter & 0x1f; 832 return controlLetter & 0x1f;
858 } 833 }
859 if (unicode()) {
860 // With /u, invalid escapes are not treated as identity escapes.
861 ReportError(CStrVector("Invalid class escape"));
862 return 0;
863 }
864 if ((controlLetter >= '0' && controlLetter <= '9') ||
865 controlLetter == '_') {
866 Advance(2);
867 return controlLetter & 0x1f;
868 }
869 // We match JSC in reading the backslash as a literal 834 // We match JSC in reading the backslash as a literal
870 // character instead of as starting an escape. 835 // character instead of as starting an escape.
871 return '\\'; 836 return '\\';
872 } 837 }
873 case '0': 838 case '0':
874 case '1': 839 case '1':
875 case '2': 840 case '2':
876 case '3': 841 case '3':
877 case '4': 842 case '4':
878 case '5': 843 case '5':
879 case '6': 844 case '6':
880 case '7': 845 case '7':
881 // For compatibility, we interpret a decimal escape that isn't 846 // For compatibility, we interpret a decimal escape that isn't
882 // a back reference (and therefore either \0 or not valid according 847 // a back reference (and therefore either \0 or not valid according
883 // to the specification) as a 1..3 digit octal character code. 848 // to the specification) as a 1..3 digit octal character code.
884 if (unicode()) {
885 // With /u, decimal escape is not interpreted as octal character code.
886 ReportError(CStrVector("Invalid class escape"));
887 return 0;
888 }
889 return ParseOctalLiteral(); 849 return ParseOctalLiteral();
890 case 'x': { 850 case 'x': {
891 Advance(); 851 Advance();
892 uc32 value; 852 uc32 value;
893 if (ParseHexEscape(2, &value)) return value; 853 if (ParseHexEscape(2, &value)) {
894 if (unicode()) { 854 return value;
895 // With /u, invalid escapes are not treated as identity escapes.
896 ReportError(CStrVector("Invalid escape"));
897 return 0;
898 } 855 }
899 // If \x is not followed by a two-digit hexadecimal, treat it 856 if (!unicode()) {
900 // as an identity escape. 857 // If \x is not followed by a two-digit hexadecimal, treat it
901 return 'x'; 858 // as an identity escape.
859 return 'x';
860 }
861 // If the 'u' flag is present, invalid escapes are not treated as
862 // identity escapes.
863 ReportError(CStrVector("Invalid escape"));
864 return 0;
902 } 865 }
903 case 'u': { 866 case 'u': {
904 Advance(); 867 Advance();
905 uc32 value; 868 uc32 value;
906 if (ParseUnicodeEscape(&value)) return value; 869 if (ParseUnicodeEscape(&value)) {
907 if (unicode()) { 870 return value;
908 // With /u, invalid escapes are not treated as identity escapes.
909 ReportError(CStrVector("Invalid unicode escape"));
910 return 0;
911 } 871 }
912 // If \u is not followed by a four-digit hexadecimal, treat it 872 if (!unicode()) {
913 // as an identity escape. 873 return 'u';
914 return 'u'; 874 }
875 // If the 'u' flag is present, invalid escapes are not treated as
876 // identity escapes.
877 ReportError(CStrVector("Invalid unicode escape"));
878 return 0;
915 } 879 }
916 default: { 880 default: {
917 uc32 result = current(); 881 uc32 result = current();
918 // With /u, no identity escapes except for syntax characters are 882 // If the 'u' flag is present, only syntax characters can be escaped, no
919 // allowed. Otherwise, all identity escapes are allowed. 883 // other identity escapes are allowed. If the 'u' flag is not present, all
920 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { 884 // identity escapes are allowed.
885 if (!unicode() || IsSyntaxCharacter(result)) {
921 Advance(); 886 Advance();
922 return result; 887 return result;
923 } 888 }
924 ReportError(CStrVector("Invalid escape")); 889 ReportError(CStrVector("Invalid escape"));
925 return 0; 890 return 0;
926 } 891 }
927 } 892 }
928 return 0; 893 return 0;
929 } 894 }
930 895
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
984 if (char_class != kNoCharClass) { 949 if (char_class != kNoCharClass) {
985 CharacterRange::AddClassEscape(char_class, ranges, zone); 950 CharacterRange::AddClassEscape(char_class, ranges, zone);
986 } else { 951 } else {
987 ranges->Add(range, zone); 952 ranges->Add(range, zone);
988 } 953 }
989 } 954 }
990 955
991 956
992 RegExpTree* RegExpParser::ParseCharacterClass() { 957 RegExpTree* RegExpParser::ParseCharacterClass() {
993 static const char* kUnterminated = "Unterminated character class"; 958 static const char* kUnterminated = "Unterminated character class";
994 static const char* kRangeInvalid = "Invalid character class";
995 static const char* kRangeOutOfOrder = "Range out of order in character class"; 959 static const char* kRangeOutOfOrder = "Range out of order in character class";
996 960
997 DCHECK_EQ(current(), '['); 961 DCHECK_EQ(current(), '[');
998 Advance(); 962 Advance();
999 bool is_negated = false; 963 bool is_negated = false;
1000 if (current() == '^') { 964 if (current() == '^') {
1001 is_negated = true; 965 is_negated = true;
1002 Advance(); 966 Advance();
1003 } 967 }
1004 ZoneList<CharacterRange>* ranges = 968 ZoneList<CharacterRange>* ranges =
1005 new (zone()) ZoneList<CharacterRange>(2, zone()); 969 new (zone()) ZoneList<CharacterRange>(2, zone());
1006 while (has_more() && current() != ']') { 970 while (has_more() && current() != ']') {
1007 uc16 char_class = kNoCharClass; 971 uc16 char_class = kNoCharClass;
1008 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 972 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
1009 if (current() == '-') { 973 if (current() == '-') {
1010 Advance(); 974 Advance();
1011 if (current() == kEndMarker) { 975 if (current() == kEndMarker) {
1012 // If we reach the end we break out of the loop and let the 976 // If we reach the end we break out of the loop and let the
1013 // following code report an error. 977 // following code report an error.
1014 break; 978 break;
1015 } else if (current() == ']') { 979 } else if (current() == ']') {
1016 AddRangeOrEscape(ranges, char_class, first, zone()); 980 AddRangeOrEscape(ranges, char_class, first, zone());
1017 ranges->Add(CharacterRange::Singleton('-'), zone()); 981 ranges->Add(CharacterRange::Singleton('-'), zone());
1018 break; 982 break;
1019 } 983 }
1020 uc16 char_class_2 = kNoCharClass; 984 uc16 char_class_2 = kNoCharClass;
1021 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 985 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
1022 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 986 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
1023 // Either end is an escaped character class. Treat the '-' verbatim. 987 // Either end is an escaped character class. Treat the '-' verbatim.
1024 if (unicode()) {
1025 // ES2015 21.2.2.15.1 step 1.
1026 return ReportError(CStrVector(kRangeInvalid));
1027 }
1028 AddRangeOrEscape(ranges, char_class, first, zone()); 988 AddRangeOrEscape(ranges, char_class, first, zone());
1029 ranges->Add(CharacterRange::Singleton('-'), zone()); 989 ranges->Add(CharacterRange::Singleton('-'), zone());
1030 AddRangeOrEscape(ranges, char_class_2, next, zone()); 990 AddRangeOrEscape(ranges, char_class_2, next, zone());
1031 continue; 991 continue;
1032 } 992 }
1033 // ES2015 21.2.2.15.1 step 6.
1034 if (first.from() > next.to()) { 993 if (first.from() > next.to()) {
1035 return ReportError(CStrVector(kRangeOutOfOrder)); 994 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED);
1036 } 995 }
1037 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 996 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
1038 } else { 997 } else {
1039 AddRangeOrEscape(ranges, char_class, first, zone()); 998 AddRangeOrEscape(ranges, char_class, first, zone());
1040 } 999 }
1041 } 1000 }
1042 if (!has_more()) { 1001 if (!has_more()) {
1043 return ReportError(CStrVector(kUnterminated)); 1002 return ReportError(CStrVector(kUnterminated) CHECK_FAILED);
1044 } 1003 }
1045 Advance(); 1004 Advance();
1046 if (ranges->length() == 0) { 1005 if (ranges->length() == 0) {
1047 ranges->Add(CharacterRange::Everything(), zone()); 1006 ranges->Add(CharacterRange::Everything(), zone());
1048 is_negated = !is_negated; 1007 is_negated = !is_negated;
1049 } 1008 }
1050 return new (zone()) RegExpCharacterClass(ranges, is_negated); 1009 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1051 } 1010 }
1052 1011
1053 1012
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after
1196 AddCharacter(static_cast<uc16>(c)); 1155 AddCharacter(static_cast<uc16>(c));
1197 } 1156 }
1198 } 1157 }
1199 1158
1200 1159
1201 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1202 1161
1203 1162
1204 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1205 if (NeedsDesugaringForUnicode(cc)) { 1164 if (NeedsDesugaringForUnicode(cc)) {
1206 // With /u, character class needs to be desugared, so it 1165 // In unicode mode, character class needs to be desugared, so it
1207 // must be a standalone term instead of being part of a RegExpText. 1166 // must be a standalone term instead of being part of a RegExpText.
1208 AddTerm(cc); 1167 AddTerm(cc);
1209 } else { 1168 } else {
1210 AddAtom(cc); 1169 AddAtom(cc);
1211 } 1170 }
1212 } 1171 }
1213 1172
1214 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { 1173 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1215 AddTerm(new (zone()) RegExpCharacterClass( 1174 AddTerm(new (zone()) RegExpCharacterClass(
1216 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); 1175 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after
1309 1268
1310 1269
1311 RegExpTree* RegExpBuilder::ToRegExp() { 1270 RegExpTree* RegExpBuilder::ToRegExp() {
1312 FlushTerms(); 1271 FlushTerms();
1313 int num_alternatives = alternatives_.length(); 1272 int num_alternatives = alternatives_.length();
1314 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1273 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1315 if (num_alternatives == 1) return alternatives_.last(); 1274 if (num_alternatives == 1) return alternatives_.last();
1316 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1275 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1317 } 1276 }
1318 1277
1319 bool RegExpBuilder::AddQuantifierToAtom( 1278
1279 void RegExpBuilder::AddQuantifierToAtom(
1320 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1280 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1321 FlushPendingSurrogate(); 1281 FlushPendingSurrogate();
1322 if (pending_empty_) { 1282 if (pending_empty_) {
1323 pending_empty_ = false; 1283 pending_empty_ = false;
1324 return true; 1284 return;
1325 } 1285 }
1326 RegExpTree* atom; 1286 RegExpTree* atom;
1327 if (characters_ != NULL) { 1287 if (characters_ != NULL) {
1328 DCHECK(last_added_ == ADD_CHAR); 1288 DCHECK(last_added_ == ADD_CHAR);
1329 // Last atom was character. 1289 // Last atom was character.
1330 Vector<const uc16> char_vector = characters_->ToConstVector(); 1290 Vector<const uc16> char_vector = characters_->ToConstVector();
1331 int num_chars = char_vector.length(); 1291 int num_chars = char_vector.length();
1332 if (num_chars > 1) { 1292 if (num_chars > 1) {
1333 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 1293 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1334 text_.Add(new (zone()) RegExpAtom(prefix), zone()); 1294 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1335 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 1295 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1336 } 1296 }
1337 characters_ = NULL; 1297 characters_ = NULL;
1338 atom = new (zone()) RegExpAtom(char_vector); 1298 atom = new (zone()) RegExpAtom(char_vector);
1339 FlushText(); 1299 FlushText();
1340 } else if (text_.length() > 0) { 1300 } else if (text_.length() > 0) {
1341 DCHECK(last_added_ == ADD_ATOM); 1301 DCHECK(last_added_ == ADD_ATOM);
1342 atom = text_.RemoveLast(); 1302 atom = text_.RemoveLast();
1343 FlushText(); 1303 FlushText();
1344 } else if (terms_.length() > 0) { 1304 } else if (terms_.length() > 0) {
1345 DCHECK(last_added_ == ADD_ATOM); 1305 DCHECK(last_added_ == ADD_ATOM);
1346 atom = terms_.RemoveLast(); 1306 atom = terms_.RemoveLast();
1347 // With /u, lookarounds are not quantifiable.
1348 if (unicode() && atom->IsLookaround()) return false;
1349 if (atom->max_match() == 0) { 1307 if (atom->max_match() == 0) {
1350 // Guaranteed to only match an empty string. 1308 // Guaranteed to only match an empty string.
1351 LAST(ADD_TERM); 1309 LAST(ADD_TERM);
1352 if (min == 0) { 1310 if (min == 0) {
1353 return true; 1311 return;
1354 } 1312 }
1355 terms_.Add(atom, zone()); 1313 terms_.Add(atom, zone());
1356 return true; 1314 return;
1357 } 1315 }
1358 } else { 1316 } else {
1359 // Only call immediately after adding an atom or character! 1317 // Only call immediately after adding an atom or character!
1360 UNREACHABLE(); 1318 UNREACHABLE();
1361 return false; 1319 return;
1362 } 1320 }
1363 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1321 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1364 zone()); 1322 zone());
1365 LAST(ADD_TERM); 1323 LAST(ADD_TERM);
1366 return true;
1367 } 1324 }
1368 1325
1369 } // namespace internal 1326 } // namespace internal
1370 } // namespace v8 1327 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698