Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1645573002: [regexp] restrict pattern syntax for unicode mode. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@stage
Patch Set: addressed comments Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 84 matching lines...) Expand 10 before | Expand all | Expand 10 after
95 95
96 96
97 void RegExpParser::Advance(int dist) { 97 void RegExpParser::Advance(int dist) {
98 next_pos_ += dist - 1; 98 next_pos_ += dist - 1;
99 Advance(); 99 Advance();
100 } 100 }
101 101
102 102
103 bool RegExpParser::simple() { return simple_; } 103 bool RegExpParser::simple() { return simple_; }
104 104
105 105 bool RegExpParser::IsSyntaxCharacterOrSlash(uc32 c) {
106 bool RegExpParser::IsSyntaxCharacter(uc32 c) { 106 switch (c) {
107 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' || 107 case '^':
108 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' || 108 case '$':
109 c == '{' || c == '}' || c == '|'; 109 case '\\':
110 case '.':
111 case '*':
112 case '+':
113 case '?':
114 case '(':
115 case ')':
116 case '[':
117 case ']':
118 case '{':
119 case '}':
120 case '|':
121 case '/':
122 return true;
123 default:
124 break;
125 }
126 return false;
110 } 127 }
111 128
112 129
113 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 130 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
114 failed_ = true; 131 failed_ = true;
115 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 132 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
116 // Zip to the end to make sure that no more input is read. 133 // Zip to the end to make sure that no more input is read.
117 current_ = kEndMarker; 134 current_ = kEndMarker;
118 next_pos_ = in()->length(); 135 next_pos_ = in()->length();
119 return NULL; 136 return NULL;
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
154 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 171 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
155 flags_, zone()); 172 flags_, zone());
156 RegExpParserState* state = &initial_state; 173 RegExpParserState* state = &initial_state;
157 // Cache the builder in a local variable for quick access. 174 // Cache the builder in a local variable for quick access.
158 RegExpBuilder* builder = initial_state.builder(); 175 RegExpBuilder* builder = initial_state.builder();
159 while (true) { 176 while (true) {
160 switch (current()) { 177 switch (current()) {
161 case kEndMarker: 178 case kEndMarker:
162 if (state->IsSubexpression()) { 179 if (state->IsSubexpression()) {
163 // Inside a parenthesized group when hitting end of input. 180 // Inside a parenthesized group when hitting end of input.
164 ReportError(CStrVector("Unterminated group") CHECK_FAILED); 181 return ReportError(CStrVector("Unterminated group"));
165 } 182 }
166 DCHECK_EQ(INITIAL, state->group_type()); 183 DCHECK_EQ(INITIAL, state->group_type());
167 // Parsing completed successfully. 184 // Parsing completed successfully.
168 return builder->ToRegExp(); 185 return builder->ToRegExp();
169 case ')': { 186 case ')': {
170 if (!state->IsSubexpression()) { 187 if (!state->IsSubexpression()) {
171 ReportError(CStrVector("Unmatched ')'") CHECK_FAILED); 188 return ReportError(CStrVector("Unmatched ')'"));
172 } 189 }
173 DCHECK_NE(INITIAL, state->group_type()); 190 DCHECK_NE(INITIAL, state->group_type());
174 191
175 Advance(); 192 Advance();
176 // End disjunction parsing and convert builder content to new single 193 // End disjunction parsing and convert builder content to new single
177 // regexp atom. 194 // regexp atom.
178 RegExpTree* body = builder->ToRegExp(); 195 RegExpTree* body = builder->ToRegExp();
179 196
180 int end_capture_index = captures_started(); 197 int end_capture_index = captures_started();
181 198
(...skipping 87 matching lines...) Expand 10 before | Expand all | Expand 10 after
269 if (Next() == '=') { 286 if (Next() == '=') {
270 subexpr_type = POSITIVE_LOOKAROUND; 287 subexpr_type = POSITIVE_LOOKAROUND;
271 break; 288 break;
272 } else if (Next() == '!') { 289 } else if (Next() == '!') {
273 subexpr_type = NEGATIVE_LOOKAROUND; 290 subexpr_type = NEGATIVE_LOOKAROUND;
274 break; 291 break;
275 } 292 }
276 } 293 }
277 // Fall through. 294 // Fall through.
278 default: 295 default:
279 ReportError(CStrVector("Invalid group") CHECK_FAILED); 296 return ReportError(CStrVector("Invalid group"));
280 break;
281 } 297 }
282 Advance(2); 298 Advance(2);
283 } else { 299 } else {
284 if (captures_started_ >= kMaxCaptures) { 300 if (captures_started_ >= kMaxCaptures) {
285 ReportError(CStrVector("Too many captures") CHECK_FAILED); 301 return ReportError(CStrVector("Too many captures"));
286 } 302 }
287 captures_started_++; 303 captures_started_++;
288 } 304 }
289 // Store current state and begin new disjunction parsing. 305 // Store current state and begin new disjunction parsing.
290 state = 306 state =
291 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, 307 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,
292 captures_started_, flags_, zone()); 308 captures_started_, flags_, zone());
293 builder = state->builder(); 309 builder = state->builder();
294 continue; 310 continue;
295 } 311 }
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
353 // the capture registers of the referenced capture are either 369 // the capture registers of the referenced capture are either
354 // both set or both cleared. 370 // both set or both cleared.
355 builder->AddEmpty(); 371 builder->AddEmpty();
356 } else { 372 } else {
357 RegExpCapture* capture = GetCapture(index); 373 RegExpCapture* capture = GetCapture(index);
358 RegExpTree* atom = new (zone()) RegExpBackReference(capture); 374 RegExpTree* atom = new (zone()) RegExpBackReference(capture);
359 builder->AddAtom(atom); 375 builder->AddAtom(atom);
360 } 376 }
361 break; 377 break;
362 } 378 }
379 // With /u, no identity escapes except for syntax characters
380 // are allowed. Otherwise, all identity escapes are allowed.
381 if (unicode()) {
382 return ReportError(CStrVector("Invalid escape"));
383 }
363 uc32 first_digit = Next(); 384 uc32 first_digit = Next();
364 if (first_digit == '8' || first_digit == '9') { 385 if (first_digit == '8' || first_digit == '9') {
365 // If the 'u' flag is present, only syntax characters can be 386 builder->AddCharacter(first_digit);
366 // escaped, 387 Advance(2);
367 // no other identity escapes are allowed. If the 'u' flag is not
368 // present, all identity escapes are allowed.
369 if (!unicode()) {
370 builder->AddCharacter(first_digit);
371 Advance(2);
372 } else {
373 return ReportError(CStrVector("Invalid escape"));
374 }
375 break; 388 break;
376 } 389 }
377 } 390 }
378 // FALLTHROUGH 391 // FALLTHROUGH
379 case '0': { 392 case '0': {
380 Advance(); 393 Advance();
394 if (unicode() && Next() >= '0' && Next() <= '9') {
395 // With /u, decimal escapes with a leading 0 are not parsed as octal.
396 return ReportError(CStrVector("Invalid decimal escape"));
397 }
381 uc32 octal = ParseOctalLiteral(); 398 uc32 octal = ParseOctalLiteral();
382 builder->AddCharacter(octal); 399 builder->AddCharacter(octal);
383 break; 400 break;
384 } 401 }
385 // ControlEscape :: one of 402 // ControlEscape :: one of
386 // f n r t v 403 // f n r t v
387 case 'f': 404 case 'f':
388 Advance(2); 405 Advance(2);
389 builder->AddCharacter('\f'); 406 builder->AddCharacter('\f');
390 break; 407 break;
(...skipping 17 matching lines...) Expand all
408 Advance(); 425 Advance();
409 uc32 controlLetter = Next(); 426 uc32 controlLetter = Next();
410 // Special case if it is an ASCII letter. 427 // Special case if it is an ASCII letter.
411 // Convert lower case letters to uppercase. 428 // Convert lower case letters to uppercase.
412 uc32 letter = controlLetter & ~('a' ^ 'A'); 429 uc32 letter = controlLetter & ~('a' ^ 'A');
413 if (letter < 'A' || 'Z' < letter) { 430 if (letter < 'A' || 'Z' < letter) {
414 // controlLetter is not in range 'A'-'Z' or 'a'-'z'. 431 // controlLetter is not in range 'A'-'Z' or 'a'-'z'.
415 // This is outside the specification. We match JSC in 432 // This is outside the specification. We match JSC in
416 // reading the backslash as a literal character instead 433 // reading the backslash as a literal character instead
417 // of as starting an escape. 434 // of as starting an escape.
435 if (unicode()) {
436 // With /u, invalid escapes are not treated as identity escapes.
437 return ReportError(CStrVector("Invalid unicode escape"));
438 }
418 builder->AddCharacter('\\'); 439 builder->AddCharacter('\\');
419 } else { 440 } else {
420 Advance(2); 441 Advance(2);
421 builder->AddCharacter(controlLetter & 0x1f); 442 builder->AddCharacter(controlLetter & 0x1f);
422 } 443 }
423 break; 444 break;
424 } 445 }
425 case 'x': { 446 case 'x': {
426 Advance(2); 447 Advance(2);
427 uc32 value; 448 uc32 value;
428 if (ParseHexEscape(2, &value)) { 449 if (ParseHexEscape(2, &value)) {
429 builder->AddCharacter(value); 450 builder->AddCharacter(value);
430 } else if (!unicode()) { 451 } else if (!unicode()) {
431 builder->AddCharacter('x'); 452 builder->AddCharacter('x');
432 } else { 453 } else {
433 // If the 'u' flag is present, invalid escapes are not treated as 454 // With /u, invalid escapes are not treated as identity escapes.
434 // identity escapes.
435 return ReportError(CStrVector("Invalid escape")); 455 return ReportError(CStrVector("Invalid escape"));
436 } 456 }
437 break; 457 break;
438 } 458 }
439 case 'u': { 459 case 'u': {
440 Advance(2); 460 Advance(2);
441 uc32 value; 461 uc32 value;
442 if (ParseUnicodeEscape(&value)) { 462 if (ParseUnicodeEscape(&value)) {
443 builder->AddUnicodeCharacter(value); 463 builder->AddUnicodeCharacter(value);
444 } else if (!unicode()) { 464 } else if (!unicode()) {
445 builder->AddCharacter('u'); 465 builder->AddCharacter('u');
446 } else { 466 } else {
447 // If the 'u' flag is present, invalid escapes are not treated as 467 // With /u, invalid escapes are not treated as identity escapes.
448 // identity escapes.
449 return ReportError(CStrVector("Invalid unicode escape")); 468 return ReportError(CStrVector("Invalid unicode escape"));
450 } 469 }
451 break; 470 break;
452 } 471 }
453 default: 472 default:
454 Advance(); 473 Advance();
455 // If the 'u' flag is present, only syntax characters can be 474 // With /u, no identity escapes except for syntax characters
456 // escaped, no 475 // are allowed. Otherwise, all identity escapes are allowed.
457 // other identity escapes are allowed. If the 'u' flag is not 476 if (!unicode() || IsSyntaxCharacterOrSlash(current())) {
458 // present,
459 // all identity escapes are allowed.
460 if (!unicode() || IsSyntaxCharacter(current())) {
461 builder->AddCharacter(current()); 477 builder->AddCharacter(current());
462 Advance(); 478 Advance();
463 } else { 479 } else {
464 return ReportError(CStrVector("Invalid escape")); 480 return ReportError(CStrVector("Invalid escape"));
465 } 481 }
466 break; 482 break;
467 } 483 }
468 break; 484 break;
469 case '{': { 485 case '{': {
470 int dummy; 486 int dummy;
471 if (ParseIntervalQuantifier(&dummy, &dummy)) { 487 if (ParseIntervalQuantifier(&dummy, &dummy)) {
472 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); 488 return ReportError(CStrVector("Nothing to repeat"));
473 } 489 }
474 // fallthrough 490 // fallthrough
475 } 491 }
492 case '}':
493 case ']':
494 if (unicode()) {
495 return ReportError(CStrVector("Lone quantifier brackets"));
496 }
497 // fallthrough
476 default: 498 default:
477 builder->AddUnicodeCharacter(current()); 499 builder->AddUnicodeCharacter(current());
478 Advance(); 500 Advance();
479 break; 501 break;
480 } // end switch(current()) 502 } // end switch(current())
481 503
482 int min; 504 int min;
483 int max; 505 int max;
484 switch (current()) { 506 switch (current()) {
485 // QuantifierPrefix :: 507 // QuantifierPrefix ::
(...skipping 12 matching lines...) Expand all
498 Advance(); 520 Advance();
499 break; 521 break;
500 case '?': 522 case '?':
501 min = 0; 523 min = 0;
502 max = 1; 524 max = 1;
503 Advance(); 525 Advance();
504 break; 526 break;
505 case '{': 527 case '{':
506 if (ParseIntervalQuantifier(&min, &max)) { 528 if (ParseIntervalQuantifier(&min, &max)) {
507 if (max < min) { 529 if (max < min) {
508 ReportError(CStrVector("numbers out of order in {} quantifier.") 530 return ReportError(
509 CHECK_FAILED); 531 CStrVector("numbers out of order in {} quantifier"));
510 } 532 }
511 break; 533 break;
512 } else { 534 } else if (unicode()) {
513 continue; 535 // With /u, incomplete quantifiers are not allowed.
536 return ReportError(CStrVector("Incomplete quantifier"));
514 } 537 }
538 continue;
515 default: 539 default:
516 continue; 540 continue;
517 } 541 }
518 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY; 542 RegExpQuantifier::QuantifierType quantifier_type = RegExpQuantifier::GREEDY;
519 if (current() == '?') { 543 if (current() == '?') {
520 quantifier_type = RegExpQuantifier::NON_GREEDY; 544 quantifier_type = RegExpQuantifier::NON_GREEDY;
521 Advance(); 545 Advance();
522 } else if (FLAG_regexp_possessive_quantifier && current() == '+') { 546 } else if (FLAG_regexp_possessive_quantifier && current() == '+') {
523 // FLAG_regexp_possessive_quantifier is a debug-only flag. 547 // FLAG_regexp_possessive_quantifier is a debug-only flag.
524 quantifier_type = RegExpQuantifier::POSSESSIVE; 548 quantifier_type = RegExpQuantifier::POSSESSIVE;
525 Advance(); 549 Advance();
526 } 550 }
527 builder->AddQuantifierToAtom(min, max, quantifier_type); 551 if (!builder->AddQuantifierToAtom(min, max, quantifier_type)) {
552 return ReportError(CStrVector("Invalid quantifier"));
553 }
528 } 554 }
529 } 555 }
530 556
531 557
532 #ifdef DEBUG 558 #ifdef DEBUG
533 // Currently only used in a DCHECK. 559 // Currently only used in a DCHECK.
534 static bool IsSpecialClassEscape(uc32 c) { 560 static bool IsSpecialClassEscape(uc32 c) {
535 switch (c) { 561 switch (c) {
536 case 'd': 562 case 'd':
537 case 'D': 563 case 'D':
(...skipping 277 matching lines...) Expand 10 before | Expand all | Expand 10 after
815 return '\r'; 841 return '\r';
816 case 't': 842 case 't':
817 Advance(); 843 Advance();
818 return '\t'; 844 return '\t';
819 case 'v': 845 case 'v':
820 Advance(); 846 Advance();
821 return '\v'; 847 return '\v';
822 case 'c': { 848 case 'c': {
823 uc32 controlLetter = Next(); 849 uc32 controlLetter = Next();
824 uc32 letter = controlLetter & ~('A' ^ 'a'); 850 uc32 letter = controlLetter & ~('A' ^ 'a');
825 // For compatibility with JSC, inside a character class 851 // For compatibility with JSC, inside a character class we also accept
826 // we also accept digits and underscore as control characters. 852 // digits and underscore as control characters, unless with /u.
827 if ((controlLetter >= '0' && controlLetter <= '9') || 853 if (letter >= 'A' && letter <= 'Z') {
828 controlLetter == '_' || (letter >= 'A' && letter <= 'Z')) {
829 Advance(2); 854 Advance(2);
830 // Control letters mapped to ASCII control characters in the range 855 // Control letters mapped to ASCII control characters in the range
831 // 0x00-0x1f. 856 // 0x00-0x1f.
832 return controlLetter & 0x1f; 857 return controlLetter & 0x1f;
833 } 858 }
859 if (unicode()) {
860 // With /u, invalid escapes are not treated as identity escapes.
861 ReportError(CStrVector("Invalid class escape"));
862 return 0;
863 }
864 if ((controlLetter >= '0' && controlLetter <= '9') ||
865 controlLetter == '_') {
866 Advance(2);
867 return controlLetter & 0x1f;
868 }
834 // We match JSC in reading the backslash as a literal 869 // We match JSC in reading the backslash as a literal
835 // character instead of as starting an escape. 870 // character instead of as starting an escape.
836 return '\\'; 871 return '\\';
837 } 872 }
838 case '0': 873 case '0':
839 case '1': 874 case '1':
840 case '2': 875 case '2':
841 case '3': 876 case '3':
842 case '4': 877 case '4':
843 case '5': 878 case '5':
844 case '6': 879 case '6':
845 case '7': 880 case '7':
846 // For compatibility, we interpret a decimal escape that isn't 881 // For compatibility, we interpret a decimal escape that isn't
847 // a back reference (and therefore either \0 or not valid according 882 // a back reference (and therefore either \0 or not valid according
848 // to the specification) as a 1..3 digit octal character code. 883 // to the specification) as a 1..3 digit octal character code.
884 if (unicode()) {
885 // With /u, decimal escape is not interpreted as octal character code.
886 ReportError(CStrVector("Invalid class escape"));
887 return 0;
888 }
849 return ParseOctalLiteral(); 889 return ParseOctalLiteral();
850 case 'x': { 890 case 'x': {
851 Advance(); 891 Advance();
852 uc32 value; 892 uc32 value;
853 if (ParseHexEscape(2, &value)) { 893 if (ParseHexEscape(2, &value)) return value;
854 return value; 894 if (unicode()) {
895 // With /u, invalid escapes are not treated as identity escapes.
896 ReportError(CStrVector("Invalid escape"));
897 return 0;
855 } 898 }
856 if (!unicode()) { 899 // If \x is not followed by a two-digit hexadecimal, treat it
857 // If \x is not followed by a two-digit hexadecimal, treat it 900 // as an identity escape.
858 // as an identity escape. 901 return 'x';
859 return 'x';
860 }
861 // If the 'u' flag is present, invalid escapes are not treated as
862 // identity escapes.
863 ReportError(CStrVector("Invalid escape"));
864 return 0;
865 } 902 }
866 case 'u': { 903 case 'u': {
867 Advance(); 904 Advance();
868 uc32 value; 905 uc32 value;
869 if (ParseUnicodeEscape(&value)) { 906 if (ParseUnicodeEscape(&value)) return value;
870 return value; 907 if (unicode()) {
908 // With /u, invalid escapes are not treated as identity escapes.
909 ReportError(CStrVector("Invalid unicode escape"));
910 return 0;
871 } 911 }
872 if (!unicode()) { 912 // If \u is not followed by a valid unicode escape sequence, treat it
873 return 'u'; 913 // as an identity escape.
874 } 914 return 'u';
875 // If the 'u' flag is present, invalid escapes are not treated as
876 // identity escapes.
877 ReportError(CStrVector("Invalid unicode escape"));
878 return 0;
879 } 915 }
880 default: { 916 default: {
881 uc32 result = current(); 917 uc32 result = current();
882 // If the 'u' flag is present, only syntax characters can be escaped, no 918 // With /u, no identity escapes except for syntax characters are
883 // other identity escapes are allowed. If the 'u' flag is not present, all 919 // allowed. Otherwise, all identity escapes are allowed.
884 // identity escapes are allowed. 920 if (!unicode() || IsSyntaxCharacterOrSlash(result)) {
885 if (!unicode() || IsSyntaxCharacter(result)) {
886 Advance(); 921 Advance();
887 return result; 922 return result;
888 } 923 }
889 ReportError(CStrVector("Invalid escape")); 924 ReportError(CStrVector("Invalid escape"));
890 return 0; 925 return 0;
891 } 926 }
892 } 927 }
893 return 0; 928 return 0;
894 } 929 }
895 930
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
949 if (char_class != kNoCharClass) { 984 if (char_class != kNoCharClass) {
950 CharacterRange::AddClassEscape(char_class, ranges, zone); 985 CharacterRange::AddClassEscape(char_class, ranges, zone);
951 } else { 986 } else {
952 ranges->Add(range, zone); 987 ranges->Add(range, zone);
953 } 988 }
954 } 989 }
955 990
956 991
957 RegExpTree* RegExpParser::ParseCharacterClass() { 992 RegExpTree* RegExpParser::ParseCharacterClass() {
958 static const char* kUnterminated = "Unterminated character class"; 993 static const char* kUnterminated = "Unterminated character class";
994 static const char* kRangeInvalid = "Invalid character class";
959 static const char* kRangeOutOfOrder = "Range out of order in character class"; 995 static const char* kRangeOutOfOrder = "Range out of order in character class";
960 996
961 DCHECK_EQ(current(), '['); 997 DCHECK_EQ(current(), '[');
962 Advance(); 998 Advance();
963 bool is_negated = false; 999 bool is_negated = false;
964 if (current() == '^') { 1000 if (current() == '^') {
965 is_negated = true; 1001 is_negated = true;
966 Advance(); 1002 Advance();
967 } 1003 }
968 ZoneList<CharacterRange>* ranges = 1004 ZoneList<CharacterRange>* ranges =
969 new (zone()) ZoneList<CharacterRange>(2, zone()); 1005 new (zone()) ZoneList<CharacterRange>(2, zone());
970 while (has_more() && current() != ']') { 1006 while (has_more() && current() != ']') {
971 uc16 char_class = kNoCharClass; 1007 uc16 char_class = kNoCharClass;
972 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED); 1008 CharacterRange first = ParseClassAtom(&char_class CHECK_FAILED);
973 if (current() == '-') { 1009 if (current() == '-') {
974 Advance(); 1010 Advance();
975 if (current() == kEndMarker) { 1011 if (current() == kEndMarker) {
976 // If we reach the end we break out of the loop and let the 1012 // If we reach the end we break out of the loop and let the
977 // following code report an error. 1013 // following code report an error.
978 break; 1014 break;
979 } else if (current() == ']') { 1015 } else if (current() == ']') {
980 AddRangeOrEscape(ranges, char_class, first, zone()); 1016 AddRangeOrEscape(ranges, char_class, first, zone());
981 ranges->Add(CharacterRange::Singleton('-'), zone()); 1017 ranges->Add(CharacterRange::Singleton('-'), zone());
982 break; 1018 break;
983 } 1019 }
984 uc16 char_class_2 = kNoCharClass; 1020 uc16 char_class_2 = kNoCharClass;
985 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED); 1021 CharacterRange next = ParseClassAtom(&char_class_2 CHECK_FAILED);
986 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) { 1022 if (char_class != kNoCharClass || char_class_2 != kNoCharClass) {
987 // Either end is an escaped character class. Treat the '-' verbatim. 1023 // Either end is an escaped character class. Treat the '-' verbatim.
1024 if (unicode()) {
1025 // ES2015 21.2.2.15.1 step 1.
1026 return ReportError(CStrVector(kRangeInvalid));
1027 }
988 AddRangeOrEscape(ranges, char_class, first, zone()); 1028 AddRangeOrEscape(ranges, char_class, first, zone());
989 ranges->Add(CharacterRange::Singleton('-'), zone()); 1029 ranges->Add(CharacterRange::Singleton('-'), zone());
990 AddRangeOrEscape(ranges, char_class_2, next, zone()); 1030 AddRangeOrEscape(ranges, char_class_2, next, zone());
991 continue; 1031 continue;
992 } 1032 }
1033 // ES2015 21.2.2.15.1 step 6.
993 if (first.from() > next.to()) { 1034 if (first.from() > next.to()) {
994 return ReportError(CStrVector(kRangeOutOfOrder) CHECK_FAILED); 1035 return ReportError(CStrVector(kRangeOutOfOrder));
995 } 1036 }
996 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone()); 1037 ranges->Add(CharacterRange::Range(first.from(), next.to()), zone());
997 } else { 1038 } else {
998 AddRangeOrEscape(ranges, char_class, first, zone()); 1039 AddRangeOrEscape(ranges, char_class, first, zone());
999 } 1040 }
1000 } 1041 }
1001 if (!has_more()) { 1042 if (!has_more()) {
1002 return ReportError(CStrVector(kUnterminated) CHECK_FAILED); 1043 return ReportError(CStrVector(kUnterminated));
1003 } 1044 }
1004 Advance(); 1045 Advance();
1005 if (ranges->length() == 0) { 1046 if (ranges->length() == 0) {
1006 ranges->Add(CharacterRange::Everything(), zone()); 1047 ranges->Add(CharacterRange::Everything(), zone());
1007 is_negated = !is_negated; 1048 is_negated = !is_negated;
1008 } 1049 }
1009 return new (zone()) RegExpCharacterClass(ranges, is_negated); 1050 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1010 } 1051 }
1011 1052
1012 1053
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after
1155 AddCharacter(static_cast<uc16>(c)); 1196 AddCharacter(static_cast<uc16>(c));
1156 } 1197 }
1157 } 1198 }
1158 1199
1159 1200
1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1201 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1161 1202
1162 1203
1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1204 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1164 if (NeedsDesugaringForUnicode(cc)) { 1205 if (NeedsDesugaringForUnicode(cc)) {
1165 // In unicode mode, character class needs to be desugared, so it 1206 // With /u, character class needs to be desugared, so it
1166 // must be a standalone term instead of being part of a RegExpText. 1207 // must be a standalone term instead of being part of a RegExpText.
1167 AddTerm(cc); 1208 AddTerm(cc);
1168 } else { 1209 } else {
1169 AddAtom(cc); 1210 AddAtom(cc);
1170 } 1211 }
1171 } 1212 }
1172 1213
1173 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) { 1214 void RegExpBuilder::AddCharacterClassForDesugaring(uc32 c) {
1174 AddTerm(new (zone()) RegExpCharacterClass( 1215 AddTerm(new (zone()) RegExpCharacterClass(
1175 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); 1216 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
(...skipping 92 matching lines...) Expand 10 before | Expand all | Expand 10 after
1268 1309
1269 1310
1270 RegExpTree* RegExpBuilder::ToRegExp() { 1311 RegExpTree* RegExpBuilder::ToRegExp() {
1271 FlushTerms(); 1312 FlushTerms();
1272 int num_alternatives = alternatives_.length(); 1313 int num_alternatives = alternatives_.length();
1273 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1314 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1274 if (num_alternatives == 1) return alternatives_.last(); 1315 if (num_alternatives == 1) return alternatives_.last();
1275 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1316 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1276 } 1317 }
1277 1318
1278 1319 bool RegExpBuilder::AddQuantifierToAtom(
1279 void RegExpBuilder::AddQuantifierToAtom(
1280 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1320 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1281 FlushPendingSurrogate(); 1321 FlushPendingSurrogate();
1282 if (pending_empty_) { 1322 if (pending_empty_) {
1283 pending_empty_ = false; 1323 pending_empty_ = false;
1284 return; 1324 return true;
1285 } 1325 }
1286 RegExpTree* atom; 1326 RegExpTree* atom;
1287 if (characters_ != NULL) { 1327 if (characters_ != NULL) {
1288 DCHECK(last_added_ == ADD_CHAR); 1328 DCHECK(last_added_ == ADD_CHAR);
1289 // Last atom was character. 1329 // Last atom was character.
1290 Vector<const uc16> char_vector = characters_->ToConstVector(); 1330 Vector<const uc16> char_vector = characters_->ToConstVector();
1291 int num_chars = char_vector.length(); 1331 int num_chars = char_vector.length();
1292 if (num_chars > 1) { 1332 if (num_chars > 1) {
1293 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1); 1333 Vector<const uc16> prefix = char_vector.SubVector(0, num_chars - 1);
1294 text_.Add(new (zone()) RegExpAtom(prefix), zone()); 1334 text_.Add(new (zone()) RegExpAtom(prefix), zone());
1295 char_vector = char_vector.SubVector(num_chars - 1, num_chars); 1335 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
1296 } 1336 }
1297 characters_ = NULL; 1337 characters_ = NULL;
1298 atom = new (zone()) RegExpAtom(char_vector); 1338 atom = new (zone()) RegExpAtom(char_vector);
1299 FlushText(); 1339 FlushText();
1300 } else if (text_.length() > 0) { 1340 } else if (text_.length() > 0) {
1301 DCHECK(last_added_ == ADD_ATOM); 1341 DCHECK(last_added_ == ADD_ATOM);
1302 atom = text_.RemoveLast(); 1342 atom = text_.RemoveLast();
1303 FlushText(); 1343 FlushText();
1304 } else if (terms_.length() > 0) { 1344 } else if (terms_.length() > 0) {
1305 DCHECK(last_added_ == ADD_ATOM); 1345 DCHECK(last_added_ == ADD_ATOM);
1306 atom = terms_.RemoveLast(); 1346 atom = terms_.RemoveLast();
1347 // With /u, lookarounds are not quantifiable.
1348 if (unicode() && atom->IsLookaround()) return false;
1307 if (atom->max_match() == 0) { 1349 if (atom->max_match() == 0) {
1308 // Guaranteed to only match an empty string. 1350 // Guaranteed to only match an empty string.
1309 LAST(ADD_TERM); 1351 LAST(ADD_TERM);
1310 if (min == 0) { 1352 if (min == 0) {
1311 return; 1353 return true;
1312 } 1354 }
1313 terms_.Add(atom, zone()); 1355 terms_.Add(atom, zone());
1314 return; 1356 return true;
1315 } 1357 }
1316 } else { 1358 } else {
1317 // Only call immediately after adding an atom or character! 1359 // Only call immediately after adding an atom or character!
1318 UNREACHABLE(); 1360 UNREACHABLE();
1319 return; 1361 return false;
1320 } 1362 }
1321 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1363 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1322 zone()); 1364 zone());
1323 LAST(ADD_TERM); 1365 LAST(ADD_TERM);
1366 return true;
1324 } 1367 }
1325 1368
1326 } // namespace internal 1369 } // namespace internal
1327 } // namespace v8 1370 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-regexp-restricted-syntax.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698