Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(311)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: . Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after
454 } else { 454 } else {
455 // With /u, invalid escapes are not treated as identity escapes. 455 // With /u, invalid escapes are not treated as identity escapes.
456 return ReportError(CStrVector("Invalid escape")); 456 return ReportError(CStrVector("Invalid escape"));
457 } 457 }
458 break; 458 break;
459 } 459 }
460 case 'u': { 460 case 'u': {
461 Advance(2); 461 Advance(2);
462 uc32 value; 462 uc32 value;
463 if (ParseUnicodeEscape(&value)) { 463 if (ParseUnicodeEscape(&value)) {
464 builder->AddUnicodeCharacter(value); 464 builder->AddEscapedUnicodeCharacter(value);
465 } else if (!unicode()) { 465 } else if (!unicode()) {
466 builder->AddCharacter('u'); 466 builder->AddCharacter('u');
467 } else { 467 } else {
468 // With /u, invalid escapes are not treated as identity escapes. 468 // With /u, invalid escapes are not treated as identity escapes.
469 return ReportError(CStrVector("Invalid unicode escape")); 469 return ReportError(CStrVector("Invalid unicode escape"));
470 } 470 }
471 break; 471 break;
472 } 472 }
473 default: 473 default:
474 Advance(); 474 Advance();
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after
773 Reset(start); 773 Reset(start);
774 return false; 774 return false;
775 } 775 }
776 val = val * 16 + d; 776 val = val * 16 + d;
777 Advance(); 777 Advance();
778 } 778 }
779 *value = val; 779 *value = val;
780 return true; 780 return true;
781 } 781 }
782 782
783 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
786 // allowed). In the latter case, the number of hex digits between { } is 786 // allowed). In the latter case, the number of hex digits between { } is
787 // arbitrary. \ and u have already been read. 787 // arbitrary. \ and u have already been read.
788 if (current() == '{' && unicode()) { 788 if (current() == '{' && unicode()) {
789 int start = position(); 789 int start = position();
790 Advance(); 790 Advance();
791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
792 if (current() == '}') { 792 if (current() == '}') {
793 Advance(); 793 Advance();
794 return true; 794 return true;
795 } 795 }
796 } 796 }
797 Reset(start); 797 Reset(start);
798 return false; 798 return false;
799 } 799 }
800 // \u but no {, or \u{...} escapes not allowed. 800 // \u but no {, or \u{...} escapes not allowed.
801 return ParseHexEscape(4, value); 801 bool result = ParseHexEscape(4, value);
802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
803 current() == '\\') {
804 // Attempt to read trail surrogate.
805 int start = position();
806 if (Next() == 'u') {
807 Advance(2);
808 uc32 trail;
809 if (ParseHexEscape(4, &trail) &&
810 unibrow::Utf16::IsTrailSurrogate(trail)) {
811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
812 static_cast<uc16>(trail));
813 return true;
814 }
815 }
816 Reset(start);
817 }
818 return result;
802 } 819 }
803 820
804 821
805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
806 uc32 x = 0; 823 uc32 x = 0;
807 int d = HexValue(current()); 824 int d = HexValue(current());
808 if (d < 0) { 825 if (d < 0) {
809 return false; 826 return false;
810 } 827 }
811 while (d >= 0) { 828 while (d >= 0) {
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
865 if ((controlLetter >= '0' && controlLetter <= '9') || 882 if ((controlLetter >= '0' && controlLetter <= '9') ||
866 controlLetter == '_') { 883 controlLetter == '_') {
867 Advance(2); 884 Advance(2);
868 return controlLetter & 0x1f; 885 return controlLetter & 0x1f;
869 } 886 }
870 // We match JSC in reading the backslash as a literal 887 // We match JSC in reading the backslash as a literal
871 // character instead of as starting an escape. 888 // character instead of as starting an escape.
872 return '\\'; 889 return '\\';
873 } 890 }
874 case '0': 891 case '0':
892 // With /u, \0 is interpreted as NUL if not followed by another digit.
893 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
894 Advance();
895 return 0;
896 }
897 // Fall through.
875 case '1': 898 case '1':
876 case '2': 899 case '2':
877 case '3': 900 case '3':
878 case '4': 901 case '4':
879 case '5': 902 case '5':
880 case '6': 903 case '6':
881 case '7': 904 case '7':
882 // For compatibility, we interpret a decimal escape that isn't 905 // For compatibility, we interpret a decimal escape that isn't
883 // a back reference (and therefore either \0 or not valid according 906 // a back reference (and therefore either \0 or not valid according
884 // to the specification) as a 1..3 digit octal character code. 907 // to the specification) as a 1..3 digit octal character code.
(...skipping 24 matching lines...) Expand all
909 // With /u, invalid escapes are not treated as identity escapes. 932 // With /u, invalid escapes are not treated as identity escapes.
910 ReportError(CStrVector("Invalid unicode escape")); 933 ReportError(CStrVector("Invalid unicode escape"));
911 return 0; 934 return 0;
912 } 935 }
913 // If \u is not followed by a four-digit hexadecimal, treat it 936 // If \u is not followed by a four-digit hexadecimal, treat it
914 // as an identity escape. 937 // as an identity escape.
915 return 'u'; 938 return 'u';
916 } 939 }
917 default: { 940 default: {
918 uc32 result = current(); 941 uc32 result = current();
919 // With /u, no identity escapes except for syntax characters are 942 // With /u, no identity escapes except for syntax characters and '-' are
920 // allowed. Otherwise, all identity escapes are allowed. 943 // allowed. Otherwise, all identity escapes are allowed.
921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { 944 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
922 Advance(); 945 Advance();
923 return result; 946 return result;
924 } 947 }
925 ReportError(CStrVector("Invalid escape")); 948 ReportError(CStrVector("Invalid escape"));
926 return 0; 949 return 0;
927 } 950 }
928 } 951 }
929 return 0; 952 return 0;
930 } 953 }
931 954
(...skipping 15 matching lines...) Expand all
947 } 970 }
948 case kEndMarker: 971 case kEndMarker:
949 return ReportError(CStrVector("\\ at end of pattern")); 972 return ReportError(CStrVector("\\ at end of pattern"));
950 default: 973 default:
951 first = ParseClassCharacterEscape(CHECK_FAILED); 974 first = ParseClassCharacterEscape(CHECK_FAILED);
952 } 975 }
953 } else { 976 } else {
954 Advance(); 977 Advance();
955 } 978 }
956 979
957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
958 // Combine with possibly following trail surrogate.
959 int start = position();
960 uc32 second = current();
961 if (second == '\\') {
962 second = ParseClassCharacterEscape(CHECK_FAILED);
963 } else {
964 Advance();
965 }
966 if (unibrow::Utf16::IsTrailSurrogate(second)) {
967 first = unibrow::Utf16::CombineSurrogatePair(first, second);
968 } else {
969 Reset(start);
970 }
971 }
972
973 return CharacterRange::Singleton(first); 980 return CharacterRange::Singleton(first);
974 } 981 }
975 982
976 983
977 static const uc16 kNoCharClass = 0; 984 static const uc16 kNoCharClass = 0;
978 985
979 // Adds range or pre-defined character class to character ranges. 986 // Adds range or pre-defined character class to character ranges.
980 // If char_class is not kNoCharClass, it's interpreted as a class 987 // If char_class is not kNoCharClass, it's interpreted as a class
981 // escape (i.e., 's' means whitespace, from '\s'). 988 // escape (i.e., 's' means whitespace, from '\s').
982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 989 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1198 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1199 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1193 AddLeadSurrogate(c); 1200 AddLeadSurrogate(c);
1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1201 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1195 AddTrailSurrogate(c); 1202 AddTrailSurrogate(c);
1196 } else { 1203 } else {
1197 AddCharacter(static_cast<uc16>(c)); 1204 AddCharacter(static_cast<uc16>(c));
1198 } 1205 }
1199 } 1206 }
1200 1207
1208 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1209 // A lead or trail surrogate parsed via escape sequence will not
1210 // pair up with any preceding lead or following trail surrogate.
1211 FlushPendingSurrogate();
1212 AddUnicodeCharacter(character);
1213 FlushPendingSurrogate();
1214 }
1201 1215
1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1216 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1203 1217
1204 1218
1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1219 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1206 if (NeedsDesugaringForUnicode(cc)) { 1220 if (NeedsDesugaringForUnicode(cc)) {
1207 // With /u, character class needs to be desugared, so it 1221 // With /u, character class needs to be desugared, so it
1208 // must be a standalone term instead of being part of a RegExpText. 1222 // must be a standalone term instead of being part of a RegExpText.
1209 AddTerm(cc); 1223 AddTerm(cc);
1210 } else { 1224 } else {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
1362 return false; 1376 return false;
1363 } 1377 }
1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1378 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1365 zone()); 1379 zone());
1366 LAST(ADD_TERM); 1380 LAST(ADD_TERM);
1367 return true; 1381 return true;
1368 } 1382 }
1369 1383
1370 } // namespace internal 1384 } // namespace internal
1371 } // namespace v8 1385 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698