Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: fix: /[\00]/u is not allowed. Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 443 matching lines...) Expand 10 before | Expand all | Expand 10 after
454 } else { 454 } else {
455 // With /u, invalid escapes are not treated as identity escapes. 455 // With /u, invalid escapes are not treated as identity escapes.
456 return ReportError(CStrVector("Invalid escape")); 456 return ReportError(CStrVector("Invalid escape"));
457 } 457 }
458 break; 458 break;
459 } 459 }
460 case 'u': { 460 case 'u': {
461 Advance(2); 461 Advance(2);
462 uc32 value; 462 uc32 value;
463 if (ParseUnicodeEscape(&value)) { 463 if (ParseUnicodeEscape(&value)) {
464 builder->AddUnicodeCharacter(value); 464 builder->AddEscapedUnicodeCharacter(value);
465 } else if (!unicode()) { 465 } else if (!unicode()) {
466 builder->AddCharacter('u'); 466 builder->AddCharacter('u');
467 } else { 467 } else {
468 // With /u, invalid escapes are not treated as identity escapes. 468 // With /u, invalid escapes are not treated as identity escapes.
469 return ReportError(CStrVector("Invalid unicode escape")); 469 return ReportError(CStrVector("Invalid unicode escape"));
470 } 470 }
471 break; 471 break;
472 } 472 }
473 default: 473 default:
474 Advance(); 474 Advance();
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after
773 Reset(start); 773 Reset(start);
774 return false; 774 return false;
775 } 775 }
776 val = val * 16 + d; 776 val = val * 16 + d;
777 Advance(); 777 Advance();
778 } 778 }
779 *value = val; 779 *value = val;
780 return true; 780 return true;
781 } 781 }
782 782
783 783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
784 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
786 // allowed). In the latter case, the number of hex digits between { } is 786 // allowed). In the latter case, the number of hex digits between { } is
787 // arbitrary. \ and u have already been read. 787 // arbitrary. \ and u have already been read.
788 if (current() == '{' && unicode()) { 788 if (current() == '{' && unicode()) {
789 int start = position(); 789 int start = position();
790 Advance(); 790 Advance();
791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
792 if (current() == '}') { 792 if (current() == '}') {
793 Advance(); 793 Advance();
794 return true; 794 return true;
795 } 795 }
796 } 796 }
797 Reset(start); 797 Reset(start);
798 return false; 798 return false;
799 } 799 }
800 // \u but no {, or \u{...} escapes not allowed. 800 // \u but no {, or \u{...} escapes not allowed.
801 return ParseHexEscape(4, value); 801 bool result = ParseHexEscape(4, value);
802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
803 current() == '\\') {
804 // Attempt to read trail surrogate.
805 int start = position();
806 if (Next() == 'u') {
807 Advance(2);
808 uc32 trail;
809 if (ParseHexEscape(4, &trail) &&
810 unibrow::Utf16::IsTrailSurrogate(trail)) {
811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
812 static_cast<uc16>(trail));
813 return true;
814 }
815 }
816 Reset(start);
817 }
818 return result;
802 } 819 }
803 820
804 821
805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) { 822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
806 uc32 x = 0; 823 uc32 x = 0;
807 int d = HexValue(current()); 824 int d = HexValue(current());
808 if (d < 0) { 825 if (d < 0) {
809 return false; 826 return false;
810 } 827 }
811 while (d >= 0) { 828 while (d >= 0) {
(...skipping 60 matching lines...) Expand 10 before | Expand all | Expand 10 after
872 return '\\'; 889 return '\\';
873 } 890 }
874 case '0': 891 case '0':
875 case '1': 892 case '1':
876 case '2': 893 case '2':
877 case '3': 894 case '3':
878 case '4': 895 case '4':
879 case '5': 896 case '5':
880 case '6': 897 case '6':
881 case '7': 898 case '7':
882 // For compatibility, we interpret a decimal escape that isn't
883 // a back reference (and therefore either \0 or not valid according
884 // to the specification) as a 1..3 digit octal character code.
885 if (unicode()) { 899 if (unicode()) {
900 // \0 is interpreted as \u0000 if it is not followed by another digit.
901 if (current() == '0') {
902 Advance();
903 if (current() < '0' || current() > '9') return 0;
Yang 2016/02/09 19:18:31 This now matches how we parse \<digit> outside of
904 }
886 // With /u, decimal escape is not interpreted as octal character code. 905 // With /u, decimal escape is not interpreted as octal character code.
887 ReportError(CStrVector("Invalid class escape")); 906 ReportError(CStrVector("Invalid class escape"));
888 return 0; 907 return 0;
889 } 908 }
909 // For backward compatibility, we interpret an escaped digit from 0 to 7 as
910 // a 1..3 digit octal character code.
890 return ParseOctalLiteral(); 911 return ParseOctalLiteral();
891 case 'x': { 912 case 'x': {
892 Advance(); 913 Advance();
893 uc32 value; 914 uc32 value;
894 if (ParseHexEscape(2, &value)) return value; 915 if (ParseHexEscape(2, &value)) return value;
895 if (unicode()) { 916 if (unicode()) {
896 // With /u, invalid escapes are not treated as identity escapes. 917 // With /u, invalid escapes are not treated as identity escapes.
897 ReportError(CStrVector("Invalid escape")); 918 ReportError(CStrVector("Invalid escape"));
898 return 0; 919 return 0;
899 } 920 }
900 // If \x is not followed by a two-digit hexadecimal, treat it 921 // If \x is not followed by a two-digit hexadecimal, treat it
901 // as an identity escape. 922 // as an identity escape.
902 return 'x'; 923 return 'x';
903 } 924 }
904 case 'u': { 925 case 'u': {
905 Advance(); 926 Advance();
906 uc32 value; 927 uc32 value;
907 if (ParseUnicodeEscape(&value)) return value; 928 if (ParseUnicodeEscape(&value)) return value;
908 if (unicode()) { 929 if (unicode()) {
909 // With /u, invalid escapes are not treated as identity escapes. 930 // With /u, invalid escapes are not treated as identity escapes.
910 ReportError(CStrVector("Invalid unicode escape")); 931 ReportError(CStrVector("Invalid unicode escape"));
911 return 0; 932 return 0;
912 } 933 }
913 // If \u is not followed by a four-digit hexadecimal, treat it 934 // If \u is not followed by a four-digit hexadecimal, treat it
914 // as an identity escape. 935 // as an identity escape.
915 return 'u'; 936 return 'u';
916 } 937 }
917 default: { 938 default: {
918 uc32 result = current(); 939 uc32 result = current();
919 // With /u, no identity escapes except for syntax characters are 940 // With /u, no identity escapes except for syntax characters and '-' are
920 // allowed. Otherwise, all identity escapes are allowed. 941 // allowed. Otherwise, all identity escapes are allowed.
921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { 942 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
922 Advance(); 943 Advance();
923 return result; 944 return result;
924 } 945 }
925 ReportError(CStrVector("Invalid escape")); 946 ReportError(CStrVector("Invalid escape"));
926 return 0; 947 return 0;
927 } 948 }
928 } 949 }
929 return 0; 950 return 0;
930 } 951 }
931 952
(...skipping 15 matching lines...) Expand all
947 } 968 }
948 case kEndMarker: 969 case kEndMarker:
949 return ReportError(CStrVector("\\ at end of pattern")); 970 return ReportError(CStrVector("\\ at end of pattern"));
950 default: 971 default:
951 first = ParseClassCharacterEscape(CHECK_FAILED); 972 first = ParseClassCharacterEscape(CHECK_FAILED);
952 } 973 }
953 } else { 974 } else {
954 Advance(); 975 Advance();
955 } 976 }
956 977
957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
958 // Combine with possibly following trail surrogate.
959 int start = position();
960 uc32 second = current();
961 if (second == '\\') {
962 second = ParseClassCharacterEscape(CHECK_FAILED);
963 } else {
964 Advance();
965 }
966 if (unibrow::Utf16::IsTrailSurrogate(second)) {
967 first = unibrow::Utf16::CombineSurrogatePair(first, second);
968 } else {
969 Reset(start);
970 }
971 }
972
973 return CharacterRange::Singleton(first); 978 return CharacterRange::Singleton(first);
974 } 979 }
975 980
976 981
977 static const uc16 kNoCharClass = 0; 982 static const uc16 kNoCharClass = 0;
978 983
979 // Adds range or pre-defined character class to character ranges. 984 // Adds range or pre-defined character class to character ranges.
980 // If char_class is not kNoCharClass, it's interpreted as a class 985 // If char_class is not kNoCharClass, it's interpreted as a class
981 // escape (i.e., 's' means whitespace, from '\s'). 986 // escape (i.e., 's' means whitespace, from '\s').
982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 987 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1196 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1197 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1193 AddLeadSurrogate(c); 1198 AddLeadSurrogate(c);
1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1199 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1195 AddTrailSurrogate(c); 1200 AddTrailSurrogate(c);
1196 } else { 1201 } else {
1197 AddCharacter(static_cast<uc16>(c)); 1202 AddCharacter(static_cast<uc16>(c));
1198 } 1203 }
1199 } 1204 }
1200 1205
1206 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1207 // A lead or trail surrogate parsed via escape sequence will not
1208 // pair up with any preceding lead or following trail surrogate.
1209 FlushPendingSurrogate();
1210 AddUnicodeCharacter(character);
1211 FlushPendingSurrogate();
1212 }
1201 1213
1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1214 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1203 1215
1204 1216
1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1217 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1206 if (NeedsDesugaringForUnicode(cc)) { 1218 if (NeedsDesugaringForUnicode(cc)) {
1207 // With /u, character class needs to be desugared, so it 1219 // With /u, character class needs to be desugared, so it
1208 // must be a standalone term instead of being part of a RegExpText. 1220 // must be a standalone term instead of being part of a RegExpText.
1209 AddTerm(cc); 1221 AddTerm(cc);
1210 } else { 1222 } else {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
1362 return false; 1374 return false;
1363 } 1375 }
1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1376 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1365 zone()); 1377 zone());
1366 LAST(ADD_TERM); 1378 LAST(ADD_TERM);
1367 return true; 1379 return true;
1368 } 1380 }
1369 1381
1370 } // namespace internal 1382 } // namespace internal
1371 } // namespace v8 1383 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698