Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(8)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: addressed comments Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
(...skipping 460 matching lines...) Expand 10 before | Expand all | Expand 10 after
471 } else { 471 } else {
472 // With /u, invalid escapes are not treated as identity escapes. 472 // With /u, invalid escapes are not treated as identity escapes.
473 return ReportError(CStrVector("Invalid escape")); 473 return ReportError(CStrVector("Invalid escape"));
474 } 474 }
475 break; 475 break;
476 } 476 }
477 case 'u': { 477 case 'u': {
478 Advance(2); 478 Advance(2);
479 uc32 value; 479 uc32 value;
480 if (ParseUnicodeEscape(&value)) { 480 if (ParseUnicodeEscape(&value)) {
481 builder->AddUnicodeCharacter(value); 481 builder->AddEscapedUnicodeCharacter(value);
482 } else if (!unicode()) { 482 } else if (!unicode()) {
483 builder->AddCharacter('u'); 483 builder->AddCharacter('u');
484 } else { 484 } else {
485 // With /u, invalid escapes are not treated as identity escapes. 485 // With /u, invalid escapes are not treated as identity escapes.
486 return ReportError(CStrVector("Invalid unicode escape")); 486 return ReportError(CStrVector("Invalid unicode escape"));
487 } 487 }
488 break; 488 break;
489 } 489 }
490 default: 490 default:
491 Advance(); 491 Advance();
(...skipping 298 matching lines...) Expand 10 before | Expand all | Expand 10 after
790 Reset(start); 790 Reset(start);
791 return false; 791 return false;
792 } 792 }
793 val = val * 16 + d; 793 val = val * 16 + d;
794 Advance(); 794 Advance();
795 } 795 }
796 *value = val; 796 *value = val;
797 return true; 797 return true;
798 } 798 }
799 799
800 800 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.
801 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 801 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
803 // allowed). In the latter case, the number of hex digits between { } is 803 // allowed). In the latter case, the number of hex digits between { } is
804 // arbitrary. \ and u have already been read. 804 // arbitrary. \ and u have already been read.
805 if (current() == '{' && unicode()) { 805 if (current() == '{' && unicode()) {
806 int start = position(); 806 int start = position();
807 Advance(); 807 Advance();
808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
809 if (current() == '}') { 809 if (current() == '}') {
810 Advance(); 810 Advance();
811 return true; 811 return true;
812 } 812 }
813 } 813 }
814 Reset(start); 814 Reset(start);
815 return false; 815 return false;
816 } 816 }
817 // \u but no {, or \u{...} escapes not allowed. 817 // \u but no {, or \u{...} escapes not allowed.
818 return ParseHexEscape(4, value); 818 bool result = ParseHexEscape(4, value);
819 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&
820 current() == '\\') {
821 // Attempt to read trail surrogate.
822 int start = position();
823 if (Next() == 'u') {
824 Advance(2);
825 uc32 trail;
826 if (ParseHexEscape(4, &trail) &&
827 unibrow::Utf16::IsTrailSurrogate(trail)) {
828 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),
829 static_cast<uc16>(trail));
830 return true;
831 }
832 }
833 Reset(start);
834 }
835 return result;
819 } 836 }
820 837
821 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() { 838 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {
822 #ifdef V8_I18N_SUPPORT 839 #ifdef V8_I18N_SUPPORT
823 char property_name[3]; 840 char property_name[3];
824 memset(property_name, 0, sizeof(property_name)); 841 memset(property_name, 0, sizeof(property_name));
825 if (current() == '{') { 842 if (current() == '{') {
826 Advance(); 843 Advance();
827 if (current() < 'A' || current() > 'Z') return nullptr; 844 if (current() < 'A' || current() > 'Z') return nullptr;
828 property_name[0] = static_cast<char>(current()); 845 property_name[0] = static_cast<char>(current());
(...skipping 102 matching lines...) Expand 10 before | Expand all | Expand 10 after
931 if ((controlLetter >= '0' && controlLetter <= '9') || 948 if ((controlLetter >= '0' && controlLetter <= '9') ||
932 controlLetter == '_') { 949 controlLetter == '_') {
933 Advance(2); 950 Advance(2);
934 return controlLetter & 0x1f; 951 return controlLetter & 0x1f;
935 } 952 }
936 // We match JSC in reading the backslash as a literal 953 // We match JSC in reading the backslash as a literal
937 // character instead of as starting an escape. 954 // character instead of as starting an escape.
938 return '\\'; 955 return '\\';
939 } 956 }
940 case '0': 957 case '0':
958 // With /u, \0 is interpreted as NUL if not followed by another digit.
959 if (unicode() && !(Next() >= '0' && Next() <= '9')) {
960 Advance();
961 return 0;
962 }
963 // Fall through.
941 case '1': 964 case '1':
942 case '2': 965 case '2':
943 case '3': 966 case '3':
944 case '4': 967 case '4':
945 case '5': 968 case '5':
946 case '6': 969 case '6':
947 case '7': 970 case '7':
948 // For compatibility, we interpret a decimal escape that isn't 971 // For compatibility, we interpret a decimal escape that isn't
949 // a back reference (and therefore either \0 or not valid according 972 // a back reference (and therefore either \0 or not valid according
950 // to the specification) as a 1..3 digit octal character code. 973 // to the specification) as a 1..3 digit octal character code.
(...skipping 24 matching lines...) Expand all
975 // With /u, invalid escapes are not treated as identity escapes. 998 // With /u, invalid escapes are not treated as identity escapes.
976 ReportError(CStrVector("Invalid unicode escape")); 999 ReportError(CStrVector("Invalid unicode escape"));
977 return 0; 1000 return 0;
978 } 1001 }
979 // If \u is not followed by a four-digit hexadecimal, treat it 1002 // If \u is not followed by a four-digit hexadecimal, treat it
980 // as an identity escape. 1003 // as an identity escape.
981 return 'u'; 1004 return 'u';
982 } 1005 }
983 default: { 1006 default: {
984 uc32 result = current(); 1007 uc32 result = current();
985 // With /u, no identity escapes except for syntax characters are 1008 // With /u, no identity escapes except for syntax characters and '-' are
986 // allowed. Otherwise, all identity escapes are allowed. 1009 // allowed. Otherwise, all identity escapes are allowed.
987 if (!unicode() || IsSyntaxCharacterOrSlash(result)) { 1010 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {
988 Advance(); 1011 Advance();
989 return result; 1012 return result;
990 } 1013 }
991 ReportError(CStrVector("Invalid escape")); 1014 ReportError(CStrVector("Invalid escape"));
992 return 0; 1015 return 0;
993 } 1016 }
994 } 1017 }
995 return 0; 1018 return 0;
996 } 1019 }
997 1020
(...skipping 15 matching lines...) Expand all
1013 } 1036 }
1014 case kEndMarker: 1037 case kEndMarker:
1015 return ReportError(CStrVector("\\ at end of pattern")); 1038 return ReportError(CStrVector("\\ at end of pattern"));
1016 default: 1039 default:
1017 first = ParseClassCharacterEscape(CHECK_FAILED); 1040 first = ParseClassCharacterEscape(CHECK_FAILED);
1018 } 1041 }
1019 } else { 1042 } else {
1020 Advance(); 1043 Advance();
1021 } 1044 }
1022 1045
1023 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
1024 // Combine with possibly following trail surrogate.
1025 int start = position();
1026 uc32 second = current();
1027 if (second == '\\') {
1028 second = ParseClassCharacterEscape(CHECK_FAILED);
1029 } else {
1030 Advance();
1031 }
1032 if (unibrow::Utf16::IsTrailSurrogate(second)) {
1033 first = unibrow::Utf16::CombineSurrogatePair(first, second);
1034 } else {
1035 Reset(start);
1036 }
1037 }
1038
1039 return CharacterRange::Singleton(first); 1046 return CharacterRange::Singleton(first);
1040 } 1047 }
1041 1048
1042 1049
1043 static const uc16 kNoCharClass = 0; 1050 static const uc16 kNoCharClass = 0;
1044 1051
1045 // Adds range or pre-defined character class to character ranges. 1052 // Adds range or pre-defined character class to character ranges.
1046 // If char_class is not kNoCharClass, it's interpreted as a class 1053 // If char_class is not kNoCharClass, it's interpreted as a class
1047 // escape (i.e., 's' means whitespace, from '\s'). 1054 // escape (i.e., 's' means whitespace, from '\s').
1048 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 1055 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
(...skipping 208 matching lines...) Expand 10 before | Expand all | Expand 10 after
1257 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1264 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1258 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1265 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1259 AddLeadSurrogate(c); 1266 AddLeadSurrogate(c);
1260 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1267 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1261 AddTrailSurrogate(c); 1268 AddTrailSurrogate(c);
1262 } else { 1269 } else {
1263 AddCharacter(static_cast<uc16>(c)); 1270 AddCharacter(static_cast<uc16>(c));
1264 } 1271 }
1265 } 1272 }
1266 1273
1274 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {
1275 // A lead or trail surrogate parsed via escape sequence will not
1276 // pair up with any preceding lead or following trail surrogate.
1277 FlushPendingSurrogate();
1278 AddUnicodeCharacter(character);
1279 FlushPendingSurrogate();
1280 }
1267 1281
1268 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1282 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1269 1283
1270 1284
1271 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1285 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1272 if (NeedsDesugaringForUnicode(cc)) { 1286 if (NeedsDesugaringForUnicode(cc)) {
1273 // With /u, character class needs to be desugared, so it 1287 // With /u, character class needs to be desugared, so it
1274 // must be a standalone term instead of being part of a RegExpText. 1288 // must be a standalone term instead of being part of a RegExpText.
1275 AddTerm(cc); 1289 AddTerm(cc);
1276 } else { 1290 } else {
(...skipping 151 matching lines...) Expand 10 before | Expand all | Expand 10 after
1428 return false; 1442 return false;
1429 } 1443 }
1430 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1444 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1431 zone()); 1445 zone());
1432 LAST(ADD_TERM); 1446 LAST(ADD_TERM);
1433 return true; 1447 return true;
1434 } 1448 }
1435 1449
1436 } // namespace internal 1450 } // namespace internal
1437 } // namespace v8 1451 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698