src/regexp/regexp-parser.cc - Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: addressed comments Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 460 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
471 } else {	471 } else {

472 // With /u, invalid escapes are not treated as identity escapes.	472 // With /u, invalid escapes are not treated as identity escapes.

473 return ReportError(CStrVector("Invalid escape"));	473 return ReportError(CStrVector("Invalid escape"));

474 }	474 }

475 break;	475 break;

476 }	476 }

477 case 'u': {	477 case 'u': {

478 Advance(2);	478 Advance(2);

479 uc32 value;	479 uc32 value;

480 if (ParseUnicodeEscape(&value)) {	480 if (ParseUnicodeEscape(&value)) {

481 builder->AddUnicodeCharacter(value);	481 builder->AddEscapedUnicodeCharacter(value);

482 } else if (!unicode()) {	482 } else if (!unicode()) {

483 builder->AddCharacter('u');	483 builder->AddCharacter('u');

484 } else {	484 } else {

485 // With /u, invalid escapes are not treated as identity escapes.	485 // With /u, invalid escapes are not treated as identity escapes.

486 return ReportError(CStrVector("Invalid unicode escape"));	486 return ReportError(CStrVector("Invalid unicode escape"));

487 }	487 }

488 break;	488 break;

489 }	489 }

490 default:	490 default:

491 Advance();	491 Advance();

(...skipping 298 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
790 Reset(start);	790 Reset(start);

791 return false;	791 return false;

792 }	792 }

793 val = val * 16 + d;	793 val = val * 16 + d;

794 Advance();	794 Advance();

795 }	795 }

796 *value = val;	796 *value = val;

797 return true;	797 return true;

798 }	798 }

799	799

800	800 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.

801 bool RegExpParser::ParseUnicodeEscape(uc32* value) {	801 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are	802 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

803 // allowed). In the latter case, the number of hex digits between { } is	803 // allowed). In the latter case, the number of hex digits between { } is

804 // arbitrary. \ and u have already been read.	804 // arbitrary. \ and u have already been read.

805 if (current() == '{' && unicode()) {	805 if (current() == '{' && unicode()) {

806 int start = position();	806 int start = position();

807 Advance();	807 Advance();

808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {	808 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

809 if (current() == '}') {	809 if (current() == '}') {

810 Advance();	810 Advance();

811 return true;	811 return true;

812 }	812 }

813 }	813 }

814 Reset(start);	814 Reset(start);

815 return false;	815 return false;

816 }	816 }

817 // \u but no {, or \u{...} escapes not allowed.	817 // \u but no {, or \u{...} escapes not allowed.

818 return ParseHexEscape(4, value);	818 bool result = ParseHexEscape(4, value);

	819 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

	820 current() == '\\') {

	821 // Attempt to read trail surrogate.

	822 int start = position();

	823 if (Next() == 'u') {

	824 Advance(2);

	825 uc32 trail;

	826 if (ParseHexEscape(4, &trail) &&

	827 unibrow::Utf16::IsTrailSurrogate(trail)) {

	828 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),

	829 static_cast<uc16>(trail));

	830 return true;

	831 }

	832 }

	833 Reset(start);

	834 }

	835 return result;

819 }	836 }

820	837

821 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {	838 ZoneList<CharacterRange>* RegExpParser::ParsePropertyClass() {

822 #ifdef V8_I18N_SUPPORT	839 #ifdef V8_I18N_SUPPORT

823 char property_name[3];	840 char property_name[3];

824 memset(property_name, 0, sizeof(property_name));	841 memset(property_name, 0, sizeof(property_name));

825 if (current() == '{') {	842 if (current() == '{') {

826 Advance();	843 Advance();

827 if (current() < 'A' || current() > 'Z') return nullptr;	844 if (current() < 'A' || current() > 'Z') return nullptr;

828 property_name[0] = static_cast<char>(current());	845 property_name[0] = static_cast<char>(current());

(...skipping 102 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
931 if ((controlLetter >= '0' && controlLetter <= '9') ||	948 if ((controlLetter >= '0' && controlLetter <= '9') ||

932 controlLetter == '_') {	949 controlLetter == '_') {

933 Advance(2);	950 Advance(2);

934 return controlLetter & 0x1f;	951 return controlLetter & 0x1f;

935 }	952 }

936 // We match JSC in reading the backslash as a literal	953 // We match JSC in reading the backslash as a literal

937 // character instead of as starting an escape.	954 // character instead of as starting an escape.

938 return '\\';	955 return '\\';

939 }	956 }

940 case '0':	957 case '0':

	958 // With /u, \0 is interpreted as NUL if not followed by another digit.

	959 if (unicode() && !(Next() >= '0' && Next() <= '9')) {

	960 Advance();

	961 return 0;

	962 }

	963 // Fall through.

941 case '1':	964 case '1':

942 case '2':	965 case '2':

943 case '3':	966 case '3':

944 case '4':	967 case '4':

945 case '5':	968 case '5':

946 case '6':	969 case '6':

947 case '7':	970 case '7':

948 // For compatibility, we interpret a decimal escape that isn't	971 // For compatibility, we interpret a decimal escape that isn't

949 // a back reference (and therefore either \0 or not valid according	972 // a back reference (and therefore either \0 or not valid according

950 // to the specification) as a 1..3 digit octal character code.	973 // to the specification) as a 1..3 digit octal character code.

(...skipping 24 matching lines...) Expand all Loading...
975 // With /u, invalid escapes are not treated as identity escapes.	998 // With /u, invalid escapes are not treated as identity escapes.

976 ReportError(CStrVector("Invalid unicode escape"));	999 ReportError(CStrVector("Invalid unicode escape"));

977 return 0;	1000 return 0;

978 }	1001 }

979 // If \u is not followed by a two-digit hexadecimal, treat it	1002 // If \u is not followed by a two-digit hexadecimal, treat it

980 // as an identity escape.	1003 // as an identity escape.

981 return 'u';	1004 return 'u';

982 }	1005 }

983 default: {	1006 default: {

984 uc32 result = current();	1007 uc32 result = current();

985 // With /u, no identity escapes except for syntax characters are	1008 // With /u, no identity escapes except for syntax characters and '-' are

986 // allowed. Otherwise, all identity escapes are allowed.	1009 // allowed. Otherwise, all identity escapes are allowed.

987 if (!unicode() || IsSyntaxCharacterOrSlash(result)) {	1010 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {

988 Advance();	1011 Advance();

989 return result;	1012 return result;

990 }	1013 }

991 ReportError(CStrVector("Invalid escape"));	1014 ReportError(CStrVector("Invalid escape"));

992 return 0;	1015 return 0;

993 }	1016 }

994 }	1017 }

995 return 0;	1018 return 0;

996 }	1019 }

997	1020

(...skipping 15 matching lines...) Expand all Loading...
1013 }	1036 }

1014 case kEndMarker:	1037 case kEndMarker:

1015 return ReportError(CStrVector("\\ at end of pattern"));	1038 return ReportError(CStrVector("\\ at end of pattern"));

1016 default:	1039 default:

1017 first = ParseClassCharacterEscape(CHECK_FAILED);	1040 first = ParseClassCharacterEscape(CHECK_FAILED);

1018 }	1041 }

1019 } else {	1042 } else {

1020 Advance();	1043 Advance();

1021 }	1044 }

1022	1045

1023 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

1024 // Combine with possibly following trail surrogate.

1025 int start = position();

1026 uc32 second = current();

1027 if (second == '\\') {

1028 second = ParseClassCharacterEscape(CHECK_FAILED);

1029 } else {

1030 Advance();

1031 }

1032 if (unibrow::Utf16::IsTrailSurrogate(second)) {

1033 first = unibrow::Utf16::CombineSurrogatePair(first, second);

1034 } else {

1035 Reset(start);

1036 }

1037 }

1038

1039 return CharacterRange::Singleton(first);	1046 return CharacterRange::Singleton(first);

1040 }	1047 }

1041	1048

1042	1049

1043 static const uc16 kNoCharClass = 0;	1050 static const uc16 kNoCharClass = 0;

1044	1051

1045 // Adds range or pre-defined character class to character ranges.	1052 // Adds range or pre-defined character class to character ranges.

1046 // If char_class is not kInvalidClass, it's interpreted as a class	1053 // If char_class is not kInvalidClass, it's interpreted as a class

1047 // escape (i.e., 's' means whitespace, from '\s').	1054 // escape (i.e., 's' means whitespace, from '\s').

1048 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	1055 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

(...skipping 208 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1257 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));	1264 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

1258 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {	1265 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

1259 AddLeadSurrogate(c);	1266 AddLeadSurrogate(c);

1260 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {	1267 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

1261 AddTrailSurrogate(c);	1268 AddTrailSurrogate(c);

1262 } else {	1269 } else {

1263 AddCharacter(static_cast<uc16>(c));	1270 AddCharacter(static_cast<uc16>(c));

1264 }	1271 }

1265 }	1272 }

1266	1273

	1274 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {

	1275 // A lead or trail surrogate parsed via escape sequence will not

	1276 // pair up with any preceding lead or following trail surrogate.

	1277 FlushPendingSurrogate();

	1278 AddUnicodeCharacter(character);

	1279 FlushPendingSurrogate();

	1280 }

1267	1281

1268 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1282 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1269	1283

1270	1284

1271 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {	1285 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

1272 if (NeedsDesugaringForUnicode(cc)) {	1286 if (NeedsDesugaringForUnicode(cc)) {

1273 // With /u, character class needs to be desugared, so it	1287 // With /u, character class needs to be desugared, so it

1274 // must be a standalone term instead of being part of a RegExpText.	1288 // must be a standalone term instead of being part of a RegExpText.

1275 AddTerm(cc);	1289 AddTerm(cc);

1276 } else {	1290 } else {

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1428 return false;	1442 return false;

1429 }	1443 }

1430 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1444 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1431 zone());	1445 zone());

1432 LAST(ADD_TERM);	1446 LAST(ADD_TERM);

1433 return true;	1447 return true;

1434 }	1448 }

1435	1449

1436 } // namespace internal	1450 } // namespace internal

1437 } // namespace v8	1451 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »