src/regexp/regexp-parser.cc - Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: . Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 443 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
454 } else {	454 } else {

455 // With /u, invalid escapes are not treated as identity escapes.	455 // With /u, invalid escapes are not treated as identity escapes.

456 return ReportError(CStrVector("Invalid escape"));	456 return ReportError(CStrVector("Invalid escape"));

457 }	457 }

458 break;	458 break;

459 }	459 }

460 case 'u': {	460 case 'u': {

461 Advance(2);	461 Advance(2);

462 uc32 value;	462 uc32 value;

463 if (ParseUnicodeEscape(&value)) {	463 if (ParseUnicodeEscape(&value)) {

464 builder->AddUnicodeCharacter(value);	464 builder->AddEscapedUnicodeCharacter(value);

465 } else if (!unicode()) {	465 } else if (!unicode()) {

466 builder->AddCharacter('u');	466 builder->AddCharacter('u');

467 } else {	467 } else {

468 // With /u, invalid escapes are not treated as identity escapes.	468 // With /u, invalid escapes are not treated as identity escapes.

469 return ReportError(CStrVector("Invalid unicode escape"));	469 return ReportError(CStrVector("Invalid unicode escape"));

470 }	470 }

471 break;	471 break;

472 }	472 }

473 default:	473 default:

474 Advance();	474 Advance();

(...skipping 298 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
773 Reset(start);	773 Reset(start);

774 return false;	774 return false;

775 }	775 }

776 val = val * 16 + d;	776 val = val * 16 + d;

777 Advance();	777 Advance();

778 }	778 }

779 *value = val;	779 *value = val;

780 return true;	780 return true;

781 }	781 }

782	782

783	783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.

784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {	784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are	785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

786 // allowed). In the latter case, the number of hex digits between { } is	786 // allowed). In the latter case, the number of hex digits between { } is

787 // arbitrary. \ and u have already been read.	787 // arbitrary. \ and u have already been read.

788 if (current() == '{' && unicode()) {	788 if (current() == '{' && unicode()) {

789 int start = position();	789 int start = position();

790 Advance();	790 Advance();

791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {	791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

792 if (current() == '}') {	792 if (current() == '}') {

793 Advance();	793 Advance();

794 return true;	794 return true;

795 }	795 }

796 }	796 }

797 Reset(start);	797 Reset(start);

798 return false;	798 return false;

799 }	799 }

800 // \u but no {, or \u{...} escapes not allowed.	800 // \u but no {, or \u{...} escapes not allowed.

801 return ParseHexEscape(4, value);	801 bool result = ParseHexEscape(4, value);

	802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

	803 current() == '\\') {

	804 // Attempt to read trail surrogate.

	805 int start = position();

	806 if (Next() == 'u') {

	807 Advance(2);

	808 uc32 trail;

	809 if (ParseHexEscape(4, &trail) &&

	810 unibrow::Utf16::IsTrailSurrogate(trail)) {

	811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),

	812 static_cast<uc16>(trail));

	813 return true;

	814 }

	815 }

	816 Reset(start);

	817 }

	818 return result;

802 }	819 }

803	820

804	821

805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {	822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

806 uc32 x = 0;	823 uc32 x = 0;

807 int d = HexValue(current());	824 int d = HexValue(current());

808 if (d < 0) {	825 if (d < 0) {

809 return false;	826 return false;

810 }	827 }

811 while (d >= 0) {	828 while (d >= 0) {

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
865 if ((controlLetter >= '0' && controlLetter <= '9') ||	882 if ((controlLetter >= '0' && controlLetter <= '9') ||

866 controlLetter == '_') {	883 controlLetter == '_') {

867 Advance(2);	884 Advance(2);

868 return controlLetter & 0x1f;	885 return controlLetter & 0x1f;

869 }	886 }

870 // We match JSC in reading the backslash as a literal	887 // We match JSC in reading the backslash as a literal

871 // character instead of as starting an escape.	888 // character instead of as starting an escape.

872 return '\\';	889 return '\\';

873 }	890 }

874 case '0':	891 case '0':

	892 // With /u, \0 is interpreted as NUL if not followed by another digit.

	893 if (unicode() && !(Next() >= '0' && Next() <= '9')) {

	894 Advance();

	895 return 0;

	896 }

	897 // Fall through.

875 case '1':	898 case '1':

876 case '2':	899 case '2':

877 case '3':	900 case '3':

878 case '4':	901 case '4':

879 case '5':	902 case '5':

880 case '6':	903 case '6':

881 case '7':	904 case '7':

882 // For compatibility, we interpret a decimal escape that isn't	905 // For compatibility, we interpret a decimal escape that isn't

883 // a back reference (and therefore either \0 or not valid according	906 // a back reference (and therefore either \0 or not valid according

884 // to the specification) as a 1..3 digit octal character code.	907 // to the specification) as a 1..3 digit octal character code.

(...skipping 24 matching lines...) Expand all Loading...
909 // With /u, invalid escapes are not treated as identity escapes.	932 // With /u, invalid escapes are not treated as identity escapes.

910 ReportError(CStrVector("Invalid unicode escape"));	933 ReportError(CStrVector("Invalid unicode escape"));

911 return 0;	934 return 0;

912 }	935 }

913 // If \u is not followed by a two-digit hexadecimal, treat it	936 // If \u is not followed by a two-digit hexadecimal, treat it

914 // as an identity escape.	937 // as an identity escape.

915 return 'u';	938 return 'u';

916 }	939 }

917 default: {	940 default: {

918 uc32 result = current();	941 uc32 result = current();

919 // With /u, no identity escapes except for syntax characters are	942 // With /u, no identity escapes except for syntax characters and '-' are

920 // allowed. Otherwise, all identity escapes are allowed.	943 // allowed. Otherwise, all identity escapes are allowed.

921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) {	944 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {

922 Advance();	945 Advance();

923 return result;	946 return result;

924 }	947 }

925 ReportError(CStrVector("Invalid escape"));	948 ReportError(CStrVector("Invalid escape"));

926 return 0;	949 return 0;

927 }	950 }

928 }	951 }

929 return 0;	952 return 0;

930 }	953 }

931	954

(...skipping 15 matching lines...) Expand all Loading...
947 }	970 }

948 case kEndMarker:	971 case kEndMarker:

949 return ReportError(CStrVector("\\ at end of pattern"));	972 return ReportError(CStrVector("\\ at end of pattern"));

950 default:	973 default:

951 first = ParseClassCharacterEscape(CHECK_FAILED);	974 first = ParseClassCharacterEscape(CHECK_FAILED);

952 }	975 }

953 } else {	976 } else {

954 Advance();	977 Advance();

955 }	978 }

956	979

957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

958 // Combine with possibly following trail surrogate.

959 int start = position();

960 uc32 second = current();

961 if (second == '\\') {

962 second = ParseClassCharacterEscape(CHECK_FAILED);

963 } else {

964 Advance();

965 }

966 if (unibrow::Utf16::IsTrailSurrogate(second)) {

967 first = unibrow::Utf16::CombineSurrogatePair(first, second);

968 } else {

969 Reset(start);

970 }

971 }

972

973 return CharacterRange::Singleton(first);	980 return CharacterRange::Singleton(first);

974 }	981 }

975	982

976	983

977 static const uc16 kNoCharClass = 0;	984 static const uc16 kNoCharClass = 0;

978	985

979 // Adds range or pre-defined character class to character ranges.	986 // Adds range or pre-defined character class to character ranges.

980 // If char_class is not kInvalidClass, it's interpreted as a class	987 // If char_class is not kInvalidClass, it's interpreted as a class

981 // escape (i.e., 's' means whitespace, from '\s').	988 // escape (i.e., 's' means whitespace, from '\s').

982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	989 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

(...skipping 208 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));	1198 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {	1199 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

1193 AddLeadSurrogate(c);	1200 AddLeadSurrogate(c);

1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {	1201 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

1195 AddTrailSurrogate(c);	1202 AddTrailSurrogate(c);

1196 } else {	1203 } else {

1197 AddCharacter(static_cast<uc16>(c));	1204 AddCharacter(static_cast<uc16>(c));

1198 }	1205 }

1199 }	1206 }

1200	1207

	1208 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {

	1209 // A lead or trail surrogate parsed via escape sequence will not

	1210 // pair up with any preceding lead or following trail surrogate.

	1211 FlushPendingSurrogate();

	1212 AddUnicodeCharacter(character);

	1213 FlushPendingSurrogate();

	1214 }

1201	1215

1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1216 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1203	1217

1204	1218

1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {	1219 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

1206 if (NeedsDesugaringForUnicode(cc)) {	1220 if (NeedsDesugaringForUnicode(cc)) {

1207 // With /u, character class needs to be desugared, so it	1221 // With /u, character class needs to be desugared, so it

1208 // must be a standalone term instead of being part of a RegExpText.	1222 // must be a standalone term instead of being part of a RegExpText.

1209 AddTerm(cc);	1223 AddTerm(cc);

1210 } else {	1224 } else {

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1362 return false;	1376 return false;

1363 }	1377 }

1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1378 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1365 zone());	1379 zone());

1366 LAST(ADD_TERM);	1380 LAST(ADD_TERM);

1367 return true;	1381 return true;

1368 }	1382 }

1369	1383

1370 } // namespace internal	1384 } // namespace internal

1371 } // namespace v8	1385 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('J')