src/regexp/regexp-parser.cc - Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1681893002: [regexp] parse RegExpUnicodeEscapeSequence according to spec. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: fix: /[\00]/u is not allowed. Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 443 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
454 } else {	454 } else {

455 // With /u, invalid escapes are not treated as identity escapes.	455 // With /u, invalid escapes are not treated as identity escapes.

456 return ReportError(CStrVector("Invalid escape"));	456 return ReportError(CStrVector("Invalid escape"));

457 }	457 }

458 break;	458 break;

459 }	459 }

460 case 'u': {	460 case 'u': {

461 Advance(2);	461 Advance(2);

462 uc32 value;	462 uc32 value;

463 if (ParseUnicodeEscape(&value)) {	463 if (ParseUnicodeEscape(&value)) {

464 builder->AddUnicodeCharacter(value);	464 builder->AddEscapedUnicodeCharacter(value);

465 } else if (!unicode()) {	465 } else if (!unicode()) {

466 builder->AddCharacter('u');	466 builder->AddCharacter('u');

467 } else {	467 } else {

468 // With /u, invalid escapes are not treated as identity escapes.	468 // With /u, invalid escapes are not treated as identity escapes.

469 return ReportError(CStrVector("Invalid unicode escape"));	469 return ReportError(CStrVector("Invalid unicode escape"));

470 }	470 }

471 break;	471 break;

472 }	472 }

473 default:	473 default:

474 Advance();	474 Advance();

(...skipping 298 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
773 Reset(start);	773 Reset(start);

774 return false;	774 return false;

775 }	775 }

776 val = val * 16 + d;	776 val = val * 16 + d;

777 Advance();	777 Advance();

778 }	778 }

779 *value = val;	779 *value = val;

780 return true;	780 return true;

781 }	781 }

782	782

783	783 // This parses RegExpUnicodeEscapeSequence as described in ECMA262.

784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {	784 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are	785 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

786 // allowed). In the latter case, the number of hex digits between { } is	786 // allowed). In the latter case, the number of hex digits between { } is

787 // arbitrary. \ and u have already been read.	787 // arbitrary. \ and u have already been read.

788 if (current() == '{' && unicode()) {	788 if (current() == '{' && unicode()) {

789 int start = position();	789 int start = position();

790 Advance();	790 Advance();

791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {	791 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

792 if (current() == '}') {	792 if (current() == '}') {

793 Advance();	793 Advance();

794 return true;	794 return true;

795 }	795 }

796 }	796 }

797 Reset(start);	797 Reset(start);

798 return false;	798 return false;

799 }	799 }

800 // \u but no {, or \u{...} escapes not allowed.	800 // \u but no {, or \u{...} escapes not allowed.

801 return ParseHexEscape(4, value);	801 bool result = ParseHexEscape(4, value);

	802 if (result && unicode() && unibrow::Utf16::IsLeadSurrogate(*value) &&

	803 current() == '\\') {

	804 // Attempt to read trail surrogate.

	805 int start = position();

	806 if (Next() == 'u') {

	807 Advance(2);

	808 uc32 trail;

	809 if (ParseHexEscape(4, &trail) &&

	810 unibrow::Utf16::IsTrailSurrogate(trail)) {

	811 *value = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(*value),

	812 static_cast<uc16>(trail));

	813 return true;

	814 }

	815 }

	816 Reset(start);

	817 }

	818 return result;

802 }	819 }

803	820

804	821

805 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {	822 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

806 uc32 x = 0;	823 uc32 x = 0;

807 int d = HexValue(current());	824 int d = HexValue(current());

808 if (d < 0) {	825 if (d < 0) {

809 return false;	826 return false;

810 }	827 }

811 while (d >= 0) {	828 while (d >= 0) {

(...skipping 60 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
872 return '\\';	889 return '\\';

873 }	890 }

874 case '0':	891 case '0':

875 case '1':	892 case '1':

876 case '2':	893 case '2':

877 case '3':	894 case '3':

878 case '4':	895 case '4':

879 case '5':	896 case '5':

880 case '6':	897 case '6':

881 case '7':	898 case '7':

882 // For compatibility, we interpret a decimal escape that isn't

883 // a back reference (and therefore either \0 or not valid according

884 // to the specification) as a 1..3 digit octal character code.

885 if (unicode()) {	899 if (unicode()) {

	900 // \0 is interpreted as \u0000 if it is not followed by another digit.

	901 if (current() == '0') {

	902 Advance();

	903 if (current() < '0' || current() > '9') return 0;
	Yang 2016/02/09 19:18:31 This now matches how we parse \<digit> outside of a character class.
	904 }

886 // With /u, decimal escape is not interpreted as octal character code.	905 // With /u, decimal escape is not interpreted as octal character code.

887 ReportError(CStrVector("Invalid class escape"));	906 ReportError(CStrVector("Invalid class escape"));

888 return 0;	907 return 0;

889 }	908 }

	909 // For backward compatibility, we interpret escaped digit from 0 to 7 as

	910 // a 1..3 digit octal character code.

890 return ParseOctalLiteral();	911 return ParseOctalLiteral();

891 case 'x': {	912 case 'x': {

892 Advance();	913 Advance();

893 uc32 value;	914 uc32 value;

894 if (ParseHexEscape(2, &value)) return value;	915 if (ParseHexEscape(2, &value)) return value;

895 if (unicode()) {	916 if (unicode()) {

896 // With /u, invalid escapes are not treated as identity escapes.	917 // With /u, invalid escapes are not treated as identity escapes.

897 ReportError(CStrVector("Invalid escape"));	918 ReportError(CStrVector("Invalid escape"));

898 return 0;	919 return 0;

899 }	920 }

900 // If \x is not followed by a two-digit hexadecimal, treat it	921 // If \x is not followed by a two-digit hexadecimal, treat it

901 // as an identity escape.	922 // as an identity escape.

902 return 'x';	923 return 'x';

903 }	924 }

904 case 'u': {	925 case 'u': {

905 Advance();	926 Advance();

906 uc32 value;	927 uc32 value;

907 if (ParseUnicodeEscape(&value)) return value;	928 if (ParseUnicodeEscape(&value)) return value;

908 if (unicode()) {	929 if (unicode()) {

909 // With /u, invalid escapes are not treated as identity escapes.	930 // With /u, invalid escapes are not treated as identity escapes.

910 ReportError(CStrVector("Invalid unicode escape"));	931 ReportError(CStrVector("Invalid unicode escape"));

911 return 0;	932 return 0;

912 }	933 }

913 // If \u is not followed by a four-digit hexadecimal, treat it	934 // If \u is not followed by a four-digit hexadecimal, treat it

914 // as an identity escape.	935 // as an identity escape.

915 return 'u';	936 return 'u';

916 }	937 }

917 default: {	938 default: {

918 uc32 result = current();	939 uc32 result = current();

919 // With /u, no identity escapes except for syntax characters are	940 // With /u, no identity escapes except for syntax characters and '-' are

920 // allowed. Otherwise, all identity escapes are allowed.	941 // allowed. Otherwise, all identity escapes are allowed.

921 if (!unicode() || IsSyntaxCharacterOrSlash(result)) {	942 if (!unicode() || IsSyntaxCharacterOrSlash(result) || result == '-') {

922 Advance();	943 Advance();

923 return result;	944 return result;

924 }	945 }

925 ReportError(CStrVector("Invalid escape"));	946 ReportError(CStrVector("Invalid escape"));

926 return 0;	947 return 0;

927 }	948 }

928 }	949 }

929 return 0;	950 return 0;

930 }	951 }

931	952

(...skipping 15 matching lines...) Expand all Loading...
947 }	968 }

948 case kEndMarker:	969 case kEndMarker:

949 return ReportError(CStrVector("\\ at end of pattern"));	970 return ReportError(CStrVector("\\ at end of pattern"));

950 default:	971 default:

951 first = ParseClassCharacterEscape(CHECK_FAILED);	972 first = ParseClassCharacterEscape(CHECK_FAILED);

952 }	973 }

953 } else {	974 } else {

954 Advance();	975 Advance();

955 }	976 }

956	977

957 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

958 // Combine with possibly following trail surrogate.

959 int start = position();

960 uc32 second = current();

961 if (second == '\\') {

962 second = ParseClassCharacterEscape(CHECK_FAILED);

963 } else {

964 Advance();

965 }

966 if (unibrow::Utf16::IsTrailSurrogate(second)) {

967 first = unibrow::Utf16::CombineSurrogatePair(first, second);

968 } else {

969 Reset(start);

970 }

971 }

972

973 return CharacterRange::Singleton(first);	978 return CharacterRange::Singleton(first);

974 }	979 }

975	980

976	981

977 static const uc16 kNoCharClass = 0;	982 static const uc16 kNoCharClass = 0;

978	983

979 // Adds range or pre-defined character class to character ranges.	984 // Adds range or pre-defined character class to character ranges.

980 // If char_class is not kNoCharClass, it's interpreted as a class	985 // If char_class is not kNoCharClass, it's interpreted as a class

981 // escape (i.e., 's' means whitespace, from '\s').	986 // escape (i.e., 's' means whitespace, from '\s').

982 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	987 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

(...skipping 208 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1191 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));	1196 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

1192 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {	1197 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

1193 AddLeadSurrogate(c);	1198 AddLeadSurrogate(c);

1194 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {	1199 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

1195 AddTrailSurrogate(c);	1200 AddTrailSurrogate(c);

1196 } else {	1201 } else {

1197 AddCharacter(static_cast<uc16>(c));	1202 AddCharacter(static_cast<uc16>(c));

1198 }	1203 }

1199 }	1204 }

1200	1205

	1206 void RegExpBuilder::AddEscapedUnicodeCharacter(uc32 character) {

	1207 // A lead or trail surrogate parsed via escape sequence will not

	1208 // pair up with any preceding lead or following trail surrogate.

	1209 FlushPendingSurrogate();

	1210 AddUnicodeCharacter(character);

	1211 FlushPendingSurrogate();

	1212 }

1201	1213

1202 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1214 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1203	1215

1204	1216

1205 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {	1217 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

1206 if (NeedsDesugaringForUnicode(cc)) {	1218 if (NeedsDesugaringForUnicode(cc)) {

1207 // With /u, character class needs to be desugared, so it	1219 // With /u, character class needs to be desugared, so it

1208 // must be a standalone term instead of being part of a RegExpText.	1220 // must be a standalone term instead of being part of a RegExpText.

1209 AddTerm(cc);	1221 AddTerm(cc);

1210 } else {	1222 } else {

(...skipping 151 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1362 return false;	1374 return false;

1363 }	1375 }

1364 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1376 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1365 zone());	1377 zone());

1366 LAST(ADD_TERM);	1378 LAST(ADD_TERM);

1367 return true;	1379 return true;

1368 }	1380 }

1369	1381

1370 } // namespace internal	1382 } // namespace internal

1371 } // namespace v8	1383 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/mjsunit/harmony/unicode-escapes-in-regexps.js » ('j') | no next file with comments »