src/parser.cc - Issue 788043005: ES6 unicode escapes, part 2: Regexps.

Side by Side Diff: src/parser.cc

Issue 788043005: ES6 unicode escapes, part 2: Regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: error reporting Created 5 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/api.h"	7 #include "src/api.h"

8 #include "src/ast.h"	8 #include "src/ast.h"

9 #include "src/bailout-reason.h"	9 #include "src/bailout-reason.h"

10 #include "src/base/platform/platform.h"	10 #include "src/base/platform/platform.h"

(...skipping 4260 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4271 }	4271 }

4272 isolate()->counters()->total_preparse_skipped()->Increment(	4272 isolate()->counters()->total_preparse_skipped()->Increment(

4273 total_preparse_skipped_);	4273 total_preparse_skipped_);

4274 }	4274 }

4275	4275

4276	4276

4277 // ----------------------------------------------------------------------------	4277 // ----------------------------------------------------------------------------

4278 // Regular expressions	4278 // Regular expressions

4279	4279

4280	4280

4281 RegExpParser::RegExpParser(FlatStringReader* in,	4281 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

4282 Handle<String>* error,	4282 bool multiline, bool unicode, Zone* zone)

4283 bool multiline,

4284 Zone* zone)

4285 : isolate_(zone->isolate()),	4283 : isolate_(zone->isolate()),

4286 zone_(zone),	4284 zone_(zone),

4287 error_(error),	4285 error_(error),

4288 captures_(NULL),	4286 captures_(NULL),

4289 in_(in),	4287 in_(in),

4290 current_(kEndMarker),	4288 current_(kEndMarker),

4291 next_pos_(0),	4289 next_pos_(0),

4292 capture_count_(0),	4290 capture_count_(0),

4293 has_more_(true),	4291 has_more_(true),

4294 multiline_(multiline),	4292 multiline_(multiline),

	4293 unicode_(unicode),

4295 simple_(false),	4294 simple_(false),

4296 contains_anchor_(false),	4295 contains_anchor_(false),

4297 is_scanned_for_captures_(false),	4296 is_scanned_for_captures_(false),

4298 failed_(false) {	4297 failed_(false) {

4299 Advance();	4298 Advance();

4300 }	4299 }

4301	4300

4302	4301

4303 uc32 RegExpParser::Next() {	4302 uc32 RegExpParser::Next() {

4304 if (has_next()) {	4303 if (has_next()) {

(...skipping 36 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4341 next_pos_ += dist - 1;	4340 next_pos_ += dist - 1;

4342 Advance();	4341 Advance();

4343 }	4342 }

4344	4343

4345	4344

4346 bool RegExpParser::simple() {	4345 bool RegExpParser::simple() {

4347 return simple_;	4346 return simple_;

4348 }	4347 }

4349	4348

4350	4349

	4350 bool RegExpParser::IsSyntaxCharacter(uc32 c) {

	4351 return c == '^' \|\| c == '$' \|\| c == '\\' \|\| c == '.' \|\| c == '*' \|\|

	4352 c == '+' \|\| c == '?' \|\| c == '(' \|\| c == ')' \|\| c == '[' \|\| c == ']' \|\|

	4353 c == '{' \|\| c == '}' \|\| c == '\|';

	4354 }
	mathias 2015/01/08 12:29:07: Should `-` be a “syntax character” as well because of its special meaning within character classes (e.g. `/[a-b]/`)?
	marja 2015/01/08 13:42:18:
	On 2015/01/08 12:29:07, mathias wrote:
	> Should `-` be a “syntax character” as well because of its special meaning within character classes (e.g. `/[a-b]/`)?
	The spec (draft rev 30) says: "SyntaxCharacter :: one of ^ $ \ . * + ? ( ) [ ] { } \|". `-` loses its special meaning if it's the first or the last character in the character class, so [a-b-] matches a, b, and -. But idk, hard to say if it's an omission in the spec or intentional. I filed a bug to ask that: https://bugs.ecmascript.org/show_bug.cgi?id=3519 If they update the spec, I'll update the implementation.
	rossberg 2015/01/08 14:11:45:
	On 2015/01/08 13:42:18, marja wrote:
	> On 2015/01/08 12:29:07, mathias wrote:
	> > Should `-` be a “syntax character” as well because of its special meaning within character classes (e.g. `/[a-b]/`)?
	> The spec (draft rev 30) says: "SyntaxCharacter :: one of ^ $ \ . * + ? ( ) [ ] { } \|". `-` loses its special meaning if it's the first or the last character in the character class, so [a-b-] matches a, b, and -. But idk, hard to say if it's an omission in the spec or intentional. I filed a bug to ask that: https://bugs.ecmascript.org/show_bug.cgi?id=3519 If they update the spec, I'll update the implementation.
	It's not a SyntaxCharacter and that's likely intentional: it has no special status outside a character class, and inside you can put it first or last, so you don't need to escape it in either case.
	mathias 2015/01/21 07:16:54: /[\-]/u is now allowed: https://bugs.ecmascript.org/show_bug.cgi?id=3519#c1
	4355

	4356

4351 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {	4357 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {

4352 failed_ = true;	4358 failed_ = true;

4353 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();	4359 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();

4354 // Zip to the end to make sure the no more input is read.	4360 // Zip to the end to make sure the no more input is read.

4355 current_ = kEndMarker;	4361 current_ = kEndMarker;

4356 next_pos_ = in()->length();	4362 next_pos_ = in()->length();

4357 return NULL;	4363 return NULL;

4358 }	4364 }

4359	4365

4360	4366

(...skipping 196 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4557 if (capture == NULL) {	4563 if (capture == NULL) {

4558 builder->AddEmpty();	4564 builder->AddEmpty();

4559 break;	4565 break;

4560 }	4566 }

4561 RegExpTree* atom = new(zone()) RegExpBackReference(capture);	4567 RegExpTree* atom = new(zone()) RegExpBackReference(capture);

4562 builder->AddAtom(atom);	4568 builder->AddAtom(atom);

4563 break;	4569 break;

4564 }	4570 }

4565 uc32 first_digit = Next();	4571 uc32 first_digit = Next();

4566 if (first_digit == '8' \|\| first_digit == '9') {	4572 if (first_digit == '8' \|\| first_digit == '9') {

4567 // Treat as identity escape	4573 // If the 'u' flag is present, only syntax characters can be escaped,

4568 builder->AddCharacter(first_digit);	4574 // no other identity escapes are allowed. If the 'u' flag is not

4569 Advance(2);	4575 // present, all identity escapes are allowed.

	4576 if (!FLAG_harmony_unicode \|\| !unicode_) {

	4577 builder->AddCharacter(first_digit);

	4578 Advance(2);

	4579 } else {

	4580 return ReportError(CStrVector("Invalid escape"));

	4581 }

4570 break;	4582 break;

4571 }	4583 }

4572 }	4584 }

4573 // FALLTHROUGH	4585 // FALLTHROUGH

4574 case '0': {	4586 case '0': {

4575 Advance();	4587 Advance();

4576 uc32 octal = ParseOctalLiteral();	4588 uc32 octal = ParseOctalLiteral();

4577 builder->AddCharacter(octal);	4589 builder->AddCharacter(octal);

4578 break;	4590 break;

4579 }	4591 }

(...skipping 35 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4615 Advance(2);	4627 Advance(2);

4616 builder->AddCharacter(controlLetter & 0x1f);	4628 builder->AddCharacter(controlLetter & 0x1f);

4617 }	4629 }

4618 break;	4630 break;

4619 }	4631 }

4620 case 'x': {	4632 case 'x': {

4621 Advance(2);	4633 Advance(2);

4622 uc32 value;	4634 uc32 value;

4623 if (ParseHexEscape(2, &value)) {	4635 if (ParseHexEscape(2, &value)) {

4624 builder->AddCharacter(value);	4636 builder->AddCharacter(value);

	4637 } else if (!FLAG_harmony_unicode \|\| !unicode_) {

	4638 builder->AddCharacter('x');

4625 } else {	4639 } else {

4626 builder->AddCharacter('x');	4640 // If the 'u' flag is present, invalid escapes are not treated as

	4641 // identity escapes.

	4642 return ReportError(CStrVector("Invalid escape"));

4627 }	4643 }

4628 break;	4644 break;

4629 }	4645 }

4630 case 'u': {	4646 case 'u': {

4631 Advance(2);	4647 Advance(2);

4632 uc32 value;	4648 uc32 value;

4633 if (ParseHexEscape(4, &value)) {	4649 if (ParseUnicodeEscape(&value)) {

4634 builder->AddCharacter(value);	4650 builder->AddCharacter(value);

	4651 } else if (!FLAG_harmony_unicode \|\| !unicode_) {

	4652 builder->AddCharacter('u');

4635 } else {	4653 } else {

4636 builder->AddCharacter('u');	4654 // If the 'u' flag is present, invalid escapes are not treated as

	4655 // identity escapes.

	4656 return ReportError(CStrVector("Invalid unicode escape"));

4637 }	4657 }

4638 break;	4658 break;

4639 }	4659 }

4640 default:	4660 default:

4641 // Identity escape.	4661 Advance();

4642 builder->AddCharacter(Next());	4662 // If the 'u' flag is present, only syntax characters can be escaped, no

4643 Advance(2);	4663 // other identity escapes are allowed. If the 'u' flag is not present,

	4664 // all identity escapes are allowed.

	4665 if (!FLAG_harmony_unicode \|\| !unicode_ \|\|

	4666 IsSyntaxCharacter(current())) {

	4667 builder->AddCharacter(current());

	4668 Advance();

	4669 } else {

	4670 return ReportError(CStrVector("Invalid escape"));

	4671 }

4644 break;	4672 break;

4645 }	4673 }

4646 break;	4674 break;

4647 case '{': {	4675 case '{': {

4648 int dummy;	4676 int dummy;

4649 if (ParseIntervalQuantifier(&dummy, &dummy)) {	4677 if (ParseIntervalQuantifier(&dummy, &dummy)) {

4650 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);	4678 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

4651 }	4679 }

4652 // fallthrough	4680 // fallthrough

4653 }	4681 }

(...skipping 222 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4876 Advance();	4904 Advance();

4877 if (value < 32 && '0' <= current() && current() <= '7') {	4905 if (value < 32 && '0' <= current() && current() <= '7') {

4878 value = value * 8 + current() - '0';	4906 value = value * 8 + current() - '0';

4879 Advance();	4907 Advance();

4880 }	4908 }

4881 }	4909 }

4882 return value;	4910 return value;

4883 }	4911 }

4884	4912

4885	4913

4886 bool RegExpParser::ParseHexEscape(int length, uc32 *value) {	4914 bool RegExpParser::ParseHexEscape(int length, uc32* value) {

4887 int start = position();	4915 int start = position();

4888 uc32 val = 0;	4916 uc32 val = 0;

4889 bool done = false;	4917 for (int i = 0; i < length; ++i) {

4890 for (int i = 0; !done; i++) {

4891 uc32 c = current();	4918 uc32 c = current();

4892 int d = HexValue(c);	4919 int d = HexValue(c);

4893 if (d < 0) {	4920 if (d < 0) {

4894 Reset(start);	4921 Reset(start);

4895 return false;	4922 return false;

4896 }	4923 }

4897 val = val * 16 + d;	4924 val = val * 16 + d;

4898 Advance();	4925 Advance();

4899 if (i == length - 1) {

4900 done = true;

4901 }

4902 }	4926 }

4903 *value = val;	4927 *value = val;

4904 return true;	4928 return true;

4905 }	4929 }

4906	4930

4907	4931

	4932 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

	4933 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

	4934 // allowed). In the latter case, the number of hex digits between { } is

	4935 // arbitrary. \ and u have already been read.

	4936 if (current() == '{' && FLAG_harmony_unicode && unicode_) {

	4937 int start = position();

	4938 Advance();

	4939 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

	4940 if (current() == '}') {

	4941 Advance();

	4942 return true;

	4943 }

	4944 }

	4945 Reset(start);

	4946 return false;

	4947 }

	4948 // \u but no {, or \u{...} escapes not allowed.

	4949 return ParseHexEscape(4, value);

	4950 }

	4951

	4952

	4953 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {

	4954 uc32 x = 0;

	4955 int d = HexValue(current());

	4956 if (d < 0) {

	4957 return false;

	4958 }

	4959 while (d >= 0) {

	4960 x = x * 16 + d;

	4961 if (x > max_value) {

	4962 return false;

	4963 }

	4964 Advance();

	4965 d = HexValue(current());

	4966 }

	4967 *value = x;

	4968 return true;

	4969 }

	4970

	4971

4908 uc32 RegExpParser::ParseClassCharacterEscape() {	4972 uc32 RegExpParser::ParseClassCharacterEscape() {

4909 DCHECK(current() == '\\');	4973 DCHECK(current() == '\\');

4910 DCHECK(has_next() && !IsSpecialClassEscape(Next()));	4974 DCHECK(has_next() && !IsSpecialClassEscape(Next()));

4911 Advance();	4975 Advance();

4912 switch (current()) {	4976 switch (current()) {

4913 case 'b':	4977 case 'b':

4914 Advance();	4978 Advance();

4915 return '\b';	4979 return '\b';

4916 // ControlEscape :: one of	4980 // ControlEscape :: one of

4917 // f n r t v	4981 // f n r t v

(...skipping 34 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4952 // For compatibility, we interpret a decimal escape that isn't	5016 // For compatibility, we interpret a decimal escape that isn't

4953 // a back reference (and therefore either \0 or not valid according	5017 // a back reference (and therefore either \0 or not valid according

4954 // to the specification) as a 1..3 digit octal character code.	5018 // to the specification) as a 1..3 digit octal character code.

4955 return ParseOctalLiteral();	5019 return ParseOctalLiteral();

4956 case 'x': {	5020 case 'x': {

4957 Advance();	5021 Advance();

4958 uc32 value;	5022 uc32 value;

4959 if (ParseHexEscape(2, &value)) {	5023 if (ParseHexEscape(2, &value)) {

4960 return value;	5024 return value;

4961 }	5025 }

4962 // If \x is not followed by a two-digit hexadecimal, treat it	5026 if (!FLAG_harmony_unicode \|\| !unicode_) {

4963 // as an identity escape.	5027 // If \x is not followed by a two-digit hexadecimal, treat it

4964 return 'x';	5028 // as an identity escape.

	5029 return 'x';

	5030 }

	5031 // If the 'u' flag is present, invalid escapes are not treated as

	5032 // identity escapes.

	5033 ReportError(CStrVector("Invalid escape"));

	5034 return 0;

4965 }	5035 }

4966 case 'u': {	5036 case 'u': {

4967 Advance();	5037 Advance();

4968 uc32 value;	5038 uc32 value;

4969 if (ParseHexEscape(4, &value)) {	5039 if (ParseUnicodeEscape(&value)) {

4970 return value;	5040 return value;

4971 }	5041 }

4972 // If \u is not followed by a four-digit hexadecimal, treat it	5042 if (!FLAG_harmony_unicode \|\| !unicode_) {

4973 // as an identity escape.	5043 return 'u';

4974 return 'u';	5044 }

	5045 // If the 'u' flag is present, invalid escapes are not treated as

	5046 // identity escapes.

	5047 ReportError(CStrVector("Invalid unicode escape"));

	5048 return 0;

4975 }	5049 }

4976 default: {	5050 default: {

4977 // Extended identity escape. We accept any character that hasn't

4978 // been matched by a more specific case, not just the subset required

4979 // by the ECMAScript specification.

4980 uc32 result = current();	5051 uc32 result = current();

4981 Advance();	5052 // If the 'u' flag is present, only syntax characters can be escaped, no

4982 return result;	5053 // other identity escapes are allowed. If the 'u' flag is not present, all

	5054 // identity escapes are allowed.

	5055 if (!FLAG_harmony_unicode \|\| !unicode_ \|\| IsSyntaxCharacter(result)) {

	5056 Advance();

	5057 return result;

	5058 }

	5059 ReportError(CStrVector("Invalid escape"));

	5060 return 0;

4983 }	5061 }

4984 }	5062 }

4985 return 0;	5063 return 0;

4986 }	5064 }

4987	5065

4988	5066

4989 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {	5067 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

4990 DCHECK_EQ(0, *char_class);	5068 DCHECK_EQ(0, *char_class);

4991 uc32 first = current();	5069 uc32 first = current();

4992 if (first == '\\') {	5070 if (first == '\\') {

(...skipping 85 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5078 ranges->Add(CharacterRange::Everything(), zone());	5156 ranges->Add(CharacterRange::Everything(), zone());

5079 is_negated = !is_negated;	5157 is_negated = !is_negated;

5080 }	5158 }

5081 return new(zone()) RegExpCharacterClass(ranges, is_negated);	5159 return new(zone()) RegExpCharacterClass(ranges, is_negated);

5082 }	5160 }

5083	5161

5084	5162

5085 // ----------------------------------------------------------------------------	5163 // ----------------------------------------------------------------------------

5086 // The Parser interface.	5164 // The Parser interface.

5087	5165

5088 bool RegExpParser::ParseRegExp(FlatStringReader* input,	5166 bool RegExpParser::ParseRegExp(FlatStringReader* input, bool multiline,

5089 bool multiline,	5167 bool unicode, RegExpCompileData* result,

5090 RegExpCompileData* result,

5091 Zone* zone) {	5168 Zone* zone) {

5092 DCHECK(result != NULL);	5169 DCHECK(result != NULL);

5093 RegExpParser parser(input, &result->error, multiline, zone);	5170 RegExpParser parser(input, &result->error, multiline, unicode, zone);

5094 RegExpTree* tree = parser.ParsePattern();	5171 RegExpTree* tree = parser.ParsePattern();

5095 if (parser.failed()) {	5172 if (parser.failed()) {

5096 DCHECK(tree == NULL);	5173 DCHECK(tree == NULL);

5097 DCHECK(!result->error.is_null());	5174 DCHECK(!result->error.is_null());

5098 } else {	5175 } else {

5099 DCHECK(tree != NULL);	5176 DCHECK(tree != NULL);

5100 DCHECK(result->error.is_null());	5177 DCHECK(result->error.is_null());

5101 result->tree = tree;	5178 result->tree = tree;

5102 int capture_count = parser.captures_started();	5179 int capture_count = parser.captures_started();

5103 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;	5180 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

(...skipping 189 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5293 } else {	5370 } else {

5294 const uc16* data = reinterpret_cast<const uc16*>(raw_string->raw_data());	5371 const uc16* data = reinterpret_cast<const uc16*>(raw_string->raw_data());

5295 running_hash = StringHasher::ComputeRunningHash(running_hash, data,	5372 running_hash = StringHasher::ComputeRunningHash(running_hash, data,

5296 raw_string->length());	5373 raw_string->length());

5297 }	5374 }

5298 }	5375 }

5299	5376

5300 return running_hash;	5377 return running_hash;

5301 }	5378 }

5302 } } // namespace v8::internal	5379 } } // namespace v8::internal

OLD	NEW

« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')