Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1926)

Side by Side Diff: src/parser.cc

Issue 788043005: ES6 unicode escapes, part 2: Regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: error reporting Created 5 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/api.h" 7 #include "src/api.h"
8 #include "src/ast.h" 8 #include "src/ast.h"
9 #include "src/bailout-reason.h" 9 #include "src/bailout-reason.h"
10 #include "src/base/platform/platform.h" 10 #include "src/base/platform/platform.h"
(...skipping 4260 matching lines...) Expand 10 before | Expand all | Expand 10 after
4271 } 4271 }
4272 isolate()->counters()->total_preparse_skipped()->Increment( 4272 isolate()->counters()->total_preparse_skipped()->Increment(
4273 total_preparse_skipped_); 4273 total_preparse_skipped_);
4274 } 4274 }
4275 4275
4276 4276
4277 // ---------------------------------------------------------------------------- 4277 // ----------------------------------------------------------------------------
4278 // Regular expressions 4278 // Regular expressions
4279 4279
4280 4280
4281 RegExpParser::RegExpParser(FlatStringReader* in, 4281 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
4282 Handle<String>* error, 4282 bool multiline, bool unicode, Zone* zone)
4283 bool multiline,
4284 Zone* zone)
4285 : isolate_(zone->isolate()), 4283 : isolate_(zone->isolate()),
4286 zone_(zone), 4284 zone_(zone),
4287 error_(error), 4285 error_(error),
4288 captures_(NULL), 4286 captures_(NULL),
4289 in_(in), 4287 in_(in),
4290 current_(kEndMarker), 4288 current_(kEndMarker),
4291 next_pos_(0), 4289 next_pos_(0),
4292 capture_count_(0), 4290 capture_count_(0),
4293 has_more_(true), 4291 has_more_(true),
4294 multiline_(multiline), 4292 multiline_(multiline),
4293 unicode_(unicode),
4295 simple_(false), 4294 simple_(false),
4296 contains_anchor_(false), 4295 contains_anchor_(false),
4297 is_scanned_for_captures_(false), 4296 is_scanned_for_captures_(false),
4298 failed_(false) { 4297 failed_(false) {
4299 Advance(); 4298 Advance();
4300 } 4299 }
4301 4300
4302 4301
4303 uc32 RegExpParser::Next() { 4302 uc32 RegExpParser::Next() {
4304 if (has_next()) { 4303 if (has_next()) {
(...skipping 36 matching lines...) Expand 10 before | Expand all | Expand 10 after
4341 next_pos_ += dist - 1; 4340 next_pos_ += dist - 1;
4342 Advance(); 4341 Advance();
4343 } 4342 }
4344 4343
4345 4344
4346 bool RegExpParser::simple() { 4345 bool RegExpParser::simple() {
4347 return simple_; 4346 return simple_;
4348 } 4347 }
4349 4348
4350 4349
4350 bool RegExpParser::IsSyntaxCharacter(uc32 c) {
4351 return c == '^' || c == '$' || c == '\\' || c == '.' || c == '*' ||
4352 c == '+' || c == '?' || c == '(' || c == ')' || c == '[' || c == ']' ||
4353 c == '{' || c == '}' || c == '|';
4354 }
mathias 2015/01/08 12:29:07 Should `-` be a “syntax character” as well because
marja 2015/01/08 13:42:18 The spec (draft rev 30) says: SyntaxCharacter ::
rossberg 2015/01/08 14:11:45 It's not a SyntaxCharacter and that's likely inten
mathias 2015/01/21 07:16:54 /[\-]/u is now allowed: https://bugs.ecmascript.or
4355
4356
4351 RegExpTree* RegExpParser::ReportError(Vector<const char> message) { 4357 RegExpTree* RegExpParser::ReportError(Vector<const char> message) {
4352 failed_ = true; 4358 failed_ = true;
4353 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked(); 4359 *error_ = isolate()->factory()->NewStringFromAscii(message).ToHandleChecked();
4354 // Zip to the end to make sure no more input is read. 4360 // Zip to the end to make sure no more input is read.
4355 current_ = kEndMarker; 4361 current_ = kEndMarker;
4356 next_pos_ = in()->length(); 4362 next_pos_ = in()->length();
4357 return NULL; 4363 return NULL;
4358 } 4364 }
4359 4365
4360 4366
(...skipping 196 matching lines...) Expand 10 before | Expand all | Expand 10 after
4557 if (capture == NULL) { 4563 if (capture == NULL) {
4558 builder->AddEmpty(); 4564 builder->AddEmpty();
4559 break; 4565 break;
4560 } 4566 }
4561 RegExpTree* atom = new(zone()) RegExpBackReference(capture); 4567 RegExpTree* atom = new(zone()) RegExpBackReference(capture);
4562 builder->AddAtom(atom); 4568 builder->AddAtom(atom);
4563 break; 4569 break;
4564 } 4570 }
4565 uc32 first_digit = Next(); 4571 uc32 first_digit = Next();
4566 if (first_digit == '8' || first_digit == '9') { 4572 if (first_digit == '8' || first_digit == '9') {
4567 // Treat as identity escape 4573 // If the 'u' flag is present, only syntax characters can be escaped,
4568 builder->AddCharacter(first_digit); 4574 // no other identity escapes are allowed. If the 'u' flag is not
4569 Advance(2); 4575 // present, all identity escapes are allowed.
4576 if (!FLAG_harmony_unicode || !unicode_) {
4577 builder->AddCharacter(first_digit);
4578 Advance(2);
4579 } else {
4580 return ReportError(CStrVector("Invalid escape"));
4581 }
4570 break; 4582 break;
4571 } 4583 }
4572 } 4584 }
4573 // FALLTHROUGH 4585 // FALLTHROUGH
4574 case '0': { 4586 case '0': {
4575 Advance(); 4587 Advance();
4576 uc32 octal = ParseOctalLiteral(); 4588 uc32 octal = ParseOctalLiteral();
4577 builder->AddCharacter(octal); 4589 builder->AddCharacter(octal);
4578 break; 4590 break;
4579 } 4591 }
(...skipping 35 matching lines...) Expand 10 before | Expand all | Expand 10 after
4615 Advance(2); 4627 Advance(2);
4616 builder->AddCharacter(controlLetter & 0x1f); 4628 builder->AddCharacter(controlLetter & 0x1f);
4617 } 4629 }
4618 break; 4630 break;
4619 } 4631 }
4620 case 'x': { 4632 case 'x': {
4621 Advance(2); 4633 Advance(2);
4622 uc32 value; 4634 uc32 value;
4623 if (ParseHexEscape(2, &value)) { 4635 if (ParseHexEscape(2, &value)) {
4624 builder->AddCharacter(value); 4636 builder->AddCharacter(value);
4637 } else if (!FLAG_harmony_unicode || !unicode_) {
4638 builder->AddCharacter('x');
4625 } else { 4639 } else {
4626 builder->AddCharacter('x'); 4640 // If the 'u' flag is present, invalid escapes are not treated as
4641 // identity escapes.
4642 return ReportError(CStrVector("Invalid escape"));
4627 } 4643 }
4628 break; 4644 break;
4629 } 4645 }
4630 case 'u': { 4646 case 'u': {
4631 Advance(2); 4647 Advance(2);
4632 uc32 value; 4648 uc32 value;
4633 if (ParseHexEscape(4, &value)) { 4649 if (ParseUnicodeEscape(&value)) {
4634 builder->AddCharacter(value); 4650 builder->AddCharacter(value);
4651 } else if (!FLAG_harmony_unicode || !unicode_) {
4652 builder->AddCharacter('u');
4635 } else { 4653 } else {
4636 builder->AddCharacter('u'); 4654 // If the 'u' flag is present, invalid escapes are not treated as
4655 // identity escapes.
4656 return ReportError(CStrVector("Invalid unicode escape"));
4637 } 4657 }
4638 break; 4658 break;
4639 } 4659 }
4640 default: 4660 default:
4641 // Identity escape. 4661 Advance();
4642 builder->AddCharacter(Next()); 4662 // If the 'u' flag is present, only syntax characters can be escaped, no
4643 Advance(2); 4663 // other identity escapes are allowed. If the 'u' flag is not present,
4664 // all identity escapes are allowed.
4665 if (!FLAG_harmony_unicode || !unicode_ ||
4666 IsSyntaxCharacter(current())) {
4667 builder->AddCharacter(current());
4668 Advance();
4669 } else {
4670 return ReportError(CStrVector("Invalid escape"));
4671 }
4644 break; 4672 break;
4645 } 4673 }
4646 break; 4674 break;
4647 case '{': { 4675 case '{': {
4648 int dummy; 4676 int dummy;
4649 if (ParseIntervalQuantifier(&dummy, &dummy)) { 4677 if (ParseIntervalQuantifier(&dummy, &dummy)) {
4650 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED); 4678 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);
4651 } 4679 }
4652 // fallthrough 4680 // fallthrough
4653 } 4681 }
(...skipping 222 matching lines...) Expand 10 before | Expand all | Expand 10 after
4876 Advance(); 4904 Advance();
4877 if (value < 32 && '0' <= current() && current() <= '7') { 4905 if (value < 32 && '0' <= current() && current() <= '7') {
4878 value = value * 8 + current() - '0'; 4906 value = value * 8 + current() - '0';
4879 Advance(); 4907 Advance();
4880 } 4908 }
4881 } 4909 }
4882 return value; 4910 return value;
4883 } 4911 }
4884 4912
4885 4913
4886 bool RegExpParser::ParseHexEscape(int length, uc32 *value) { 4914 bool RegExpParser::ParseHexEscape(int length, uc32* value) {
4887 int start = position(); 4915 int start = position();
4888 uc32 val = 0; 4916 uc32 val = 0;
4889 bool done = false; 4917 for (int i = 0; i < length; ++i) {
4890 for (int i = 0; !done; i++) {
4891 uc32 c = current(); 4918 uc32 c = current();
4892 int d = HexValue(c); 4919 int d = HexValue(c);
4893 if (d < 0) { 4920 if (d < 0) {
4894 Reset(start); 4921 Reset(start);
4895 return false; 4922 return false;
4896 } 4923 }
4897 val = val * 16 + d; 4924 val = val * 16 + d;
4898 Advance(); 4925 Advance();
4899 if (i == length - 1) {
4900 done = true;
4901 }
4902 } 4926 }
4903 *value = val; 4927 *value = val;
4904 return true; 4928 return true;
4905 } 4929 }
4906 4930
4907 4931
4932 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
4933 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
4934 // allowed). In the latter case, the number of hex digits between { } is
4935 // arbitrary. \ and u have already been read.
4936 if (current() == '{' && FLAG_harmony_unicode && unicode_) {
4937 int start = position();
4938 Advance();
4939 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
4940 if (current() == '}') {
4941 Advance();
4942 return true;
4943 }
4944 }
4945 Reset(start);
4946 return false;
4947 }
4948 // \u but no {, or \u{...} escapes not allowed.
4949 return ParseHexEscape(4, value);
4950 }
4951
4952
4953 bool RegExpParser::ParseUnlimitedLengthHexNumber(int max_value, uc32* value) {
4954 uc32 x = 0;
4955 int d = HexValue(current());
4956 if (d < 0) {
4957 return false;
4958 }
4959 while (d >= 0) {
4960 x = x * 16 + d;
4961 if (x > max_value) {
4962 return false;
4963 }
4964 Advance();
4965 d = HexValue(current());
4966 }
4967 *value = x;
4968 return true;
4969 }
4970
4971
4908 uc32 RegExpParser::ParseClassCharacterEscape() { 4972 uc32 RegExpParser::ParseClassCharacterEscape() {
4909 DCHECK(current() == '\\'); 4973 DCHECK(current() == '\\');
4910 DCHECK(has_next() && !IsSpecialClassEscape(Next())); 4974 DCHECK(has_next() && !IsSpecialClassEscape(Next()));
4911 Advance(); 4975 Advance();
4912 switch (current()) { 4976 switch (current()) {
4913 case 'b': 4977 case 'b':
4914 Advance(); 4978 Advance();
4915 return '\b'; 4979 return '\b';
4916 // ControlEscape :: one of 4980 // ControlEscape :: one of
4917 // f n r t v 4981 // f n r t v
(...skipping 34 matching lines...) Expand 10 before | Expand all | Expand 10 after
4952 // For compatibility, we interpret a decimal escape that isn't 5016 // For compatibility, we interpret a decimal escape that isn't
4953 // a back reference (and therefore either \0 or not valid according 5017 // a back reference (and therefore either \0 or not valid according
4954 // to the specification) as a 1..3 digit octal character code. 5018 // to the specification) as a 1..3 digit octal character code.
4955 return ParseOctalLiteral(); 5019 return ParseOctalLiteral();
4956 case 'x': { 5020 case 'x': {
4957 Advance(); 5021 Advance();
4958 uc32 value; 5022 uc32 value;
4959 if (ParseHexEscape(2, &value)) { 5023 if (ParseHexEscape(2, &value)) {
4960 return value; 5024 return value;
4961 } 5025 }
4962 // If \x is not followed by a two-digit hexadecimal, treat it 5026 if (!FLAG_harmony_unicode || !unicode_) {
4963 // as an identity escape. 5027 // If \x is not followed by a two-digit hexadecimal, treat it
4964 return 'x'; 5028 // as an identity escape.
5029 return 'x';
5030 }
5031 // If the 'u' flag is present, invalid escapes are not treated as
5032 // identity escapes.
5033 ReportError(CStrVector("Invalid escape"));
5034 return 0;
4965 } 5035 }
4966 case 'u': { 5036 case 'u': {
4967 Advance(); 5037 Advance();
4968 uc32 value; 5038 uc32 value;
4969 if (ParseHexEscape(4, &value)) { 5039 if (ParseUnicodeEscape(&value)) {
4970 return value; 5040 return value;
4971 } 5041 }
4972 // If \u is not followed by a four-digit hexadecimal, treat it 5042 if (!FLAG_harmony_unicode || !unicode_) {
4973 // as an identity escape. 5043 return 'u';
4974 return 'u'; 5044 }
5045 // If the 'u' flag is present, invalid escapes are not treated as
5046 // identity escapes.
5047 ReportError(CStrVector("Invalid unicode escape"));
5048 return 0;
4975 } 5049 }
4976 default: { 5050 default: {
4977 // Extended identity escape. We accept any character that hasn't
4978 // been matched by a more specific case, not just the subset required
4979 // by the ECMAScript specification.
4980 uc32 result = current(); 5051 uc32 result = current();
4981 Advance(); 5052 // If the 'u' flag is present, only syntax characters can be escaped, no
4982 return result; 5053 // other identity escapes are allowed. If the 'u' flag is not present, all
5054 // identity escapes are allowed.
5055 if (!FLAG_harmony_unicode || !unicode_ || IsSyntaxCharacter(result)) {
5056 Advance();
5057 return result;
5058 }
5059 ReportError(CStrVector("Invalid escape"));
5060 return 0;
4983 } 5061 }
4984 } 5062 }
4985 return 0; 5063 return 0;
4986 } 5064 }
4987 5065
4988 5066
4989 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 5067 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
4990 DCHECK_EQ(0, *char_class); 5068 DCHECK_EQ(0, *char_class);
4991 uc32 first = current(); 5069 uc32 first = current();
4992 if (first == '\\') { 5070 if (first == '\\') {
(...skipping 85 matching lines...) Expand 10 before | Expand all | Expand 10 after
5078 ranges->Add(CharacterRange::Everything(), zone()); 5156 ranges->Add(CharacterRange::Everything(), zone());
5079 is_negated = !is_negated; 5157 is_negated = !is_negated;
5080 } 5158 }
5081 return new(zone()) RegExpCharacterClass(ranges, is_negated); 5159 return new(zone()) RegExpCharacterClass(ranges, is_negated);
5082 } 5160 }
5083 5161
5084 5162
5085 // ---------------------------------------------------------------------------- 5163 // ----------------------------------------------------------------------------
5086 // The Parser interface. 5164 // The Parser interface.
5087 5165
5088 bool RegExpParser::ParseRegExp(FlatStringReader* input, 5166 bool RegExpParser::ParseRegExp(FlatStringReader* input, bool multiline,
5089 bool multiline, 5167 bool unicode, RegExpCompileData* result,
5090 RegExpCompileData* result,
5091 Zone* zone) { 5168 Zone* zone) {
5092 DCHECK(result != NULL); 5169 DCHECK(result != NULL);
5093 RegExpParser parser(input, &result->error, multiline, zone); 5170 RegExpParser parser(input, &result->error, multiline, unicode, zone);
5094 RegExpTree* tree = parser.ParsePattern(); 5171 RegExpTree* tree = parser.ParsePattern();
5095 if (parser.failed()) { 5172 if (parser.failed()) {
5096 DCHECK(tree == NULL); 5173 DCHECK(tree == NULL);
5097 DCHECK(!result->error.is_null()); 5174 DCHECK(!result->error.is_null());
5098 } else { 5175 } else {
5099 DCHECK(tree != NULL); 5176 DCHECK(tree != NULL);
5100 DCHECK(result->error.is_null()); 5177 DCHECK(result->error.is_null());
5101 result->tree = tree; 5178 result->tree = tree;
5102 int capture_count = parser.captures_started(); 5179 int capture_count = parser.captures_started();
5103 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 5180 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
(...skipping 189 matching lines...) Expand 10 before | Expand all | Expand 10 after
5293 } else { 5370 } else {
5294 const uc16* data = reinterpret_cast<const uc16*>(raw_string->raw_data()); 5371 const uc16* data = reinterpret_cast<const uc16*>(raw_string->raw_data());
5295 running_hash = StringHasher::ComputeRunningHash(running_hash, data, 5372 running_hash = StringHasher::ComputeRunningHash(running_hash, data,
5296 raw_string->length()); 5373 raw_string->length());
5297 } 5374 }
5298 } 5375 }
5299 5376
5300 return running_hash; 5377 return running_hash;
5301 } 5378 }
5302 } } // namespace v8::internal 5379 } } // namespace v8::internal
OLDNEW
« no previous file with comments | « src/parser.h ('k') | src/regexp.js » ('j') | src/regexp.js » ('J')

Powered by Google App Engine
This is Rietveld 408576698