src/regexp/regexp-parser.cc - Issue 1571563003: [regexp] quantifier refers to the surrogate pair in unicode regexp.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1571563003: [regexp] quantifier refers to the surrogate pair in unicode regexp. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicoderegexpatom

Patch Set: "add parse tests" (created 4 years, 11 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
49 void RegExpParser::Advance() {	49 void RegExpParser::Advance() {

50 if (next_pos_ < in()->length()) {	50 if (next_pos_ < in()->length()) {

51 StackLimitCheck check(isolate());	51 StackLimitCheck check(isolate());

52 if (check.HasOverflowed()) {	52 if (check.HasOverflowed()) {

53 ReportError(CStrVector(Isolate::kStackOverflowMessage));	53 ReportError(CStrVector(Isolate::kStackOverflowMessage));

54 } else if (zone()->excess_allocation()) {	54 } else if (zone()->excess_allocation()) {

55 ReportError(CStrVector("Regular expression too large"));	55 ReportError(CStrVector("Regular expression too large"));

56 } else {	56 } else {

57 current_ = in()->Get(next_pos_);	57 current_ = in()->Get(next_pos_);

58 next_pos_++;	58 next_pos_++;

	59 // Read the whole surrogate pair in case of unicode flag, if possible.

	60 if (unicode_ && next_pos_ < in()->length() &&

	61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {

	62 uc16 trail = in()->Get(next_pos_);

	63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {

	64 current_ = unibrow::Utf16::CombineSurrogatePair(

	65 static_cast<uc16>(current_), trail);

	66 next_pos_++;

	67 }

	68 }

59 }	69 }

60 } else {	70 } else {

61 current_ = kEndMarker;	71 current_ = kEndMarker;

62 // Advance so that position() points to 1-after-the-last-character. This is	72 // Advance so that position() points to 1-after-the-last-character. This is

63 // important so that Reset() to this position works correctly.	73 // important so that Reset() to this position works correctly.

64 next_pos_ = in()->length() + 1;	74 next_pos_ = in()->length() + 1;

65 has_more_ = false;	75 has_more_ = false;

66 }	76 }

67 }	77 }

68	78

(...skipping 341 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
410 // If the 'u' flag is present, invalid escapes are not treated as	420 // If the 'u' flag is present, invalid escapes are not treated as

411 // identity escapes.	421 // identity escapes.

412 return ReportError(CStrVector("Invalid escape"));	422 return ReportError(CStrVector("Invalid escape"));

413 }	423 }

414 break;	424 break;

415 }	425 }

416 case 'u': {	426 case 'u': {

417 Advance(2);	427 Advance(2);

418 uc32 value;	428 uc32 value;

419 if (ParseUnicodeEscape(&value)) {	429 if (ParseUnicodeEscape(&value)) {

420 if (value > unibrow::Utf16::kMaxNonSurrogateCharCode) {	430 builder->AddUnicodeCharacter(value);

421 builder->AddCharacter(unibrow::Utf16::LeadSurrogate(value));

422 builder->AddCharacter(unibrow::Utf16::TrailSurrogate(value));

423 } else {

424 builder->AddCharacter(static_cast<uc16>(value));

425 }

426 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {	431 } else if (!FLAG_harmony_unicode_regexps || !unicode_) {

427 builder->AddCharacter('u');	432 builder->AddCharacter('u');

428 } else {	433 } else {

429 // If the 'u' flag is present, invalid escapes are not treated as	434 // If the 'u' flag is present, invalid escapes are not treated as

430 // identity escapes.	435 // identity escapes.

431 return ReportError(CStrVector("Invalid unicode escape"));	436 return ReportError(CStrVector("Invalid unicode escape"));

432 }	437 }

433 break;	438 break;

434 }	439 }

435 default:	440 default:

(...skipping 14 matching lines...) Expand all Loading...
450 }	455 }

451 break;	456 break;

452 case '{': {	457 case '{': {

453 int dummy;	458 int dummy;

454 if (ParseIntervalQuantifier(&dummy, &dummy)) {	459 if (ParseIntervalQuantifier(&dummy, &dummy)) {

455 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);	460 ReportError(CStrVector("Nothing to repeat") CHECK_FAILED);

456 }	461 }

457 // fallthrough	462 // fallthrough

458 }	463 }

459 default:	464 default:

460 builder->AddCharacter(current());	465 if (unicode_) {
	rossberg (2016/01/11 12:25:41): Nit: is this if necessary? Can't you always use AddUnicode? Yang (2016/01/11 14:41:44), replying to rossberg: Good point. Done.
	466 builder->AddUnicodeCharacter(current());

	467 } else {

	468 builder->AddCharacter(current());

	469 }

461 Advance();	470 Advance();

462 break;	471 break;

463 } // end switch(current())	472 } // end switch(current())

464	473

465 int min;	474 int min;

466 int max;	475 int max;

467 switch (current()) {	476 switch (current()) {

468 // QuantifierPrefix ::	477 // QuantifierPrefix ::

469 // *	478 // *

470 // +	479 // +

(...skipping 581 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1052 void RegExpBuilder::AddCharacter(uc16 c) {	1061 void RegExpBuilder::AddCharacter(uc16 c) {

1053 pending_empty_ = false;	1062 pending_empty_ = false;

1054 if (characters_ == NULL) {	1063 if (characters_ == NULL) {

1055 characters_ = new (zone()) ZoneList<uc16>(4, zone());	1064 characters_ = new (zone()) ZoneList<uc16>(4, zone());

1056 }	1065 }

1057 characters_->Add(c, zone());	1066 characters_->Add(c, zone());

1058 LAST(ADD_CHAR);	1067 LAST(ADD_CHAR);

1059 }	1068 }

1060	1069

1061	1070

	1071 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

	1072 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

	1073 ZoneList<uc16> surrogate_pair(2, zone());

	1074 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());

	1075 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());

	1076 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

	1077 AddAtom(atom);

	1078 } else {

	1079 AddCharacter(static_cast<uc16>(c));

	1080 }

	1081 }

	1082

	1083

1062 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1084 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1063	1085

1064	1086

1065 void RegExpBuilder::AddAtom(RegExpTree* term) {	1087 void RegExpBuilder::AddAtom(RegExpTree* term) {

1066 if (term->IsEmpty()) {	1088 if (term->IsEmpty()) {

1067 AddEmpty();	1089 AddEmpty();

1068 return;	1090 return;

1069 }	1091 }

1070 if (term->IsTextElement()) {	1092 if (term->IsTextElement()) {

1071 FlushCharacters();	1093 FlushCharacters();

(...skipping 83 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1155 UNREACHABLE();	1177 UNREACHABLE();

1156 return;	1178 return;

1157 }	1179 }

1158 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1180 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1159 zone());	1181 zone());

1160 LAST(ADD_TERM);	1182 LAST(ADD_TERM);

1161 }	1183 }

1162	1184

1163 } // namespace internal	1185 } // namespace internal

1164 } // namespace v8	1186 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »