src/regexp/regexp-parser.cc - Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/utils.h"	12 #include "src/utils.h"

13	13

14 namespace v8 {	14 namespace v8 {

15 namespace internal {	15 namespace internal {

16	16

17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,	17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)	18 bool multiline, bool unicode, Isolate* isolate,

	19 Zone* zone)

19 : isolate_(isolate),	20 : isolate_(isolate),

20 zone_(zone),	21 zone_(zone),

21 error_(error),	22 error_(error),

22 captures_(NULL),	23 captures_(NULL),

23 in_(in),	24 in_(in),

24 current_(kEndMarker),	25 current_(kEndMarker),

25 flags_(flags),

26 next_pos_(0),	26 next_pos_(0),

27 captures_started_(0),	27 captures_started_(0),

28 capture_count_(0),	28 capture_count_(0),

29 has_more_(true),	29 has_more_(true),

	30 multiline_(multiline),

	31 unicode_(unicode),

30 simple_(false),	32 simple_(false),

31 contains_anchor_(false),	33 contains_anchor_(false),

32 is_scanned_for_captures_(false),	34 is_scanned_for_captures_(false),

33 failed_(false) {	35 failed_(false) {

34 Advance();	36 Advance();

35 }	37 }

36	38

37	39

38 template <bool update_position>

39 uc32 RegExpParser::ReadNext() {

40 int position = next_pos_;

41 uc32 c0 = in()->Get(position);

42 position++;

43 // Read the whole surrogate pair in case of unicode flag, if possible.

44 if (unicode() && position < in()->length() &&

45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

46 uc16 c1 = in()->Get(position);

47 if (unibrow::Utf16::IsTrailSurrogate(c1)) {

48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

49 position++;

50 }

51 }

52 if (update_position) next_pos_ = position;

53 return c0;

54 }

55

56

57 uc32 RegExpParser::Next() {	40 uc32 RegExpParser::Next() {

58 if (has_next()) {	41 if (has_next()) {

59 return ReadNext<false>();	42 return in()->Get(next_pos_);

60 } else {	43 } else {

61 return kEndMarker;	44 return kEndMarker;

62 }	45 }

63 }	46 }

64	47

65	48

66 void RegExpParser::Advance() {	49 void RegExpParser::Advance() {

67 if (has_next()) {	50 if (next_pos_ < in()->length()) {

68 StackLimitCheck check(isolate());	51 StackLimitCheck check(isolate());

69 if (check.HasOverflowed()) {	52 if (check.HasOverflowed()) {

70 ReportError(CStrVector(Isolate::kStackOverflowMessage));	53 ReportError(CStrVector(Isolate::kStackOverflowMessage));

71 } else if (zone()->excess_allocation()) {	54 } else if (zone()->excess_allocation()) {

72 ReportError(CStrVector("Regular expression too large"));	55 ReportError(CStrVector("Regular expression too large"));

73 } else {	56 } else {

74 current_ = ReadNext<true>();	57 current_ = in()->Get(next_pos_);

	58 next_pos_++;

	59 // Read the whole surrogate pair in case of unicode flag, if possible.

	60 if (unicode_ && next_pos_ < in()->length() &&

	61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {

	62 uc16 trail = in()->Get(next_pos_);

	63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {

	64 current_ = unibrow::Utf16::CombineSurrogatePair(

	65 static_cast<uc16>(current_), trail);

	66 next_pos_++;

	67 }

	68 }

75 }	69 }

76 } else {	70 } else {

77 current_ = kEndMarker;	71 current_ = kEndMarker;

78 // Advance so that position() points to 1-after-the-last-character. This is	72 // Advance so that position() points to 1-after-the-last-character. This is

79 // important so that Reset() to this position works correctly.	73 // important so that Reset() to this position works correctly.

80 next_pos_ = in()->length() + 1;	74 next_pos_ = in()->length() + 1;

81 has_more_ = false;	75 has_more_ = false;

82 }	76 }

83 }	77 }

84	78

(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
141 // Alternative ::	135 // Alternative ::

142 // [empty]	136 // [empty]

143 // Term Alternative	137 // Term Alternative

144 // Term ::	138 // Term ::

145 // Assertion	139 // Assertion

146 // Atom	140 // Atom

147 // Atom Quantifier	141 // Atom Quantifier

148 RegExpTree* RegExpParser::ParseDisjunction() {	142 RegExpTree* RegExpParser::ParseDisjunction() {

149 // Used to store current state while parsing subexpressions.	143 // Used to store current state while parsing subexpressions.

150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,	144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

151 flags_, zone());	145 zone());

152 RegExpParserState* state = &initial_state;	146 RegExpParserState* state = &initial_state;

153 // Cache the builder in a local variable for quick access.	147 // Cache the builder in a local variable for quick access.

154 RegExpBuilder* builder = initial_state.builder();	148 RegExpBuilder* builder = initial_state.builder();

155 while (true) {	149 while (true) {

156 switch (current()) {	150 switch (current()) {

157 case kEndMarker:	151 case kEndMarker:

158 if (state->IsSubexpression()) {	152 if (state->IsSubexpression()) {

159 // Inside a parenthesized group when hitting end of input.	153 // Inside a parenthesized group when hitting end of input.

160 ReportError(CStrVector("Unterminated group") CHECK_FAILED);	154 ReportError(CStrVector("Unterminated group") CHECK_FAILED);

161 }	155 }

(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
205 Advance();	199 Advance();

206 builder->NewAlternative();	200 builder->NewAlternative();

207 continue;	201 continue;

208 }	202 }

209 case '*':	203 case '*':

210 case '+':	204 case '+':

211 case '?':	205 case '?':

212 return ReportError(CStrVector("Nothing to repeat"));	206 return ReportError(CStrVector("Nothing to repeat"));

213 case '^': {	207 case '^': {

214 Advance();	208 Advance();

215 if (multiline()) {	209 if (multiline_) {

216 builder->AddAssertion(	210 builder->AddAssertion(

217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));	211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

218 } else {	212 } else {

219 builder->AddAssertion(	213 builder->AddAssertion(

220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));	214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

221 set_contains_anchor();	215 set_contains_anchor();

222 }	216 }

223 continue;	217 continue;

224 }	218 }

225 case '$': {	219 case '$': {

226 Advance();	220 Advance();

227 RegExpAssertion::AssertionType assertion_type =	221 RegExpAssertion::AssertionType assertion_type =

228 multiline() ? RegExpAssertion::END_OF_LINE	222 multiline_ ? RegExpAssertion::END_OF_LINE

229 : RegExpAssertion::END_OF_INPUT;	223 : RegExpAssertion::END_OF_INPUT;

230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));	224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));

231 continue;	225 continue;

232 }	226 }

233 case '.': {	227 case '.': {

234 Advance();	228 Advance();

235 // everything except \x0a, \x0d, \u2028 and \u2029	229 // everything except \x0a, \x0d, \u2028 and \u2029

236 ZoneList<CharacterRange>* ranges =	230 ZoneList<CharacterRange>* ranges =

237 new (zone()) ZoneList<CharacterRange>(2, zone());	231 new (zone()) ZoneList<CharacterRange>(2, zone());

238 CharacterRange::AddClassEscape('.', ranges, zone());	232 CharacterRange::AddClassEscape('.', ranges, zone());

239 RegExpCharacterClass* cc =	233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

240 new (zone()) RegExpCharacterClass(ranges, false);	234 builder->AddAtom(atom);

241 builder->AddCharacterClass(cc);

242 break;	235 break;

243 }	236 }

244 case '(': {	237 case '(': {

245 SubexpressionType subexpr_type = CAPTURE;	238 SubexpressionType subexpr_type = CAPTURE;

246 RegExpLookaround::Type lookaround_type = state->lookaround_type();	239 RegExpLookaround::Type lookaround_type = state->lookaround_type();

247 Advance();	240 Advance();

248 if (current() == '?') {	241 if (current() == '?') {

249 switch (Next()) {	242 switch (Next()) {

250 case ':':	243 case ':':

251 subexpr_type = GROUPING;	244 subexpr_type = GROUPING;

(...skipping 24 matching lines...) Expand all Loading...
276 break;	269 break;

277 }	270 }

278 Advance(2);	271 Advance(2);

279 } else {	272 } else {

280 if (captures_started_ >= kMaxCaptures) {	273 if (captures_started_ >= kMaxCaptures) {

281 ReportError(CStrVector("Too many captures") CHECK_FAILED);	274 ReportError(CStrVector("Too many captures") CHECK_FAILED);

282 }	275 }

283 captures_started_++;	276 captures_started_++;

284 }	277 }

285 // Store current state and begin new disjunction parsing.	278 // Store current state and begin new disjunction parsing.

286 state =	279 state = new (zone()) RegExpParserState(

287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,	280 state, subexpr_type, lookaround_type, captures_started_, zone());

288 captures_started_, flags_, zone());

289 builder = state->builder();	281 builder = state->builder();

290 continue;	282 continue;

291 }	283 }

292 case '[': {	284 case '[': {

293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);	285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);

294 builder->AddCharacterClass(cc->AsCharacterClass());	286 builder->AddAtom(atom);

295 break;	287 break;

296 }	288 }

297 // Atom ::	289 // Atom ::

298 // \ AtomEscape	290 // \ AtomEscape

299 case '\\':	291 case '\\':

300 switch (Next()) {	292 switch (Next()) {

301 case kEndMarker:	293 case kEndMarker:

302 return ReportError(CStrVector("\\ at end of pattern"));	294 return ReportError(CStrVector("\\ at end of pattern"));

303 case 'b':	295 case 'b':

304 Advance(2);	296 Advance(2);

(...skipping 14 matching lines...) Expand all Loading...
319 case 'D':	311 case 'D':

320 case 's':	312 case 's':

321 case 'S':	313 case 'S':

322 case 'w':	314 case 'w':

323 case 'W': {	315 case 'W': {

324 uc32 c = Next();	316 uc32 c = Next();

325 Advance(2);	317 Advance(2);

326 ZoneList<CharacterRange>* ranges =	318 ZoneList<CharacterRange>* ranges =

327 new (zone()) ZoneList<CharacterRange>(2, zone());	319 new (zone()) ZoneList<CharacterRange>(2, zone());

328 CharacterRange::AddClassEscape(c, ranges, zone());	320 CharacterRange::AddClassEscape(c, ranges, zone());

329 RegExpCharacterClass* cc =	321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);

330 new (zone()) RegExpCharacterClass(ranges, false);	322 builder->AddAtom(atom);

331 builder->AddCharacterClass(cc);

332 break;	323 break;

333 }	324 }

334 case '1':	325 case '1':

335 case '2':	326 case '2':

336 case '3':	327 case '3':

337 case '4':	328 case '4':

338 case '5':	329 case '5':

339 case '6':	330 case '6':

340 case '7':	331 case '7':

341 case '8':	332 case '8':

(...skipping 13 matching lines...) Expand all Loading...
355 builder->AddAtom(atom);	346 builder->AddAtom(atom);

356 }	347 }

357 break;	348 break;

358 }	349 }

359 uc32 first_digit = Next();	350 uc32 first_digit = Next();

360 if (first_digit == '8' || first_digit == '9') {	351 if (first_digit == '8' || first_digit == '9') {

361 // If the 'u' flag is present, only syntax characters can be	352 // If the 'u' flag is present, only syntax characters can be

362 // escaped,	353 // escaped,

363 // no other identity escapes are allowed. If the 'u' flag is not	354 // no other identity escapes are allowed. If the 'u' flag is not

364 // present, all identity escapes are allowed.	355 // present, all identity escapes are allowed.

365 if (!unicode()) {	356 if (!unicode_) {

366 builder->AddCharacter(first_digit);	357 builder->AddCharacter(first_digit);

367 Advance(2);	358 Advance(2);

368 } else {	359 } else {

369 return ReportError(CStrVector("Invalid escape"));	360 return ReportError(CStrVector("Invalid escape"));

370 }	361 }

371 break;	362 break;

372 }	363 }

373 }	364 }

374 // FALLTHROUGH	365 // FALLTHROUGH

375 case '0': {	366 case '0': {

(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
416 Advance(2);	407 Advance(2);

417 builder->AddCharacter(controlLetter & 0x1f);	408 builder->AddCharacter(controlLetter & 0x1f);

418 }	409 }

419 break;	410 break;

420 }	411 }

421 case 'x': {	412 case 'x': {

422 Advance(2);	413 Advance(2);

423 uc32 value;	414 uc32 value;

424 if (ParseHexEscape(2, &value)) {	415 if (ParseHexEscape(2, &value)) {

425 builder->AddCharacter(value);	416 builder->AddCharacter(value);

426 } else if (!unicode()) {	417 } else if (!unicode_) {

427 builder->AddCharacter('x');	418 builder->AddCharacter('x');

428 } else {	419 } else {

429 // If the 'u' flag is present, invalid escapes are not treated as	420 // If the 'u' flag is present, invalid escapes are not treated as

430 // identity escapes.	421 // identity escapes.

431 return ReportError(CStrVector("Invalid escape"));	422 return ReportError(CStrVector("Invalid escape"));

432 }	423 }

433 break;	424 break;

434 }	425 }

435 case 'u': {	426 case 'u': {

436 Advance(2);	427 Advance(2);

437 uc32 value;	428 uc32 value;

438 if (ParseUnicodeEscape(&value)) {	429 if (ParseUnicodeEscape(&value)) {

439 builder->AddUnicodeCharacter(value);	430 builder->AddUnicodeCharacter(value);

440 } else if (!unicode()) {	431 } else if (!unicode_) {

441 builder->AddCharacter('u');	432 builder->AddCharacter('u');

442 } else {	433 } else {

443 // If the 'u' flag is present, invalid escapes are not treated as	434 // If the 'u' flag is present, invalid escapes are not treated as

444 // identity escapes.	435 // identity escapes.

445 return ReportError(CStrVector("Invalid unicode escape"));	436 return ReportError(CStrVector("Invalid unicode escape"));

446 }	437 }

447 break;	438 break;

448 }	439 }

449 default:	440 default:

450 Advance();	441 Advance();

451 // If the 'u' flag is present, only syntax characters can be	442 // If the 'u' flag is present, only syntax characters can be

452 // escaped, no	443 // escaped, no

453 // other identity escapes are allowed. If the 'u' flag is not	444 // other identity escapes are allowed. If the 'u' flag is not

454 // present,	445 // present,

455 // all identity escapes are allowed.	446 // all identity escapes are allowed.

456 if (!unicode() || IsSyntaxCharacter(current())) {	447 if (!unicode_ || IsSyntaxCharacter(current())) {

457 builder->AddCharacter(current());	448 builder->AddCharacter(current());

458 Advance();	449 Advance();

459 } else {	450 } else {

460 return ReportError(CStrVector("Invalid escape"));	451 return ReportError(CStrVector("Invalid escape"));

461 }	452 }

462 break;	453 break;

463 }	454 }

464 break;	455 break;

465 case '{': {	456 case '{': {

466 int dummy;	457 int dummy;

(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
747 }	738 }

748 *value = val;	739 *value = val;

749 return true;	740 return true;

750 }	741 }

751	742

752	743

753 bool RegExpParser::ParseUnicodeEscape(uc32* value) {	744 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are	745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

755 // allowed). In the latter case, the number of hex digits between { } is	746 // allowed). In the latter case, the number of hex digits between { } is

756 // arbitrary. \ and u have already been read.	747 // arbitrary. \ and u have already been read.

757 if (current() == '{' && unicode()) {	748 if (current() == '{' && unicode_) {

758 int start = position();	749 int start = position();

759 Advance();	750 Advance();

760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {	751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

761 if (current() == '}') {	752 if (current() == '}') {

762 Advance();	753 Advance();

763 return true;	754 return true;

764 }	755 }

765 }	756 }

766 Reset(start);	757 Reset(start);

767 return false;	758 return false;

(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
842 // For compatibility, we interpret a decimal escape that isn't	833 // For compatibility, we interpret a decimal escape that isn't

843 // a back reference (and therefore either \0 or not valid according	834 // a back reference (and therefore either \0 or not valid according

844 // to the specification) as a 1..3 digit octal character code.	835 // to the specification) as a 1..3 digit octal character code.

845 return ParseOctalLiteral();	836 return ParseOctalLiteral();

846 case 'x': {	837 case 'x': {

847 Advance();	838 Advance();

848 uc32 value;	839 uc32 value;

849 if (ParseHexEscape(2, &value)) {	840 if (ParseHexEscape(2, &value)) {

850 return value;	841 return value;

851 }	842 }

852 if (!unicode()) {	843 if (!unicode_) {

853 // If \x is not followed by a two-digit hexadecimal, treat it	844 // If \x is not followed by a two-digit hexadecimal, treat it

854 // as an identity escape.	845 // as an identity escape.

855 return 'x';	846 return 'x';

856 }	847 }

857 // If the 'u' flag is present, invalid escapes are not treated as	848 // If the 'u' flag is present, invalid escapes are not treated as

858 // identity escapes.	849 // identity escapes.

859 ReportError(CStrVector("Invalid escape"));	850 ReportError(CStrVector("Invalid escape"));

860 return 0;	851 return 0;

861 }	852 }

862 case 'u': {	853 case 'u': {

863 Advance();	854 Advance();

864 uc32 value;	855 uc32 value;

865 if (ParseUnicodeEscape(&value)) {	856 if (ParseUnicodeEscape(&value)) {

866 return value;	857 return value;

867 }	858 }

868 if (!unicode()) {	859 if (!unicode_) {

869 return 'u';	860 return 'u';

870 }	861 }

871 // If the 'u' flag is present, invalid escapes are not treated as	862 // If the 'u' flag is present, invalid escapes are not treated as

872 // identity escapes.	863 // identity escapes.

873 ReportError(CStrVector("Invalid unicode escape"));	864 ReportError(CStrVector("Invalid unicode escape"));

874 return 0;	865 return 0;

875 }	866 }

876 default: {	867 default: {

877 uc32 result = current();	868 uc32 result = current();

878 // If the 'u' flag is present, only syntax characters can be escaped, no	869 // If the 'u' flag is present, only syntax characters can be escaped, no

879 // other identity escapes are allowed. If the 'u' flag is not present, all	870 // other identity escapes are allowed. If the 'u' flag is not present, all

880 // identity escapes are allowed.	871 // identity escapes are allowed.

881 if (!unicode() || IsSyntaxCharacter(result)) {	872 if (!unicode_ || IsSyntaxCharacter(result)) {

882 Advance();	873 Advance();

883 return result;	874 return result;

884 }	875 }

885 ReportError(CStrVector("Invalid escape"));	876 ReportError(CStrVector("Invalid escape"));

886 return 0;	877 return 0;

887 }	878 }

888 }	879 }

889 return 0;	880 return 0;

890 }	881 }

891	882

892	883

893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {	884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

894 DCHECK_EQ(0, *char_class);	885 DCHECK_EQ(0, *char_class);

895 uc32 first = current();	886 uc32 first = current();

896 if (first == '\\') {	887 if (first == '\\') {

897 switch (Next()) {	888 switch (Next()) {

898 case 'w':	889 case 'w':

899 case 'W':	890 case 'W':

900 case 'd':	891 case 'd':

901 case 'D':	892 case 'D':

902 case 's':	893 case 's':

903 case 'S': {	894 case 'S': {

904 *char_class = Next();	895 *char_class = Next();

905 Advance(2);	896 Advance(2);

906 return CharacterRange::Singleton(0); // Return dummy value.	897 return CharacterRange::Singleton(0); // Return dummy value.

907 }	898 }

908 case kEndMarker:	899 case kEndMarker:

909 return ReportError(CStrVector("\\ at end of pattern"));	900 return ReportError(CStrVector("\\ at end of pattern"));

910 default:	901 default:

911 first = ParseClassCharacterEscape(CHECK_FAILED);	902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);

	903 return CharacterRange::Singleton(c);

912 }	904 }

913 } else {	905 } else {

914 Advance();	906 Advance();

	907 return CharacterRange::Singleton(first);

915 }	908 }

916

917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

918 // Combine with possibly following trail surrogate.

919 int start = position();

920 uc32 second = current();

921 if (second == '\\') {

922 second = ParseClassCharacterEscape(CHECK_FAILED);

923 } else {

924 Advance();

925 }

926 if (unibrow::Utf16::IsTrailSurrogate(second)) {

927 first = unibrow::Utf16::CombineSurrogatePair(first, second);

928 } else {

929 Reset(start);

930 }

931 }

932

933 return CharacterRange::Singleton(first);

934 }	909 }

935	910

936	911

937 static const uc16 kNoCharClass = 0;	912 static const uc16 kNoCharClass = 0;

938	913

939 // Adds range or pre-defined character class to character ranges.	914 // Adds range or pre-defined character class to character ranges.

940 // If char_class is not kNoCharClass, it's interpreted as a class	915 // If char_class is not kNoCharClass, it's interpreted as a class

941 // escape (i.e., 's' means whitespace, from '\s').	916 // escape (i.e., 's' means whitespace, from '\s').

942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

943 uc16 char_class, CharacterRange range,	918 uc16 char_class, CharacterRange range,

(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
1003 is_negated = !is_negated;	978 is_negated = !is_negated;

1004 }	979 }

1005 return new (zone()) RegExpCharacterClass(ranges, is_negated);	980 return new (zone()) RegExpCharacterClass(ranges, is_negated);

1006 }	981 }

1007	982

1008	983

1009 #undef CHECK_FAILED	984 #undef CHECK_FAILED

1010	985

1011	986

1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,	987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

1013 FlatStringReader* input, JSRegExp::Flags flags,	988 FlatStringReader* input, bool multiline,

1014 RegExpCompileData* result) {	989 bool unicode, RegExpCompileData* result) {

1015 DCHECK(result != NULL);	990 DCHECK(result != NULL);

1016 RegExpParser parser(input, &result->error, flags, isolate, zone);	991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);

1017 RegExpTree* tree = parser.ParsePattern();	992 RegExpTree* tree = parser.ParsePattern();

1018 if (parser.failed()) {	993 if (parser.failed()) {

1019 DCHECK(tree == NULL);	994 DCHECK(tree == NULL);

1020 DCHECK(!result->error.is_null());	995 DCHECK(!result->error.is_null());

1021 } else {	996 } else {

1022 DCHECK(tree != NULL);	997 DCHECK(tree != NULL);

1023 DCHECK(result->error.is_null());	998 DCHECK(result->error.is_null());

1024 if (FLAG_trace_regexp_parser) {	999 if (FLAG_trace_regexp_parser) {

1025 OFStream os(stdout);	1000 OFStream os(stdout);

1026 tree->Print(os, zone);	1001 tree->Print(os, zone);

1027 os << "\n";	1002 os << "\n";

1028 }	1003 }

1029 result->tree = tree;	1004 result->tree = tree;

1030 int capture_count = parser.captures_started();	1005 int capture_count = parser.captures_started();

1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;	1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

1032 result->contains_anchor = parser.contains_anchor();	1007 result->contains_anchor = parser.contains_anchor();

1033 result->capture_count = capture_count;	1008 result->capture_count = capture_count;

1034 }	1009 }

1035 return !parser.failed();	1010 return !parser.failed();

1036 }	1011 }

1037	1012

1038	1013

1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)	1014 RegExpBuilder::RegExpBuilder(Zone* zone)

1040 : zone_(zone),	1015 : zone_(zone),

1041 pending_empty_(false),	1016 pending_empty_(false),

1042 flags_(flags),

1043 characters_(NULL),	1017 characters_(NULL),

1044 pending_surrogate_(kNoPendingSurrogate),

1045 terms_(),	1018 terms_(),

1046 alternatives_()	1019 alternatives_()

1047 #ifdef DEBUG	1020 #ifdef DEBUG

1048 ,	1021 ,

1049 last_added_(ADD_NONE)	1022 last_added_(ADD_NONE)

1050 #endif	1023 #endif

1051 {	1024 {

1052 }	1025 }

1053	1026

1054	1027

1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {

1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

1057 FlushPendingSurrogate();

1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

1059 pending_surrogate_ = lead_surrogate;

1060 }

1061

1062

1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

1065 if (pending_surrogate_ != kNoPendingSurrogate) {

1066 uc16 lead_surrogate = pending_surrogate_;

1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

1068 ZoneList<uc16> surrogate_pair(2, zone());

1069 surrogate_pair.Add(lead_surrogate, zone());

1070 surrogate_pair.Add(trail_surrogate, zone());

1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

1072 pending_surrogate_ = kNoPendingSurrogate;

1073 AddAtom(atom);

1074 } else {

1075 pending_surrogate_ = trail_surrogate;

1076 FlushPendingSurrogate();

1077 }

1078 }

1079

1080

1081 void RegExpBuilder::FlushPendingSurrogate() {

1082 if (pending_surrogate_ != kNoPendingSurrogate) {

1083 // Use character class to desugar lone surrogate matching.

1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(

1085 CharacterRange::List(zone(),

1086 CharacterRange::Singleton(pending_surrogate_)),

1087 false);

1088 pending_surrogate_ = kNoPendingSurrogate;

1089 DCHECK(unicode());

1090 AddCharacterClass(cc);

1091 }

1092 }

1093

1094

1095 void RegExpBuilder::FlushCharacters() {	1028 void RegExpBuilder::FlushCharacters() {

1096 FlushPendingSurrogate();

1097 pending_empty_ = false;	1029 pending_empty_ = false;

1098 if (characters_ != NULL) {	1030 if (characters_ != NULL) {

1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());	1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

1100 characters_ = NULL;	1032 characters_ = NULL;

1101 text_.Add(atom, zone());	1033 text_.Add(atom, zone());

1102 LAST(ADD_ATOM);	1034 LAST(ADD_ATOM);

1103 }	1035 }

1104 }	1036 }

1105	1037

1106	1038

1107 void RegExpBuilder::FlushText() {	1039 void RegExpBuilder::FlushText() {

1108 FlushCharacters();	1040 FlushCharacters();

1109 int num_text = text_.length();	1041 int num_text = text_.length();

1110 if (num_text == 0) {	1042 if (num_text == 0) {

1111 return;	1043 return;

1112 } else if (num_text == 1) {	1044 } else if (num_text == 1) {

1113 terms_.Add(text_.last(), zone());	1045 terms_.Add(text_.last(), zone());

1114 } else {	1046 } else {

1115 RegExpText* text = new (zone()) RegExpText(zone());	1047 RegExpText* text = new (zone()) RegExpText(zone());

1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());	1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());

1117 terms_.Add(text, zone());	1049 terms_.Add(text, zone());

1118 }	1050 }

1119 text_.Clear();	1051 text_.Clear();

1120 }	1052 }

1121	1053

1122	1054

1123 void RegExpBuilder::AddCharacter(uc16 c) {	1055 void RegExpBuilder::AddCharacter(uc16 c) {

1124 FlushPendingSurrogate();

1125 pending_empty_ = false;	1056 pending_empty_ = false;

1126 if (characters_ == NULL) {	1057 if (characters_ == NULL) {

1127 characters_ = new (zone()) ZoneList<uc16>(4, zone());	1058 characters_ = new (zone()) ZoneList<uc16>(4, zone());

1128 }	1059 }

1129 characters_->Add(c, zone());	1060 characters_->Add(c, zone());

1130 LAST(ADD_CHAR);	1061 LAST(ADD_CHAR);

1131 }	1062 }

1132	1063

1133	1064

1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {	1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {	1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

1136 DCHECK(unicode());	1067 ZoneList<uc16> surrogate_pair(2, zone());

1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));	1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());

1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));	1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());

1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {	1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

1140 AddLeadSurrogate(c);	1071 AddAtom(atom);

1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

1142 AddTrailSurrogate(c);

1143 } else {	1072 } else {

1144 AddCharacter(static_cast<uc16>(c));	1073 AddCharacter(static_cast<uc16>(c));

1145 }	1074 }

1146 }	1075 }

1147	1076

1148	1077

1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1150	1079

1151	1080

1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {

1154 // In unicode mode, character class needs to be desugared, so it

1155 // must be a standalone term instead of being part of a RegExpText.

1156 AddTerm(cc);

1157 } else {

1158 AddAtom(cc);

1159 }

1160 }

1161

1162

1163 void RegExpBuilder::AddAtom(RegExpTree* term) {	1081 void RegExpBuilder::AddAtom(RegExpTree* term) {

1164 if (term->IsEmpty()) {	1082 if (term->IsEmpty()) {

1165 AddEmpty();	1083 AddEmpty();

1166 return;	1084 return;

1167 }	1085 }

1168 if (term->IsTextElement()) {	1086 if (term->IsTextElement()) {

1169 FlushCharacters();	1087 FlushCharacters();

1170 text_.Add(term, zone());	1088 text_.Add(term, zone());

1171 } else {	1089 } else {

1172 FlushText();	1090 FlushText();

1173 terms_.Add(term, zone());	1091 terms_.Add(term, zone());

1174 }	1092 }

1175 LAST(ADD_ATOM);	1093 LAST(ADD_ATOM);

1176 }	1094 }

1177	1095

1178

1179 void RegExpBuilder::AddTerm(RegExpTree* term) {

1180 FlushText();

1181 terms_.Add(term, zone());

1182 LAST(ADD_ATOM);

1183 }

1184

1185	1096

1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) {	1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

1187 FlushText();	1098 FlushText();

1188 terms_.Add(assert, zone());	1099 terms_.Add(assert, zone());

1189 LAST(ADD_ASSERT);	1100 LAST(ADD_ASSERT);

1190 }	1101 }

1191	1102

1192	1103

1193 void RegExpBuilder::NewAlternative() { FlushTerms(); }	1104 void RegExpBuilder::NewAlternative() { FlushTerms(); }

1194	1105

(...skipping 19 matching lines...) Expand all Loading...
1214 FlushTerms();	1125 FlushTerms();

1215 int num_alternatives = alternatives_.length();	1126 int num_alternatives = alternatives_.length();

1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty();	1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty();

1217 if (num_alternatives == 1) return alternatives_.last();	1128 if (num_alternatives == 1) return alternatives_.last();

1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));	1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));

1219 }	1130 }

1220	1131

1221	1132

1222 void RegExpBuilder::AddQuantifierToAtom(	1133 void RegExpBuilder::AddQuantifierToAtom(

1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {	1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

1224 FlushPendingSurrogate();

1225 if (pending_empty_) {	1135 if (pending_empty_) {

1226 pending_empty_ = false;	1136 pending_empty_ = false;

1227 return;	1137 return;

1228 }	1138 }

1229 RegExpTree* atom;	1139 RegExpTree* atom;

1230 if (characters_ != NULL) {	1140 if (characters_ != NULL) {

1231 DCHECK(last_added_ == ADD_CHAR);	1141 DCHECK(last_added_ == ADD_CHAR);

1232 // Last atom was character.	1142 // Last atom was character.

1233 Vector<const uc16> char_vector = characters_->ToConstVector();	1143 Vector<const uc16> char_vector = characters_->ToConstVector();

1234 int num_chars = char_vector.length();	1144 int num_chars = char_vector.length();

(...skipping 26 matching lines...) Expand all Loading...
1261 UNREACHABLE();	1171 UNREACHABLE();

1262 return;	1172 return;

1263 }	1173 }

1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1265 zone());	1175 zone());

1266 LAST(ADD_TERM);	1176 LAST(ADD_TERM);

1267 }	1177 }

1268	1178

1269 } // namespace internal	1179 } // namespace internal

1270 } // namespace v8	1180 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »