src/regexp/regexp-parser.cc - Issue 1578253005: [regexp] implement character classes for unicode regexps.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: more tests Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/utils.h"	12 #include "src/utils.h"

13	13

14 namespace v8 {	14 namespace v8 {

15 namespace internal {	15 namespace internal {

16	16

17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,	17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

18 bool multiline, bool unicode, Isolate* isolate,	18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)

19 Zone* zone)

20 : isolate_(isolate),	19 : isolate_(isolate),

21 zone_(zone),	20 zone_(zone),

22 error_(error),	21 error_(error),

23 captures_(NULL),	22 captures_(NULL),

24 in_(in),	23 in_(in),

25 current_(kEndMarker),	24 current_(kEndMarker),

	25 flags_(flags),

26 next_pos_(0),	26 next_pos_(0),

27 captures_started_(0),	27 captures_started_(0),

28 capture_count_(0),	28 capture_count_(0),

29 has_more_(true),	29 has_more_(true),

30 multiline_(multiline),

31 unicode_(unicode),

32 simple_(false),	30 simple_(false),

33 contains_anchor_(false),	31 contains_anchor_(false),

34 is_scanned_for_captures_(false),	32 is_scanned_for_captures_(false),

35 failed_(false) {	33 failed_(false) {

36 Advance();	34 Advance();

37 }	35 }

38	36

39	37

	38 template <bool update_position>

	39 uc32 RegExpParser::ReadNext() {

	40 int position = next_pos_;

	41 uc32 c0 = in()->Get(position);

	42 position++;

	43 // Read the whole surrogate pair in case of unicode flag, if possible.

	44 if (unicode() && position < in()->length() &&

	45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {

	46 uc16 c1 = in()->Get(position);

	47 if (unibrow::Utf16::IsTrailSurrogate(c1)) {

	48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);

	49 position++;

	50 }

	51 }

	52 if (update_position) next_pos_ = position;

	53 return c0;

	54 }

	55

	56

40 uc32 RegExpParser::Next() {	57 uc32 RegExpParser::Next() {

41 if (has_next()) {	58 if (has_next()) {

42 return in()->Get(next_pos_);	59 return ReadNext<false>();

43 } else {	60 } else {

44 return kEndMarker;	61 return kEndMarker;

45 }	62 }

46 }	63 }

47	64

48	65

49 void RegExpParser::Advance() {	66 void RegExpParser::Advance() {

50 if (next_pos_ < in()->length()) {	67 if (has_next()) {

51 StackLimitCheck check(isolate());	68 StackLimitCheck check(isolate());

52 if (check.HasOverflowed()) {	69 if (check.HasOverflowed()) {

53 ReportError(CStrVector(Isolate::kStackOverflowMessage));	70 ReportError(CStrVector(Isolate::kStackOverflowMessage));

54 } else if (zone()->excess_allocation()) {	71 } else if (zone()->excess_allocation()) {

55 ReportError(CStrVector("Regular expression too large"));	72 ReportError(CStrVector("Regular expression too large"));

56 } else {	73 } else {

57 current_ = in()->Get(next_pos_);	74 current_ = ReadNext<true>();

58 next_pos_++;

59 // Read the whole surrogate pair in case of unicode flag, if possible.

60 if (unicode_ && next_pos_ < in()->length() &&

61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {

62 uc16 trail = in()->Get(next_pos_);

63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {

64 current_ = unibrow::Utf16::CombineSurrogatePair(

65 static_cast<uc16>(current_), trail);

66 next_pos_++;

67 }

68 }

69 }	75 }

70 } else {	76 } else {

71 current_ = kEndMarker;	77 current_ = kEndMarker;

72 // Advance so that position() points to 1-after-the-last-character. This is	78 // Advance so that position() points to 1-after-the-last-character. This is

73 // important so that Reset() to this position works correctly.	79 // important so that Reset() to this position works correctly.

74 next_pos_ = in()->length() + 1;	80 next_pos_ = in()->length() + 1;

75 has_more_ = false;	81 has_more_ = false;

76 }	82 }

77 }	83 }

78	84

(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
135 // Alternative ::	141 // Alternative ::

136 // [empty]	142 // [empty]

137 // Term Alternative	143 // Term Alternative

138 // Term ::	144 // Term ::

139 // Assertion	145 // Assertion

140 // Atom	146 // Atom

141 // Atom Quantifier	147 // Atom Quantifier

142 RegExpTree* RegExpParser::ParseDisjunction() {	148 RegExpTree* RegExpParser::ParseDisjunction() {

143 // Used to store current state while parsing subexpressions.	149 // Used to store current state while parsing subexpressions.

144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,	150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,

145 zone());	151 flags_, zone());

146 RegExpParserState* state = &initial_state;	152 RegExpParserState* state = &initial_state;

147 // Cache the builder in a local variable for quick access.	153 // Cache the builder in a local variable for quick access.

148 RegExpBuilder* builder = initial_state.builder();	154 RegExpBuilder* builder = initial_state.builder();

149 while (true) {	155 while (true) {

150 switch (current()) {	156 switch (current()) {

151 case kEndMarker:	157 case kEndMarker:

152 if (state->IsSubexpression()) {	158 if (state->IsSubexpression()) {

153 // Inside a parenthesized group when hitting end of input.	159 // Inside a parenthesized group when hitting end of input.

154 ReportError(CStrVector("Unterminated group") CHECK_FAILED);	160 ReportError(CStrVector("Unterminated group") CHECK_FAILED);

155 }	161 }

(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
199 Advance();	205 Advance();

200 builder->NewAlternative();	206 builder->NewAlternative();

201 continue;	207 continue;

202 }	208 }

203 case '*':	209 case '*':

204 case '+':	210 case '+':

205 case '?':	211 case '?':

206 return ReportError(CStrVector("Nothing to repeat"));	212 return ReportError(CStrVector("Nothing to repeat"));

207 case '^': {	213 case '^': {

208 Advance();	214 Advance();

209 if (multiline_) {	215 if (multiline()) {

210 builder->AddAssertion(	216 builder->AddAssertion(

211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));	217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));

212 } else {	218 } else {

213 builder->AddAssertion(	219 builder->AddAssertion(

214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));	220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));

215 set_contains_anchor();	221 set_contains_anchor();

216 }	222 }

217 continue;	223 continue;

218 }	224 }

219 case '$': {	225 case '$': {

220 Advance();	226 Advance();

221 RegExpAssertion::AssertionType assertion_type =	227 RegExpAssertion::AssertionType assertion_type =

222 multiline_ ? RegExpAssertion::END_OF_LINE	228 multiline() ? RegExpAssertion::END_OF_LINE

223 : RegExpAssertion::END_OF_INPUT;	229 : RegExpAssertion::END_OF_INPUT;

224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));	230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));

225 continue;	231 continue;

226 }	232 }

227 case '.': {	233 case '.': {

228 Advance();	234 Advance();

229 // everything except \x0a, \x0d, \u2028 and \u2029	235 // everything except \x0a, \x0d, \u2028 and \u2029

230 ZoneList<CharacterRange>* ranges =	236 ZoneList<CharacterRange>* ranges =

231 new (zone()) ZoneList<CharacterRange>(2, zone());	237 new (zone()) ZoneList<CharacterRange>(2, zone());

232 CharacterRange::AddClassEscape('.', ranges, zone());	238 CharacterRange::AddClassEscape('.', ranges, zone());

233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);	239 RegExpCharacterClass* cc =

234 builder->AddAtom(atom);	240 new (zone()) RegExpCharacterClass(ranges, false);

	241 builder->AddCharacterClass(cc);

235 break;	242 break;

236 }	243 }

237 case '(': {	244 case '(': {

238 SubexpressionType subexpr_type = CAPTURE;	245 SubexpressionType subexpr_type = CAPTURE;

239 RegExpLookaround::Type lookaround_type = state->lookaround_type();	246 RegExpLookaround::Type lookaround_type = state->lookaround_type();

240 Advance();	247 Advance();

241 if (current() == '?') {	248 if (current() == '?') {

242 switch (Next()) {	249 switch (Next()) {

243 case ':':	250 case ':':

244 subexpr_type = GROUPING;	251 subexpr_type = GROUPING;

(...skipping 24 matching lines...) Expand all Loading...
269 break;	276 break;

270 }	277 }

271 Advance(2);	278 Advance(2);

272 } else {	279 } else {

273 if (captures_started_ >= kMaxCaptures) {	280 if (captures_started_ >= kMaxCaptures) {

274 ReportError(CStrVector("Too many captures") CHECK_FAILED);	281 ReportError(CStrVector("Too many captures") CHECK_FAILED);

275 }	282 }

276 captures_started_++;	283 captures_started_++;

277 }	284 }

278 // Store current state and begin new disjunction parsing.	285 // Store current state and begin new disjunction parsing.

279 state = new (zone()) RegExpParserState(	286 state =

280 state, subexpr_type, lookaround_type, captures_started_, zone());	287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,

	288 captures_started_, flags_, zone());

281 builder = state->builder();	289 builder = state->builder();

282 continue;	290 continue;

283 }	291 }

284 case '[': {	292 case '[': {

285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);	293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);

286 builder->AddAtom(atom);	294 builder->AddCharacterClass(cc->AsCharacterClass());

287 break;	295 break;

288 }	296 }

289 // Atom ::	297 // Atom ::

290 // \ AtomEscape	298 // \ AtomEscape

291 case '\\':	299 case '\\':

292 switch (Next()) {	300 switch (Next()) {

293 case kEndMarker:	301 case kEndMarker:

294 return ReportError(CStrVector("\\ at end of pattern"));	302 return ReportError(CStrVector("\\ at end of pattern"));

295 case 'b':	303 case 'b':

296 Advance(2);	304 Advance(2);

(...skipping 14 matching lines...) Expand all Loading...
311 case 'D':	319 case 'D':

312 case 's':	320 case 's':

313 case 'S':	321 case 'S':

314 case 'w':	322 case 'w':

315 case 'W': {	323 case 'W': {

316 uc32 c = Next();	324 uc32 c = Next();

317 Advance(2);	325 Advance(2);

318 ZoneList<CharacterRange>* ranges =	326 ZoneList<CharacterRange>* ranges =

319 new (zone()) ZoneList<CharacterRange>(2, zone());	327 new (zone()) ZoneList<CharacterRange>(2, zone());

320 CharacterRange::AddClassEscape(c, ranges, zone());	328 CharacterRange::AddClassEscape(c, ranges, zone());

321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);	329 RegExpCharacterClass* cc =

322 builder->AddAtom(atom);	330 new (zone()) RegExpCharacterClass(ranges, false);

	331 builder->AddCharacterClass(cc);

323 break;	332 break;

324 }	333 }

325 case '1':	334 case '1':

326 case '2':	335 case '2':

327 case '3':	336 case '3':

328 case '4':	337 case '4':

329 case '5':	338 case '5':

330 case '6':	339 case '6':

331 case '7':	340 case '7':

332 case '8':	341 case '8':

(...skipping 13 matching lines...) Expand all Loading...
346 builder->AddAtom(atom);	355 builder->AddAtom(atom);

347 }	356 }

348 break;	357 break;

349 }	358 }

350 uc32 first_digit = Next();	359 uc32 first_digit = Next();

351 if (first_digit == '8' || first_digit == '9') {	360 if (first_digit == '8' || first_digit == '9') {

352 // If the 'u' flag is present, only syntax characters can be	361 // If the 'u' flag is present, only syntax characters can be

353 // escaped,	362 // escaped,

354 // no other identity escapes are allowed. If the 'u' flag is not	363 // no other identity escapes are allowed. If the 'u' flag is not

355 // present, all identity escapes are allowed.	364 // present, all identity escapes are allowed.

356 if (!unicode_) {	365 if (!unicode()) {

357 builder->AddCharacter(first_digit);	366 builder->AddCharacter(first_digit);

358 Advance(2);	367 Advance(2);

359 } else {	368 } else {

360 return ReportError(CStrVector("Invalid escape"));	369 return ReportError(CStrVector("Invalid escape"));

361 }	370 }

362 break;	371 break;

363 }	372 }

364 }	373 }

365 // FALLTHROUGH	374 // FALLTHROUGH

366 case '0': {	375 case '0': {

(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
407 Advance(2);	416 Advance(2);

408 builder->AddCharacter(controlLetter & 0x1f);	417 builder->AddCharacter(controlLetter & 0x1f);

409 }	418 }

410 break;	419 break;

411 }	420 }

412 case 'x': {	421 case 'x': {

413 Advance(2);	422 Advance(2);

414 uc32 value;	423 uc32 value;

415 if (ParseHexEscape(2, &value)) {	424 if (ParseHexEscape(2, &value)) {

416 builder->AddCharacter(value);	425 builder->AddCharacter(value);

417 } else if (!unicode_) {	426 } else if (!unicode()) {

418 builder->AddCharacter('x');	427 builder->AddCharacter('x');

419 } else {	428 } else {

420 // If the 'u' flag is present, invalid escapes are not treated as	429 // If the 'u' flag is present, invalid escapes are not treated as

421 // identity escapes.	430 // identity escapes.

422 return ReportError(CStrVector("Invalid escape"));	431 return ReportError(CStrVector("Invalid escape"));

423 }	432 }

424 break;	433 break;

425 }	434 }

426 case 'u': {	435 case 'u': {

427 Advance(2);	436 Advance(2);

428 uc32 value;	437 uc32 value;

429 if (ParseUnicodeEscape(&value)) {	438 if (ParseUnicodeEscape(&value)) {

430 builder->AddUnicodeCharacter(value);	439 builder->AddUnicodeCharacter(value);

431 } else if (!unicode_) {	440 } else if (!unicode()) {

432 builder->AddCharacter('u');	441 builder->AddCharacter('u');

433 } else {	442 } else {

434 // If the 'u' flag is present, invalid escapes are not treated as	443 // If the 'u' flag is present, invalid escapes are not treated as

435 // identity escapes.	444 // identity escapes.

436 return ReportError(CStrVector("Invalid unicode escape"));	445 return ReportError(CStrVector("Invalid unicode escape"));

437 }	446 }

438 break;	447 break;

439 }	448 }

440 default:	449 default:

441 Advance();	450 Advance();

442 // If the 'u' flag is present, only syntax characters can be	451 // If the 'u' flag is present, only syntax characters can be

443 // escaped, no	452 // escaped, no

444 // other identity escapes are allowed. If the 'u' flag is not	453 // other identity escapes are allowed. If the 'u' flag is not

445 // present,	454 // present,

446 // all identity escapes are allowed.	455 // all identity escapes are allowed.

447 if (!unicode_ || IsSyntaxCharacter(current())) {	456 if (!unicode() || IsSyntaxCharacter(current())) {

448 builder->AddCharacter(current());	457 builder->AddCharacter(current());

449 Advance();	458 Advance();

450 } else {	459 } else {

451 return ReportError(CStrVector("Invalid escape"));	460 return ReportError(CStrVector("Invalid escape"));

452 }	461 }

453 break;	462 break;

454 }	463 }

455 break;	464 break;

456 case '{': {	465 case '{': {

457 int dummy;	466 int dummy;

(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
738 }	747 }

739 *value = val;	748 *value = val;

740 return true;	749 return true;

741 }	750 }

742	751

743	752

744 bool RegExpParser::ParseUnicodeEscape(uc32* value) {	753 bool RegExpParser::ParseUnicodeEscape(uc32* value) {

745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are	754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are

746 // allowed). In the latter case, the number of hex digits between { } is	755 // allowed). In the latter case, the number of hex digits between { } is

747 // arbitrary. \ and u have already been read.	756 // arbitrary. \ and u have already been read.

748 if (current() == '{' && unicode_) {	757 if (current() == '{' && unicode()) {

749 int start = position();	758 int start = position();

750 Advance();	759 Advance();

751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {	760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {

752 if (current() == '}') {	761 if (current() == '}') {

753 Advance();	762 Advance();

754 return true;	763 return true;

755 }	764 }

756 }	765 }

757 Reset(start);	766 Reset(start);

758 return false;	767 return false;

(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
833 // For compatibility, we interpret a decimal escape that isn't	842 // For compatibility, we interpret a decimal escape that isn't

834 // a back reference (and therefore either \0 or not valid according	843 // a back reference (and therefore either \0 or not valid according

835 // to the specification) as a 1..3 digit octal character code.	844 // to the specification) as a 1..3 digit octal character code.

836 return ParseOctalLiteral();	845 return ParseOctalLiteral();

837 case 'x': {	846 case 'x': {

838 Advance();	847 Advance();

839 uc32 value;	848 uc32 value;

840 if (ParseHexEscape(2, &value)) {	849 if (ParseHexEscape(2, &value)) {

841 return value;	850 return value;

842 }	851 }

843 if (!unicode_) {	852 if (!unicode()) {

844 // If \x is not followed by a two-digit hexadecimal, treat it	853 // If \x is not followed by a two-digit hexadecimal, treat it

845 // as an identity escape.	854 // as an identity escape.

846 return 'x';	855 return 'x';

847 }	856 }

848 // If the 'u' flag is present, invalid escapes are not treated as	857 // If the 'u' flag is present, invalid escapes are not treated as

849 // identity escapes.	858 // identity escapes.

850 ReportError(CStrVector("Invalid escape"));	859 ReportError(CStrVector("Invalid escape"));

851 return 0;	860 return 0;

852 }	861 }

853 case 'u': {	862 case 'u': {

854 Advance();	863 Advance();

855 uc32 value;	864 uc32 value;

856 if (ParseUnicodeEscape(&value)) {	865 if (ParseUnicodeEscape(&value)) {

857 return value;	866 return value;

858 }	867 }

859 if (!unicode_) {	868 if (!unicode()) {

860 return 'u';	869 return 'u';

861 }	870 }

862 // If the 'u' flag is present, invalid escapes are not treated as	871 // If the 'u' flag is present, invalid escapes are not treated as

863 // identity escapes.	872 // identity escapes.

864 ReportError(CStrVector("Invalid unicode escape"));	873 ReportError(CStrVector("Invalid unicode escape"));

865 return 0;	874 return 0;

866 }	875 }

867 default: {	876 default: {

868 uc32 result = current();	877 uc32 result = current();

869 // If the 'u' flag is present, only syntax characters can be escaped, no	878 // If the 'u' flag is present, only syntax characters can be escaped, no

870 // other identity escapes are allowed. If the 'u' flag is not present, all	879 // other identity escapes are allowed. If the 'u' flag is not present, all

871 // identity escapes are allowed.	880 // identity escapes are allowed.

872 if (!unicode_ || IsSyntaxCharacter(result)) {	881 if (!unicode() || IsSyntaxCharacter(result)) {

873 Advance();	882 Advance();

874 return result;	883 return result;

875 }	884 }

876 ReportError(CStrVector("Invalid escape"));	885 ReportError(CStrVector("Invalid escape"));

877 return 0;	886 return 0;

878 }	887 }

879 }	888 }

880 return 0;	889 return 0;

881 }	890 }

882	891

883	892

884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {	893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {

885 DCHECK_EQ(0, *char_class);	894 DCHECK_EQ(0, *char_class);

886 uc32 first = current();	895 uc32 first = current();

887 if (first == '\\') {	896 if (first == '\\') {

888 switch (Next()) {	897 switch (Next()) {

889 case 'w':	898 case 'w':

890 case 'W':	899 case 'W':

891 case 'd':	900 case 'd':

892 case 'D':	901 case 'D':

893 case 's':	902 case 's':

894 case 'S': {	903 case 'S': {

895 *char_class = Next();	904 *char_class = Next();

896 Advance(2);	905 Advance(2);

897 return CharacterRange::Singleton(0); // Return dummy value.	906 return CharacterRange::Singleton(0); // Return dummy value.

898 }	907 }

899 case kEndMarker:	908 case kEndMarker:

900 return ReportError(CStrVector("\\ at end of pattern"));	909 return ReportError(CStrVector("\\ at end of pattern"));

901 default:	910 default:

902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);	911 first = ParseClassCharacterEscape(CHECK_FAILED);

903 return CharacterRange::Singleton(c);

904 }	912 }

905 } else {	913 } else {

906 Advance();	914 Advance();

907 return CharacterRange::Singleton(first);

908 }	915 }

	916

	917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {

	918 // Combine with possibly following trail surrogate.

	919 int start = position();

	920 uc32 second = current();

	921 if (second == '\\') {

	922 second = ParseClassCharacterEscape(CHECK_FAILED);

	923 } else {

	924 Advance();

	925 }

	926 if (unibrow::Utf16::IsTrailSurrogate(second)) {

	927 first = unibrow::Utf16::CombineSurrogatePair(first, second);

	928 } else {

	929 Reset(start);

	930 }

	931 }

	932

	933 return CharacterRange::Singleton(first);

909 }	934 }

910	935

911	936

912 static const uc16 kNoCharClass = 0;	937 static const uc16 kNoCharClass = 0;

913	938

914 // Adds range or pre-defined character class to character ranges.	939 // Adds range or pre-defined character class to character ranges.

915 // If char_class is not kInvalidClass, it's interpreted as a class	940 // If char_class is not kInvalidClass, it's interpreted as a class

916 // escape (i.e., 's' means whitespace, from '\s').	941 // escape (i.e., 's' means whitespace, from '\s').

917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,	942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,

918 uc16 char_class, CharacterRange range,	943 uc16 char_class, CharacterRange range,

(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
978 is_negated = !is_negated;	1003 is_negated = !is_negated;

979 }	1004 }

980 return new (zone()) RegExpCharacterClass(ranges, is_negated);	1005 return new (zone()) RegExpCharacterClass(ranges, is_negated);

981 }	1006 }

982	1007

983	1008

984 #undef CHECK_FAILED	1009 #undef CHECK_FAILED

985	1010

986	1011

987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,	1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,

988 FlatStringReader* input, bool multiline,	1013 FlatStringReader* input, JSRegExp::Flags flags,

989 bool unicode, RegExpCompileData* result) {	1014 RegExpCompileData* result) {

990 DCHECK(result != NULL);	1015 DCHECK(result != NULL);

991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);	1016 RegExpParser parser(input, &result->error, flags, isolate, zone);

992 RegExpTree* tree = parser.ParsePattern();	1017 RegExpTree* tree = parser.ParsePattern();

993 if (parser.failed()) {	1018 if (parser.failed()) {

994 DCHECK(tree == NULL);	1019 DCHECK(tree == NULL);

995 DCHECK(!result->error.is_null());	1020 DCHECK(!result->error.is_null());

996 } else {	1021 } else {

997 DCHECK(tree != NULL);	1022 DCHECK(tree != NULL);

998 DCHECK(result->error.is_null());	1023 DCHECK(result->error.is_null());

999 if (FLAG_trace_regexp_parser) {	1024 if (FLAG_trace_regexp_parser) {

1000 OFStream os(stdout);	1025 OFStream os(stdout);

1001 tree->Print(os, zone);	1026 tree->Print(os, zone);

1002 os << "\n";	1027 os << "\n";

1003 }	1028 }

1004 result->tree = tree;	1029 result->tree = tree;

1005 int capture_count = parser.captures_started();	1030 int capture_count = parser.captures_started();

1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;	1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;

1007 result->contains_anchor = parser.contains_anchor();	1032 result->contains_anchor = parser.contains_anchor();

1008 result->capture_count = capture_count;	1033 result->capture_count = capture_count;

1009 }	1034 }

1010 return !parser.failed();	1035 return !parser.failed();

1011 }	1036 }

1012	1037

1013	1038

1014 RegExpBuilder::RegExpBuilder(Zone* zone)	1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)

1015 : zone_(zone),	1040 : zone_(zone),

1016 pending_empty_(false),	1041 pending_empty_(false),

	1042 flags_(flags),

1017 characters_(NULL),	1043 characters_(NULL),

	1044 pending_surrogate_(kNoPendingSurrogate),

1018 terms_(),	1045 terms_(),

1019 alternatives_()	1046 alternatives_()

1020 #ifdef DEBUG	1047 #ifdef DEBUG

1021 ,	1048 ,

1022 last_added_(ADD_NONE)	1049 last_added_(ADD_NONE)

1023 #endif	1050 #endif

1024 {	1051 {

1025 }	1052 }

1026	1053

1027	1054

	1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {

	1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

	1057 FlushPendingSurrogate();

	1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

	1059 pending_surrogate_ = lead_surrogate;

	1060 }

	1061

	1062

	1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

	1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

	1065 if (pending_surrogate_ != kNoPendingSurrogate) {

	1066 uc16 lead_surrogate = pending_surrogate_;

	1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

	1068 ZoneList<uc16> surrogate_pair(2, zone());

	1069 surrogate_pair.Add(lead_surrogate, zone());

	1070 surrogate_pair.Add(trail_surrogate, zone());

	1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

	1072 pending_surrogate_ = kNoPendingSurrogate;

	1073 AddAtom(atom);

	1074 } else {

	1075 pending_surrogate_ = trail_surrogate;

	1076 FlushPendingSurrogate();

	1077 }

	1078 }

	1079

	1080

	1081 void RegExpBuilder::FlushPendingSurrogate() {

	1082 if (pending_surrogate_ != kNoPendingSurrogate) {

	1083 // Use character class to desugar lone surrogate matching.

	1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(

	1085 CharacterRange::List(zone(),

	1086 CharacterRange::Singleton(pending_surrogate_)),

	1087 false);

	1088 pending_surrogate_ = kNoPendingSurrogate;

	1089 DCHECK(unicode());

	1090 AddCharacterClass(cc);

	1091 }

	1092 }

	1093

	1094

1028 void RegExpBuilder::FlushCharacters() {	1095 void RegExpBuilder::FlushCharacters() {

	1096 FlushPendingSurrogate();

1029 pending_empty_ = false;	1097 pending_empty_ = false;

1030 if (characters_ != NULL) {	1098 if (characters_ != NULL) {

1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());	1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

1032 characters_ = NULL;	1100 characters_ = NULL;

1033 text_.Add(atom, zone());	1101 text_.Add(atom, zone());

1034 LAST(ADD_ATOM);	1102 LAST(ADD_ATOM);

1035 }	1103 }

1036 }	1104 }

1037	1105

1038	1106

1039 void RegExpBuilder::FlushText() {	1107 void RegExpBuilder::FlushText() {

1040 FlushCharacters();	1108 FlushCharacters();

1041 int num_text = text_.length();	1109 int num_text = text_.length();

1042 if (num_text == 0) {	1110 if (num_text == 0) {

1043 return;	1111 return;

1044 } else if (num_text == 1) {	1112 } else if (num_text == 1) {

1045 terms_.Add(text_.last(), zone());	1113 terms_.Add(text_.last(), zone());

1046 } else {	1114 } else {

1047 RegExpText* text = new (zone()) RegExpText(zone());	1115 RegExpText* text = new (zone()) RegExpText(zone());

1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());	1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());

1049 terms_.Add(text, zone());	1117 terms_.Add(text, zone());

1050 }	1118 }

1051 text_.Clear();	1119 text_.Clear();

1052 }	1120 }

1053	1121

1054	1122

1055 void RegExpBuilder::AddCharacter(uc16 c) {	1123 void RegExpBuilder::AddCharacter(uc16 c) {

	1124 FlushPendingSurrogate();

1056 pending_empty_ = false;	1125 pending_empty_ = false;

1057 if (characters_ == NULL) {	1126 if (characters_ == NULL) {

1058 characters_ = new (zone()) ZoneList<uc16>(4, zone());	1127 characters_ = new (zone()) ZoneList<uc16>(4, zone());

1059 }	1128 }

1060 characters_->Add(c, zone());	1129 characters_->Add(c, zone());

1061 LAST(ADD_CHAR);	1130 LAST(ADD_CHAR);

1062 }	1131 }

1063	1132

1064	1133

1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {	1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {	1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

1067 ZoneList<uc16> surrogate_pair(2, zone());	1136 DCHECK(unicode());

1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());	1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));

1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());	1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());	1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

1071 AddAtom(atom);	1140 AddLeadSurrogate(c);

	1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

	1142 AddTrailSurrogate(c);

1072 } else {	1143 } else {

1073 AddCharacter(static_cast<uc16>(c));	1144 AddCharacter(static_cast<uc16>(c));

1074 }	1145 }

1075 }	1146 }

1076	1147

1077	1148

1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1079	1150

1080	1151

	1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

	1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {

	1154 // In unicode mode, character class needs to be desugared, so it

	1155 // must be a standalone term instead of being part of a RegExpText.

	1156 AddTerm(cc);

	1157 } else {

	1158 AddAtom(cc);

	1159 }

	1160 }

	1161

	1162

1081 void RegExpBuilder::AddAtom(RegExpTree* term) {	1163 void RegExpBuilder::AddAtom(RegExpTree* term) {

1082 if (term->IsEmpty()) {	1164 if (term->IsEmpty()) {

1083 AddEmpty();	1165 AddEmpty();

1084 return;	1166 return;

1085 }	1167 }

1086 if (term->IsTextElement()) {	1168 if (term->IsTextElement()) {

1087 FlushCharacters();	1169 FlushCharacters();

1088 text_.Add(term, zone());	1170 text_.Add(term, zone());

1089 } else {	1171 } else {

1090 FlushText();	1172 FlushText();

1091 terms_.Add(term, zone());	1173 terms_.Add(term, zone());

1092 }	1174 }

1093 LAST(ADD_ATOM);	1175 LAST(ADD_ATOM);

1094 }	1176 }

1095	1177

1096	1178

	1179 void RegExpBuilder::AddTerm(RegExpTree* term) {

	1180 FlushText();

	1181 terms_.Add(term, zone());

	1182 LAST(ADD_ATOM);

	1183 }

	1184

	1185

1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) {	1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) {

1098 FlushText();	1187 FlushText();

1099 terms_.Add(assert, zone());	1188 terms_.Add(assert, zone());

1100 LAST(ADD_ASSERT);	1189 LAST(ADD_ASSERT);

1101 }	1190 }

1102	1191

1103	1192

1104 void RegExpBuilder::NewAlternative() { FlushTerms(); }	1193 void RegExpBuilder::NewAlternative() { FlushTerms(); }

1105	1194

1106	1195

(...skipping 18 matching lines...) Expand all Loading...
1125 FlushTerms();	1214 FlushTerms();

1126 int num_alternatives = alternatives_.length();	1215 int num_alternatives = alternatives_.length();

1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty();	1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty();

1128 if (num_alternatives == 1) return alternatives_.last();	1217 if (num_alternatives == 1) return alternatives_.last();

1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));	1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));

1130 }	1219 }

1131	1220

1132	1221

1133 void RegExpBuilder::AddQuantifierToAtom(	1222 void RegExpBuilder::AddQuantifierToAtom(

1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {	1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {

	1224 FlushPendingSurrogate();

1135 if (pending_empty_) {	1225 if (pending_empty_) {

1136 pending_empty_ = false;	1226 pending_empty_ = false;

1137 return;	1227 return;

1138 }	1228 }

1139 RegExpTree* atom;	1229 RegExpTree* atom;

1140 if (characters_ != NULL) {	1230 if (characters_ != NULL) {

1141 DCHECK(last_added_ == ADD_CHAR);	1231 DCHECK(last_added_ == ADD_CHAR);

1142 // Last atom was character.	1232 // Last atom was character.

1143 Vector<const uc16> char_vector = characters_->ToConstVector();	1233 Vector<const uc16> char_vector = characters_->ToConstVector();

1144 int num_chars = char_vector.length();	1234 int num_chars = char_vector.length();

(...skipping 26 matching lines...) Expand all Loading...
1171 UNREACHABLE();	1261 UNREACHABLE();

1172 return;	1262 return;

1173 }	1263 }

1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1175 zone());	1265 zone());

1176 LAST(ADD_TERM);	1266 LAST(ADD_TERM);

1177 }	1267 }

1178	1268

1179 } // namespace internal	1269 } // namespace internal

1180 } // namespace v8	1270 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »