Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(1)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1578253005: [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: more tests Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/utils.h" 12 #include "src/utils.h"
13 13
14 namespace v8 { 14 namespace v8 {
15 namespace internal { 15 namespace internal {
16 16
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18 bool multiline, bool unicode, Isolate* isolate, 18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
19 Zone* zone)
20 : isolate_(isolate), 19 : isolate_(isolate),
21 zone_(zone), 20 zone_(zone),
22 error_(error), 21 error_(error),
23 captures_(NULL), 22 captures_(NULL),
24 in_(in), 23 in_(in),
25 current_(kEndMarker), 24 current_(kEndMarker),
25 flags_(flags),
26 next_pos_(0), 26 next_pos_(0),
27 captures_started_(0), 27 captures_started_(0),
28 capture_count_(0), 28 capture_count_(0),
29 has_more_(true), 29 has_more_(true),
30 multiline_(multiline),
31 unicode_(unicode),
32 simple_(false), 30 simple_(false),
33 contains_anchor_(false), 31 contains_anchor_(false),
34 is_scanned_for_captures_(false), 32 is_scanned_for_captures_(false),
35 failed_(false) { 33 failed_(false) {
36 Advance(); 34 Advance();
37 } 35 }
38 36
39 37
38 template <bool update_position>
39 uc32 RegExpParser::ReadNext() {
40 int position = next_pos_;
41 uc32 c0 = in()->Get(position);
42 position++;
43 // Read the whole surrogate pair in case of unicode flag, if possible.
44 if (unicode() && position < in()->length() &&
45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
46 uc16 c1 = in()->Get(position);
47 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
49 position++;
50 }
51 }
52 if (update_position) next_pos_ = position;
53 return c0;
54 }
55
56
40 uc32 RegExpParser::Next() { 57 uc32 RegExpParser::Next() {
41 if (has_next()) { 58 if (has_next()) {
42 return in()->Get(next_pos_); 59 return ReadNext<false>();
43 } else { 60 } else {
44 return kEndMarker; 61 return kEndMarker;
45 } 62 }
46 } 63 }
47 64
48 65
49 void RegExpParser::Advance() { 66 void RegExpParser::Advance() {
50 if (next_pos_ < in()->length()) { 67 if (has_next()) {
51 StackLimitCheck check(isolate()); 68 StackLimitCheck check(isolate());
52 if (check.HasOverflowed()) { 69 if (check.HasOverflowed()) {
53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); 70 ReportError(CStrVector(Isolate::kStackOverflowMessage));
54 } else if (zone()->excess_allocation()) { 71 } else if (zone()->excess_allocation()) {
55 ReportError(CStrVector("Regular expression too large")); 72 ReportError(CStrVector("Regular expression too large"));
56 } else { 73 } else {
57 current_ = in()->Get(next_pos_); 74 current_ = ReadNext<true>();
58 next_pos_++;
59 // Read the whole surrogate pair in case of unicode flag, if possible.
60 if (unicode_ && next_pos_ < in()->length() &&
61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
62 uc16 trail = in()->Get(next_pos_);
63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {
64 current_ = unibrow::Utf16::CombineSurrogatePair(
65 static_cast<uc16>(current_), trail);
66 next_pos_++;
67 }
68 }
69 } 75 }
70 } else { 76 } else {
71 current_ = kEndMarker; 77 current_ = kEndMarker;
72 // Advance so that position() points to 1-after-the-last-character. This is 78 // Advance so that position() points to 1-after-the-last-character. This is
73 // important so that Reset() to this position works correctly. 79 // important so that Reset() to this position works correctly.
74 next_pos_ = in()->length() + 1; 80 next_pos_ = in()->length() + 1;
75 has_more_ = false; 81 has_more_ = false;
76 } 82 }
77 } 83 }
78 84
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
135 // Alternative :: 141 // Alternative ::
136 // [empty] 142 // [empty]
137 // Term Alternative 143 // Term Alternative
138 // Term :: 144 // Term ::
139 // Assertion 145 // Assertion
140 // Atom 146 // Atom
141 // Atom Quantifier 147 // Atom Quantifier
142 RegExpTree* RegExpParser::ParseDisjunction() { 148 RegExpTree* RegExpParser::ParseDisjunction() {
143 // Used to store current state while parsing subexpressions. 149 // Used to store current state while parsing subexpressions.
144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
145 zone()); 151 flags_, zone());
146 RegExpParserState* state = &initial_state; 152 RegExpParserState* state = &initial_state;
147 // Cache the builder in a local variable for quick access. 153 // Cache the builder in a local variable for quick access.
148 RegExpBuilder* builder = initial_state.builder(); 154 RegExpBuilder* builder = initial_state.builder();
149 while (true) { 155 while (true) {
150 switch (current()) { 156 switch (current()) {
151 case kEndMarker: 157 case kEndMarker:
152 if (state->IsSubexpression()) { 158 if (state->IsSubexpression()) {
153 // Inside a parenthesized group when hitting end of input. 159 // Inside a parenthesized group when hitting end of input.
154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); 160 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
155 } 161 }
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
199 Advance(); 205 Advance();
200 builder->NewAlternative(); 206 builder->NewAlternative();
201 continue; 207 continue;
202 } 208 }
203 case '*': 209 case '*':
204 case '+': 210 case '+':
205 case '?': 211 case '?':
206 return ReportError(CStrVector("Nothing to repeat")); 212 return ReportError(CStrVector("Nothing to repeat"));
207 case '^': { 213 case '^': {
208 Advance(); 214 Advance();
209 if (multiline_) { 215 if (multiline()) {
210 builder->AddAssertion( 216 builder->AddAssertion(
211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
212 } else { 218 } else {
213 builder->AddAssertion( 219 builder->AddAssertion(
214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
215 set_contains_anchor(); 221 set_contains_anchor();
216 } 222 }
217 continue; 223 continue;
218 } 224 }
219 case '$': { 225 case '$': {
220 Advance(); 226 Advance();
221 RegExpAssertion::AssertionType assertion_type = 227 RegExpAssertion::AssertionType assertion_type =
222 multiline_ ? RegExpAssertion::END_OF_LINE 228 multiline() ? RegExpAssertion::END_OF_LINE
223 : RegExpAssertion::END_OF_INPUT; 229 : RegExpAssertion::END_OF_INPUT;
224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); 230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
225 continue; 231 continue;
226 } 232 }
227 case '.': { 233 case '.': {
228 Advance(); 234 Advance();
229 // everything except \x0a, \x0d, \u2028 and \u2029 235 // everything except \x0a, \x0d, \u2028 and \u2029
230 ZoneList<CharacterRange>* ranges = 236 ZoneList<CharacterRange>* ranges =
231 new (zone()) ZoneList<CharacterRange>(2, zone()); 237 new (zone()) ZoneList<CharacterRange>(2, zone());
232 CharacterRange::AddClassEscape('.', ranges, zone()); 238 CharacterRange::AddClassEscape('.', ranges, zone());
233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); 239 RegExpCharacterClass* cc =
234 builder->AddAtom(atom); 240 new (zone()) RegExpCharacterClass(ranges, false);
241 builder->AddCharacterClass(cc);
235 break; 242 break;
236 } 243 }
237 case '(': { 244 case '(': {
238 SubexpressionType subexpr_type = CAPTURE; 245 SubexpressionType subexpr_type = CAPTURE;
239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 246 RegExpLookaround::Type lookaround_type = state->lookaround_type();
240 Advance(); 247 Advance();
241 if (current() == '?') { 248 if (current() == '?') {
242 switch (Next()) { 249 switch (Next()) {
243 case ':': 250 case ':':
244 subexpr_type = GROUPING; 251 subexpr_type = GROUPING;
(...skipping 24 matching lines...) Expand all
269 break; 276 break;
270 } 277 }
271 Advance(2); 278 Advance(2);
272 } else { 279 } else {
273 if (captures_started_ >= kMaxCaptures) { 280 if (captures_started_ >= kMaxCaptures) {
274 ReportError(CStrVector("Too many captures") CHECK_FAILED); 281 ReportError(CStrVector("Too many captures") CHECK_FAILED);
275 } 282 }
276 captures_started_++; 283 captures_started_++;
277 } 284 }
278 // Store current state and begin new disjunction parsing. 285 // Store current state and begin new disjunction parsing.
279 state = new (zone()) RegExpParserState( 286 state =
280 state, subexpr_type, lookaround_type, captures_started_, zone()); 287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type,
288 captures_started_, flags_, zone());
281 builder = state->builder(); 289 builder = state->builder();
282 continue; 290 continue;
283 } 291 }
284 case '[': { 292 case '[': {
285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); 293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED);
286 builder->AddAtom(atom); 294 builder->AddCharacterClass(cc->AsCharacterClass());
287 break; 295 break;
288 } 296 }
289 // Atom :: 297 // Atom ::
290 // \ AtomEscape 298 // \ AtomEscape
291 case '\\': 299 case '\\':
292 switch (Next()) { 300 switch (Next()) {
293 case kEndMarker: 301 case kEndMarker:
294 return ReportError(CStrVector("\\ at end of pattern")); 302 return ReportError(CStrVector("\\ at end of pattern"));
295 case 'b': 303 case 'b':
296 Advance(2); 304 Advance(2);
(...skipping 14 matching lines...) Expand all
311 case 'D': 319 case 'D':
312 case 's': 320 case 's':
313 case 'S': 321 case 'S':
314 case 'w': 322 case 'w':
315 case 'W': { 323 case 'W': {
316 uc32 c = Next(); 324 uc32 c = Next();
317 Advance(2); 325 Advance(2);
318 ZoneList<CharacterRange>* ranges = 326 ZoneList<CharacterRange>* ranges =
319 new (zone()) ZoneList<CharacterRange>(2, zone()); 327 new (zone()) ZoneList<CharacterRange>(2, zone());
320 CharacterRange::AddClassEscape(c, ranges, zone()); 328 CharacterRange::AddClassEscape(c, ranges, zone());
321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); 329 RegExpCharacterClass* cc =
322 builder->AddAtom(atom); 330 new (zone()) RegExpCharacterClass(ranges, false);
331 builder->AddCharacterClass(cc);
323 break; 332 break;
324 } 333 }
325 case '1': 334 case '1':
326 case '2': 335 case '2':
327 case '3': 336 case '3':
328 case '4': 337 case '4':
329 case '5': 338 case '5':
330 case '6': 339 case '6':
331 case '7': 340 case '7':
332 case '8': 341 case '8':
(...skipping 13 matching lines...) Expand all
346 builder->AddAtom(atom); 355 builder->AddAtom(atom);
347 } 356 }
348 break; 357 break;
349 } 358 }
350 uc32 first_digit = Next(); 359 uc32 first_digit = Next();
351 if (first_digit == '8' || first_digit == '9') { 360 if (first_digit == '8' || first_digit == '9') {
352 // If the 'u' flag is present, only syntax characters can be 361 // If the 'u' flag is present, only syntax characters can be
353 // escaped, 362 // escaped,
354 // no other identity escapes are allowed. If the 'u' flag is not 363 // no other identity escapes are allowed. If the 'u' flag is not
355 // present, all identity escapes are allowed. 364 // present, all identity escapes are allowed.
356 if (!unicode_) { 365 if (!unicode()) {
357 builder->AddCharacter(first_digit); 366 builder->AddCharacter(first_digit);
358 Advance(2); 367 Advance(2);
359 } else { 368 } else {
360 return ReportError(CStrVector("Invalid escape")); 369 return ReportError(CStrVector("Invalid escape"));
361 } 370 }
362 break; 371 break;
363 } 372 }
364 } 373 }
365 // FALLTHROUGH 374 // FALLTHROUGH
366 case '0': { 375 case '0': {
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
407 Advance(2); 416 Advance(2);
408 builder->AddCharacter(controlLetter & 0x1f); 417 builder->AddCharacter(controlLetter & 0x1f);
409 } 418 }
410 break; 419 break;
411 } 420 }
412 case 'x': { 421 case 'x': {
413 Advance(2); 422 Advance(2);
414 uc32 value; 423 uc32 value;
415 if (ParseHexEscape(2, &value)) { 424 if (ParseHexEscape(2, &value)) {
416 builder->AddCharacter(value); 425 builder->AddCharacter(value);
417 } else if (!unicode_) { 426 } else if (!unicode()) {
418 builder->AddCharacter('x'); 427 builder->AddCharacter('x');
419 } else { 428 } else {
420 // If the 'u' flag is present, invalid escapes are not treated as 429 // If the 'u' flag is present, invalid escapes are not treated as
421 // identity escapes. 430 // identity escapes.
422 return ReportError(CStrVector("Invalid escape")); 431 return ReportError(CStrVector("Invalid escape"));
423 } 432 }
424 break; 433 break;
425 } 434 }
426 case 'u': { 435 case 'u': {
427 Advance(2); 436 Advance(2);
428 uc32 value; 437 uc32 value;
429 if (ParseUnicodeEscape(&value)) { 438 if (ParseUnicodeEscape(&value)) {
430 builder->AddUnicodeCharacter(value); 439 builder->AddUnicodeCharacter(value);
431 } else if (!unicode_) { 440 } else if (!unicode()) {
432 builder->AddCharacter('u'); 441 builder->AddCharacter('u');
433 } else { 442 } else {
434 // If the 'u' flag is present, invalid escapes are not treated as 443 // If the 'u' flag is present, invalid escapes are not treated as
435 // identity escapes. 444 // identity escapes.
436 return ReportError(CStrVector("Invalid unicode escape")); 445 return ReportError(CStrVector("Invalid unicode escape"));
437 } 446 }
438 break; 447 break;
439 } 448 }
440 default: 449 default:
441 Advance(); 450 Advance();
442 // If the 'u' flag is present, only syntax characters can be 451 // If the 'u' flag is present, only syntax characters can be
443 // escaped, no 452 // escaped, no
444 // other identity escapes are allowed. If the 'u' flag is not 453 // other identity escapes are allowed. If the 'u' flag is not
445 // present, 454 // present,
446 // all identity escapes are allowed. 455 // all identity escapes are allowed.
447 if (!unicode_ || IsSyntaxCharacter(current())) { 456 if (!unicode() || IsSyntaxCharacter(current())) {
448 builder->AddCharacter(current()); 457 builder->AddCharacter(current());
449 Advance(); 458 Advance();
450 } else { 459 } else {
451 return ReportError(CStrVector("Invalid escape")); 460 return ReportError(CStrVector("Invalid escape"));
452 } 461 }
453 break; 462 break;
454 } 463 }
455 break; 464 break;
456 case '{': { 465 case '{': {
457 int dummy; 466 int dummy;
(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after
738 } 747 }
739 *value = val; 748 *value = val;
740 return true; 749 return true;
741 } 750 }
742 751
743 752
744 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 753 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
745 // Accept both \uxxxx and \u{xxxxxx} (the latter only when the 'u' flag is 754 // Accept both \uxxxx and \u{xxxxxx} (the latter only when the 'u' flag is
746 // set). In the latter case, the number of hex digits between { } is 755 // set). In the latter case, the number of hex digits between { } is
747 // arbitrary. \ and u have already been read. 756 // arbitrary. \ and u have already been read.
748 if (current() == '{' && unicode_) { 757 if (current() == '{' && unicode()) {
749 int start = position(); 758 int start = position();
750 Advance(); 759 Advance();
751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
752 if (current() == '}') { 761 if (current() == '}') {
753 Advance(); 762 Advance();
754 return true; 763 return true;
755 } 764 }
756 } 765 }
757 Reset(start); 766 Reset(start);
758 return false; 767 return false;
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
833 // For compatibility, we interpret a decimal escape that isn't 842 // For compatibility, we interpret a decimal escape that isn't
834 // a back reference (and therefore either \0 or not valid according 843 // a back reference (and therefore either \0 or not valid according
835 // to the specification) as a 1..3 digit octal character code. 844 // to the specification) as a 1..3 digit octal character code.
836 return ParseOctalLiteral(); 845 return ParseOctalLiteral();
837 case 'x': { 846 case 'x': {
838 Advance(); 847 Advance();
839 uc32 value; 848 uc32 value;
840 if (ParseHexEscape(2, &value)) { 849 if (ParseHexEscape(2, &value)) {
841 return value; 850 return value;
842 } 851 }
843 if (!unicode_) { 852 if (!unicode()) {
844 // If \x is not followed by a two-digit hexadecimal, treat it 853 // If \x is not followed by a two-digit hexadecimal, treat it
845 // as an identity escape. 854 // as an identity escape.
846 return 'x'; 855 return 'x';
847 } 856 }
848 // If the 'u' flag is present, invalid escapes are not treated as 857 // If the 'u' flag is present, invalid escapes are not treated as
849 // identity escapes. 858 // identity escapes.
850 ReportError(CStrVector("Invalid escape")); 859 ReportError(CStrVector("Invalid escape"));
851 return 0; 860 return 0;
852 } 861 }
853 case 'u': { 862 case 'u': {
854 Advance(); 863 Advance();
855 uc32 value; 864 uc32 value;
856 if (ParseUnicodeEscape(&value)) { 865 if (ParseUnicodeEscape(&value)) {
857 return value; 866 return value;
858 } 867 }
859 if (!unicode_) { 868 if (!unicode()) {
860 return 'u'; 869 return 'u';
861 } 870 }
862 // If the 'u' flag is present, invalid escapes are not treated as 871 // If the 'u' flag is present, invalid escapes are not treated as
863 // identity escapes. 872 // identity escapes.
864 ReportError(CStrVector("Invalid unicode escape")); 873 ReportError(CStrVector("Invalid unicode escape"));
865 return 0; 874 return 0;
866 } 875 }
867 default: { 876 default: {
868 uc32 result = current(); 877 uc32 result = current();
869 // If the 'u' flag is present, only syntax characters can be escaped, no 878 // If the 'u' flag is present, only syntax characters can be escaped, no
870 // other identity escapes are allowed. If the 'u' flag is not present, all 879 // other identity escapes are allowed. If the 'u' flag is not present, all
871 // identity escapes are allowed. 880 // identity escapes are allowed.
872 if (!unicode_ || IsSyntaxCharacter(result)) { 881 if (!unicode() || IsSyntaxCharacter(result)) {
873 Advance(); 882 Advance();
874 return result; 883 return result;
875 } 884 }
876 ReportError(CStrVector("Invalid escape")); 885 ReportError(CStrVector("Invalid escape"));
877 return 0; 886 return 0;
878 } 887 }
879 } 888 }
880 return 0; 889 return 0;
881 } 890 }
882 891
883 892
884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
885 DCHECK_EQ(0, *char_class); 894 DCHECK_EQ(0, *char_class);
886 uc32 first = current(); 895 uc32 first = current();
887 if (first == '\\') { 896 if (first == '\\') {
888 switch (Next()) { 897 switch (Next()) {
889 case 'w': 898 case 'w':
890 case 'W': 899 case 'W':
891 case 'd': 900 case 'd':
892 case 'D': 901 case 'D':
893 case 's': 902 case 's':
894 case 'S': { 903 case 'S': {
895 *char_class = Next(); 904 *char_class = Next();
896 Advance(2); 905 Advance(2);
897 return CharacterRange::Singleton(0); // Return dummy value. 906 return CharacterRange::Singleton(0); // Return dummy value.
898 } 907 }
899 case kEndMarker: 908 case kEndMarker:
900 return ReportError(CStrVector("\\ at end of pattern")); 909 return ReportError(CStrVector("\\ at end of pattern"));
901 default: 910 default:
902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); 911 first = ParseClassCharacterEscape(CHECK_FAILED);
903 return CharacterRange::Singleton(c);
904 } 912 }
905 } else { 913 } else {
906 Advance(); 914 Advance();
907 return CharacterRange::Singleton(first);
908 } 915 }
916
917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
918 // Combine with possibly following trail surrogate.
919 int start = position();
920 uc32 second = current();
921 if (second == '\\') {
922 second = ParseClassCharacterEscape(CHECK_FAILED);
923 } else {
924 Advance();
925 }
926 if (unibrow::Utf16::IsTrailSurrogate(second)) {
927 first = unibrow::Utf16::CombineSurrogatePair(first, second);
928 } else {
929 Reset(start);
930 }
931 }
932
933 return CharacterRange::Singleton(first);
909 } 934 }
910 935
911 936
912 static const uc16 kNoCharClass = 0; 937 static const uc16 kNoCharClass = 0;
913 938
914 // Adds range or pre-defined character class to character ranges. 939 // Adds range or pre-defined character class to character ranges.
915 // If char_class is not kNoCharClass, it's interpreted as a class 940 // If char_class is not kNoCharClass, it's interpreted as a class
916 // escape (i.e., 's' means whitespace, from '\s'). 941 // escape (i.e., 's' means whitespace, from '\s').
917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
918 uc16 char_class, CharacterRange range, 943 uc16 char_class, CharacterRange range,
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
978 is_negated = !is_negated; 1003 is_negated = !is_negated;
979 } 1004 }
980 return new (zone()) RegExpCharacterClass(ranges, is_negated); 1005 return new (zone()) RegExpCharacterClass(ranges, is_negated);
981 } 1006 }
982 1007
983 1008
984 #undef CHECK_FAILED 1009 #undef CHECK_FAILED
985 1010
986 1011
987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, 1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
988 FlatStringReader* input, bool multiline, 1013 FlatStringReader* input, JSRegExp::Flags flags,
989 bool unicode, RegExpCompileData* result) { 1014 RegExpCompileData* result) {
990 DCHECK(result != NULL); 1015 DCHECK(result != NULL);
991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); 1016 RegExpParser parser(input, &result->error, flags, isolate, zone);
992 RegExpTree* tree = parser.ParsePattern(); 1017 RegExpTree* tree = parser.ParsePattern();
993 if (parser.failed()) { 1018 if (parser.failed()) {
994 DCHECK(tree == NULL); 1019 DCHECK(tree == NULL);
995 DCHECK(!result->error.is_null()); 1020 DCHECK(!result->error.is_null());
996 } else { 1021 } else {
997 DCHECK(tree != NULL); 1022 DCHECK(tree != NULL);
998 DCHECK(result->error.is_null()); 1023 DCHECK(result->error.is_null());
999 if (FLAG_trace_regexp_parser) { 1024 if (FLAG_trace_regexp_parser) {
1000 OFStream os(stdout); 1025 OFStream os(stdout);
1001 tree->Print(os, zone); 1026 tree->Print(os, zone);
1002 os << "\n"; 1027 os << "\n";
1003 } 1028 }
1004 result->tree = tree; 1029 result->tree = tree;
1005 int capture_count = parser.captures_started(); 1030 int capture_count = parser.captures_started();
1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1007 result->contains_anchor = parser.contains_anchor(); 1032 result->contains_anchor = parser.contains_anchor();
1008 result->capture_count = capture_count; 1033 result->capture_count = capture_count;
1009 } 1034 }
1010 return !parser.failed(); 1035 return !parser.failed();
1011 } 1036 }
1012 1037
1013 1038
1014 RegExpBuilder::RegExpBuilder(Zone* zone) 1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags)
1015 : zone_(zone), 1040 : zone_(zone),
1016 pending_empty_(false), 1041 pending_empty_(false),
1042 flags_(flags),
1017 characters_(NULL), 1043 characters_(NULL),
1044 pending_surrogate_(kNoPendingSurrogate),
1018 terms_(), 1045 terms_(),
1019 alternatives_() 1046 alternatives_()
1020 #ifdef DEBUG 1047 #ifdef DEBUG
1021 , 1048 ,
1022 last_added_(ADD_NONE) 1049 last_added_(ADD_NONE)
1023 #endif 1050 #endif
1024 { 1051 {
1025 } 1052 }
1026 1053
1027 1054
1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1057 FlushPendingSurrogate();
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1059 pending_surrogate_ = lead_surrogate;
1060 }
1061
1062
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1065 if (pending_surrogate_ != kNoPendingSurrogate) {
1066 uc16 lead_surrogate = pending_surrogate_;
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1068 ZoneList<uc16> surrogate_pair(2, zone());
1069 surrogate_pair.Add(lead_surrogate, zone());
1070 surrogate_pair.Add(trail_surrogate, zone());
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1072 pending_surrogate_ = kNoPendingSurrogate;
1073 AddAtom(atom);
1074 } else {
1075 pending_surrogate_ = trail_surrogate;
1076 FlushPendingSurrogate();
1077 }
1078 }
1079
1080
1081 void RegExpBuilder::FlushPendingSurrogate() {
1082 if (pending_surrogate_ != kNoPendingSurrogate) {
1083 // Use character class to desugar lone surrogate matching.
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
1085 CharacterRange::List(zone(),
1086 CharacterRange::Singleton(pending_surrogate_)),
1087 false);
1088 pending_surrogate_ = kNoPendingSurrogate;
1089 DCHECK(unicode());
1090 AddCharacterClass(cc);
1091 }
1092 }
1093
1094
1028 void RegExpBuilder::FlushCharacters() { 1095 void RegExpBuilder::FlushCharacters() {
1096 FlushPendingSurrogate();
1029 pending_empty_ = false; 1097 pending_empty_ = false;
1030 if (characters_ != NULL) { 1098 if (characters_ != NULL) {
1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1032 characters_ = NULL; 1100 characters_ = NULL;
1033 text_.Add(atom, zone()); 1101 text_.Add(atom, zone());
1034 LAST(ADD_ATOM); 1102 LAST(ADD_ATOM);
1035 } 1103 }
1036 } 1104 }
1037 1105
1038 1106
1039 void RegExpBuilder::FlushText() { 1107 void RegExpBuilder::FlushText() {
1040 FlushCharacters(); 1108 FlushCharacters();
1041 int num_text = text_.length(); 1109 int num_text = text_.length();
1042 if (num_text == 0) { 1110 if (num_text == 0) {
1043 return; 1111 return;
1044 } else if (num_text == 1) { 1112 } else if (num_text == 1) {
1045 terms_.Add(text_.last(), zone()); 1113 terms_.Add(text_.last(), zone());
1046 } else { 1114 } else {
1047 RegExpText* text = new (zone()) RegExpText(zone()); 1115 RegExpText* text = new (zone()) RegExpText(zone());
1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1049 terms_.Add(text, zone()); 1117 terms_.Add(text, zone());
1050 } 1118 }
1051 text_.Clear(); 1119 text_.Clear();
1052 } 1120 }
1053 1121
1054 1122
1055 void RegExpBuilder::AddCharacter(uc16 c) { 1123 void RegExpBuilder::AddCharacter(uc16 c) {
1124 FlushPendingSurrogate();
1056 pending_empty_ = false; 1125 pending_empty_ = false;
1057 if (characters_ == NULL) { 1126 if (characters_ == NULL) {
1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1127 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1059 } 1128 }
1060 characters_->Add(c, zone()); 1129 characters_->Add(c, zone());
1061 LAST(ADD_CHAR); 1130 LAST(ADD_CHAR);
1062 } 1131 }
1063 1132
1064 1133
1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { 1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1067 ZoneList<uc16> surrogate_pair(2, zone()); 1136 DCHECK(unicode());
1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); 1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); 1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); 1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1071 AddAtom(atom); 1140 AddLeadSurrogate(c);
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1142 AddTrailSurrogate(c);
1072 } else { 1143 } else {
1073 AddCharacter(static_cast<uc16>(c)); 1144 AddCharacter(static_cast<uc16>(c));
1074 } 1145 }
1075 } 1146 }
1076 1147
1077 1148
1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1079 1150
1080 1151
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
1154 // In unicode mode, character class needs to be desugared, so it
1155 // must be a standalone term instead of being part of a RegExpText.
1156 AddTerm(cc);
1157 } else {
1158 AddAtom(cc);
1159 }
1160 }
1161
1162
1081 void RegExpBuilder::AddAtom(RegExpTree* term) { 1163 void RegExpBuilder::AddAtom(RegExpTree* term) {
1082 if (term->IsEmpty()) { 1164 if (term->IsEmpty()) {
1083 AddEmpty(); 1165 AddEmpty();
1084 return; 1166 return;
1085 } 1167 }
1086 if (term->IsTextElement()) { 1168 if (term->IsTextElement()) {
1087 FlushCharacters(); 1169 FlushCharacters();
1088 text_.Add(term, zone()); 1170 text_.Add(term, zone());
1089 } else { 1171 } else {
1090 FlushText(); 1172 FlushText();
1091 terms_.Add(term, zone()); 1173 terms_.Add(term, zone());
1092 } 1174 }
1093 LAST(ADD_ATOM); 1175 LAST(ADD_ATOM);
1094 } 1176 }
1095 1177
1096 1178
1179 void RegExpBuilder::AddTerm(RegExpTree* term) {
1180 FlushText();
1181 terms_.Add(term, zone());
1182 LAST(ADD_ATOM);
1183 }
1184
1185
1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1098 FlushText(); 1187 FlushText();
1099 terms_.Add(assert, zone()); 1188 terms_.Add(assert, zone());
1100 LAST(ADD_ASSERT); 1189 LAST(ADD_ASSERT);
1101 } 1190 }
1102 1191
1103 1192
1104 void RegExpBuilder::NewAlternative() { FlushTerms(); } 1193 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1105 1194
1106 1195
(...skipping 18 matching lines...) Expand all
1125 FlushTerms(); 1214 FlushTerms();
1126 int num_alternatives = alternatives_.length(); 1215 int num_alternatives = alternatives_.length();
1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1128 if (num_alternatives == 1) return alternatives_.last(); 1217 if (num_alternatives == 1) return alternatives_.last();
1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1130 } 1219 }
1131 1220
1132 1221
1133 void RegExpBuilder::AddQuantifierToAtom( 1222 void RegExpBuilder::AddQuantifierToAtom(
1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1224 FlushPendingSurrogate();
1135 if (pending_empty_) { 1225 if (pending_empty_) {
1136 pending_empty_ = false; 1226 pending_empty_ = false;
1137 return; 1227 return;
1138 } 1228 }
1139 RegExpTree* atom; 1229 RegExpTree* atom;
1140 if (characters_ != NULL) { 1230 if (characters_ != NULL) {
1141 DCHECK(last_added_ == ADD_CHAR); 1231 DCHECK(last_added_ == ADD_CHAR);
1142 // Last atom was character. 1232 // Last atom was character.
1143 Vector<const uc16> char_vector = characters_->ToConstVector(); 1233 Vector<const uc16> char_vector = characters_->ToConstVector();
1144 int num_chars = char_vector.length(); 1234 int num_chars = char_vector.length();
(...skipping 26 matching lines...) Expand all
1171 UNREACHABLE(); 1261 UNREACHABLE();
1172 return; 1262 return;
1173 } 1263 }
1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1175 zone()); 1265 zone());
1176 LAST(ADD_TERM); 1266 LAST(ADD_TERM);
1177 } 1267 }
1178 1268
1179 } // namespace internal 1269 } // namespace internal
1180 } // namespace v8 1270 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698