Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(175)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1618753002: Revert of [regexp] implement character classes for unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/utils.h" 12 #include "src/utils.h"
13 13
14 namespace v8 { 14 namespace v8 {
15 namespace internal { 15 namespace internal {
16 16
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) 18 bool multiline, bool unicode, Isolate* isolate,
19 Zone* zone)
19 : isolate_(isolate), 20 : isolate_(isolate),
20 zone_(zone), 21 zone_(zone),
21 error_(error), 22 error_(error),
22 captures_(NULL), 23 captures_(NULL),
23 in_(in), 24 in_(in),
24 current_(kEndMarker), 25 current_(kEndMarker),
25 flags_(flags),
26 next_pos_(0), 26 next_pos_(0),
27 captures_started_(0), 27 captures_started_(0),
28 capture_count_(0), 28 capture_count_(0),
29 has_more_(true), 29 has_more_(true),
30 multiline_(multiline),
31 unicode_(unicode),
30 simple_(false), 32 simple_(false),
31 contains_anchor_(false), 33 contains_anchor_(false),
32 is_scanned_for_captures_(false), 34 is_scanned_for_captures_(false),
33 failed_(false) { 35 failed_(false) {
34 Advance(); 36 Advance();
35 } 37 }
36 38
37 39
38 template <bool update_position>
39 uc32 RegExpParser::ReadNext() {
40 int position = next_pos_;
41 uc32 c0 = in()->Get(position);
42 position++;
43 // Read the whole surrogate pair in case of unicode flag, if possible.
44 if (unicode() && position < in()->length() &&
45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) {
46 uc16 c1 = in()->Get(position);
47 if (unibrow::Utf16::IsTrailSurrogate(c1)) {
48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1);
49 position++;
50 }
51 }
52 if (update_position) next_pos_ = position;
53 return c0;
54 }
55
56
57 uc32 RegExpParser::Next() { 40 uc32 RegExpParser::Next() {
58 if (has_next()) { 41 if (has_next()) {
59 return ReadNext<false>(); 42 return in()->Get(next_pos_);
60 } else { 43 } else {
61 return kEndMarker; 44 return kEndMarker;
62 } 45 }
63 } 46 }
64 47
65 48
66 void RegExpParser::Advance() { 49 void RegExpParser::Advance() {
67 if (has_next()) { 50 if (next_pos_ < in()->length()) {
68 StackLimitCheck check(isolate()); 51 StackLimitCheck check(isolate());
69 if (check.HasOverflowed()) { 52 if (check.HasOverflowed()) {
70 ReportError(CStrVector(Isolate::kStackOverflowMessage)); 53 ReportError(CStrVector(Isolate::kStackOverflowMessage));
71 } else if (zone()->excess_allocation()) { 54 } else if (zone()->excess_allocation()) {
72 ReportError(CStrVector("Regular expression too large")); 55 ReportError(CStrVector("Regular expression too large"));
73 } else { 56 } else {
74 current_ = ReadNext<true>(); 57 current_ = in()->Get(next_pos_);
58 next_pos_++;
59 // Read the whole surrogate pair in case of unicode flag, if possible.
60 if (unicode_ && next_pos_ < in()->length() &&
61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) {
62 uc16 trail = in()->Get(next_pos_);
63 if (unibrow::Utf16::IsTrailSurrogate(trail)) {
64 current_ = unibrow::Utf16::CombineSurrogatePair(
65 static_cast<uc16>(current_), trail);
66 next_pos_++;
67 }
68 }
75 } 69 }
76 } else { 70 } else {
77 current_ = kEndMarker; 71 current_ = kEndMarker;
78 // Advance so that position() points to 1-after-the-last-character. This is 72 // Advance so that position() points to 1-after-the-last-character. This is
79 // important so that Reset() to this position works correctly. 73 // important so that Reset() to this position works correctly.
80 next_pos_ = in()->length() + 1; 74 next_pos_ = in()->length() + 1;
81 has_more_ = false; 75 has_more_ = false;
82 } 76 }
83 } 77 }
84 78
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after
141 // Alternative :: 135 // Alternative ::
142 // [empty] 136 // [empty]
143 // Term Alternative 137 // Term Alternative
144 // Term :: 138 // Term ::
145 // Assertion 139 // Assertion
146 // Atom 140 // Atom
147 // Atom Quantifier 141 // Atom Quantifier
148 RegExpTree* RegExpParser::ParseDisjunction() { 142 RegExpTree* RegExpParser::ParseDisjunction() {
149 // Used to store current state while parsing subexpressions. 143 // Used to store current state while parsing subexpressions.
150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, 144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0,
151 flags_, zone()); 145 zone());
152 RegExpParserState* state = &initial_state; 146 RegExpParserState* state = &initial_state;
153 // Cache the builder in a local variable for quick access. 147 // Cache the builder in a local variable for quick access.
154 RegExpBuilder* builder = initial_state.builder(); 148 RegExpBuilder* builder = initial_state.builder();
155 while (true) { 149 while (true) {
156 switch (current()) { 150 switch (current()) {
157 case kEndMarker: 151 case kEndMarker:
158 if (state->IsSubexpression()) { 152 if (state->IsSubexpression()) {
159 // Inside a parenthesized group when hitting end of input. 153 // Inside a parenthesized group when hitting end of input.
160 ReportError(CStrVector("Unterminated group") CHECK_FAILED); 154 ReportError(CStrVector("Unterminated group") CHECK_FAILED);
161 } 155 }
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after
205 Advance(); 199 Advance();
206 builder->NewAlternative(); 200 builder->NewAlternative();
207 continue; 201 continue;
208 } 202 }
209 case '*': 203 case '*':
210 case '+': 204 case '+':
211 case '?': 205 case '?':
212 return ReportError(CStrVector("Nothing to repeat")); 206 return ReportError(CStrVector("Nothing to repeat"));
213 case '^': { 207 case '^': {
214 Advance(); 208 Advance();
215 if (multiline()) { 209 if (multiline_) {
216 builder->AddAssertion( 210 builder->AddAssertion(
217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); 211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE));
218 } else { 212 } else {
219 builder->AddAssertion( 213 builder->AddAssertion(
220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); 214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT));
221 set_contains_anchor(); 215 set_contains_anchor();
222 } 216 }
223 continue; 217 continue;
224 } 218 }
225 case '$': { 219 case '$': {
226 Advance(); 220 Advance();
227 RegExpAssertion::AssertionType assertion_type = 221 RegExpAssertion::AssertionType assertion_type =
228 multiline() ? RegExpAssertion::END_OF_LINE 222 multiline_ ? RegExpAssertion::END_OF_LINE
229 : RegExpAssertion::END_OF_INPUT; 223 : RegExpAssertion::END_OF_INPUT;
230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); 224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type));
231 continue; 225 continue;
232 } 226 }
233 case '.': { 227 case '.': {
234 Advance(); 228 Advance();
235 // everything except \x0a, \x0d, \u2028 and \u2029 229 // everything except \x0a, \x0d, \u2028 and \u2029
236 ZoneList<CharacterRange>* ranges = 230 ZoneList<CharacterRange>* ranges =
237 new (zone()) ZoneList<CharacterRange>(2, zone()); 231 new (zone()) ZoneList<CharacterRange>(2, zone());
238 CharacterRange::AddClassEscape('.', ranges, zone()); 232 CharacterRange::AddClassEscape('.', ranges, zone());
239 RegExpCharacterClass* cc = 233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
240 new (zone()) RegExpCharacterClass(ranges, false); 234 builder->AddAtom(atom);
241 builder->AddCharacterClass(cc);
242 break; 235 break;
243 } 236 }
244 case '(': { 237 case '(': {
245 SubexpressionType subexpr_type = CAPTURE; 238 SubexpressionType subexpr_type = CAPTURE;
246 RegExpLookaround::Type lookaround_type = state->lookaround_type(); 239 RegExpLookaround::Type lookaround_type = state->lookaround_type();
247 Advance(); 240 Advance();
248 if (current() == '?') { 241 if (current() == '?') {
249 switch (Next()) { 242 switch (Next()) {
250 case ':': 243 case ':':
251 subexpr_type = GROUPING; 244 subexpr_type = GROUPING;
(...skipping 24 matching lines...) Expand all
276 break; 269 break;
277 } 270 }
278 Advance(2); 271 Advance(2);
279 } else { 272 } else {
280 if (captures_started_ >= kMaxCaptures) { 273 if (captures_started_ >= kMaxCaptures) {
281 ReportError(CStrVector("Too many captures") CHECK_FAILED); 274 ReportError(CStrVector("Too many captures") CHECK_FAILED);
282 } 275 }
283 captures_started_++; 276 captures_started_++;
284 } 277 }
285 // Store current state and begin new disjunction parsing. 278 // Store current state and begin new disjunction parsing.
286 state = 279 state = new (zone()) RegExpParserState(
287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, 280 state, subexpr_type, lookaround_type, captures_started_, zone());
288 captures_started_, flags_, zone());
289 builder = state->builder(); 281 builder = state->builder();
290 continue; 282 continue;
291 } 283 }
292 case '[': { 284 case '[': {
293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); 285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED);
294 builder->AddCharacterClass(cc->AsCharacterClass()); 286 builder->AddAtom(atom);
295 break; 287 break;
296 } 288 }
297 // Atom :: 289 // Atom ::
298 // \ AtomEscape 290 // \ AtomEscape
299 case '\\': 291 case '\\':
300 switch (Next()) { 292 switch (Next()) {
301 case kEndMarker: 293 case kEndMarker:
302 return ReportError(CStrVector("\\ at end of pattern")); 294 return ReportError(CStrVector("\\ at end of pattern"));
303 case 'b': 295 case 'b':
304 Advance(2); 296 Advance(2);
(...skipping 14 matching lines...) Expand all
319 case 'D': 311 case 'D':
320 case 's': 312 case 's':
321 case 'S': 313 case 'S':
322 case 'w': 314 case 'w':
323 case 'W': { 315 case 'W': {
324 uc32 c = Next(); 316 uc32 c = Next();
325 Advance(2); 317 Advance(2);
326 ZoneList<CharacterRange>* ranges = 318 ZoneList<CharacterRange>* ranges =
327 new (zone()) ZoneList<CharacterRange>(2, zone()); 319 new (zone()) ZoneList<CharacterRange>(2, zone());
328 CharacterRange::AddClassEscape(c, ranges, zone()); 320 CharacterRange::AddClassEscape(c, ranges, zone());
329 RegExpCharacterClass* cc = 321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false);
330 new (zone()) RegExpCharacterClass(ranges, false); 322 builder->AddAtom(atom);
331 builder->AddCharacterClass(cc);
332 break; 323 break;
333 } 324 }
334 case '1': 325 case '1':
335 case '2': 326 case '2':
336 case '3': 327 case '3':
337 case '4': 328 case '4':
338 case '5': 329 case '5':
339 case '6': 330 case '6':
340 case '7': 331 case '7':
341 case '8': 332 case '8':
(...skipping 13 matching lines...) Expand all
355 builder->AddAtom(atom); 346 builder->AddAtom(atom);
356 } 347 }
357 break; 348 break;
358 } 349 }
359 uc32 first_digit = Next(); 350 uc32 first_digit = Next();
360 if (first_digit == '8' || first_digit == '9') { 351 if (first_digit == '8' || first_digit == '9') {
361 // If the 'u' flag is present, only syntax characters can be 352 // If the 'u' flag is present, only syntax characters can be
362 // escaped, 353 // escaped,
363 // no other identity escapes are allowed. If the 'u' flag is not 354 // no other identity escapes are allowed. If the 'u' flag is not
364 // present, all identity escapes are allowed. 355 // present, all identity escapes are allowed.
365 if (!unicode()) { 356 if (!unicode_) {
366 builder->AddCharacter(first_digit); 357 builder->AddCharacter(first_digit);
367 Advance(2); 358 Advance(2);
368 } else { 359 } else {
369 return ReportError(CStrVector("Invalid escape")); 360 return ReportError(CStrVector("Invalid escape"));
370 } 361 }
371 break; 362 break;
372 } 363 }
373 } 364 }
374 // FALLTHROUGH 365 // FALLTHROUGH
375 case '0': { 366 case '0': {
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after
416 Advance(2); 407 Advance(2);
417 builder->AddCharacter(controlLetter & 0x1f); 408 builder->AddCharacter(controlLetter & 0x1f);
418 } 409 }
419 break; 410 break;
420 } 411 }
421 case 'x': { 412 case 'x': {
422 Advance(2); 413 Advance(2);
423 uc32 value; 414 uc32 value;
424 if (ParseHexEscape(2, &value)) { 415 if (ParseHexEscape(2, &value)) {
425 builder->AddCharacter(value); 416 builder->AddCharacter(value);
426 } else if (!unicode()) { 417 } else if (!unicode_) {
427 builder->AddCharacter('x'); 418 builder->AddCharacter('x');
428 } else { 419 } else {
429 // If the 'u' flag is present, invalid escapes are not treated as 420 // If the 'u' flag is present, invalid escapes are not treated as
430 // identity escapes. 421 // identity escapes.
431 return ReportError(CStrVector("Invalid escape")); 422 return ReportError(CStrVector("Invalid escape"));
432 } 423 }
433 break; 424 break;
434 } 425 }
435 case 'u': { 426 case 'u': {
436 Advance(2); 427 Advance(2);
437 uc32 value; 428 uc32 value;
438 if (ParseUnicodeEscape(&value)) { 429 if (ParseUnicodeEscape(&value)) {
439 builder->AddUnicodeCharacter(value); 430 builder->AddUnicodeCharacter(value);
440 } else if (!unicode()) { 431 } else if (!unicode_) {
441 builder->AddCharacter('u'); 432 builder->AddCharacter('u');
442 } else { 433 } else {
443 // If the 'u' flag is present, invalid escapes are not treated as 434 // If the 'u' flag is present, invalid escapes are not treated as
444 // identity escapes. 435 // identity escapes.
445 return ReportError(CStrVector("Invalid unicode escape")); 436 return ReportError(CStrVector("Invalid unicode escape"));
446 } 437 }
447 break; 438 break;
448 } 439 }
449 default: 440 default:
450 Advance(); 441 Advance();
451 // If the 'u' flag is present, only syntax characters can be 442 // If the 'u' flag is present, only syntax characters can be
452 // escaped, no 443 // escaped, no
453 // other identity escapes are allowed. If the 'u' flag is not 444 // other identity escapes are allowed. If the 'u' flag is not
454 // present, 445 // present,
455 // all identity escapes are allowed. 446 // all identity escapes are allowed.
456 if (!unicode() || IsSyntaxCharacter(current())) { 447 if (!unicode_ || IsSyntaxCharacter(current())) {
457 builder->AddCharacter(current()); 448 builder->AddCharacter(current());
458 Advance(); 449 Advance();
459 } else { 450 } else {
460 return ReportError(CStrVector("Invalid escape")); 451 return ReportError(CStrVector("Invalid escape"));
461 } 452 }
462 break; 453 break;
463 } 454 }
464 break; 455 break;
465 case '{': { 456 case '{': {
466 int dummy; 457 int dummy;
(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after
747 } 738 }
748 *value = val; 739 *value = val;
749 return true; 740 return true;
750 } 741 }
751 742
752 743
753 bool RegExpParser::ParseUnicodeEscape(uc32* value) { 744 bool RegExpParser::ParseUnicodeEscape(uc32* value) {
754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are 745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are
755 // allowed). In the latter case, the number of hex digits between { } is 746 // allowed). In the latter case, the number of hex digits between { } is
756 // arbitrary. \ and u have already been read. 747 // arbitrary. \ and u have already been read.
757 if (current() == '{' && unicode()) { 748 if (current() == '{' && unicode_) {
758 int start = position(); 749 int start = position();
759 Advance(); 750 Advance();
760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { 751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) {
761 if (current() == '}') { 752 if (current() == '}') {
762 Advance(); 753 Advance();
763 return true; 754 return true;
764 } 755 }
765 } 756 }
766 Reset(start); 757 Reset(start);
767 return false; 758 return false;
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after
842 // For compatibility, we interpret a decimal escape that isn't 833 // For compatibility, we interpret a decimal escape that isn't
843 // a back reference (and therefore either \0 or not valid according 834 // a back reference (and therefore either \0 or not valid according
844 // to the specification) as a 1..3 digit octal character code. 835 // to the specification) as a 1..3 digit octal character code.
845 return ParseOctalLiteral(); 836 return ParseOctalLiteral();
846 case 'x': { 837 case 'x': {
847 Advance(); 838 Advance();
848 uc32 value; 839 uc32 value;
849 if (ParseHexEscape(2, &value)) { 840 if (ParseHexEscape(2, &value)) {
850 return value; 841 return value;
851 } 842 }
852 if (!unicode()) { 843 if (!unicode_) {
853 // If \x is not followed by a two-digit hexadecimal, treat it 844 // If \x is not followed by a two-digit hexadecimal, treat it
854 // as an identity escape. 845 // as an identity escape.
855 return 'x'; 846 return 'x';
856 } 847 }
857 // If the 'u' flag is present, invalid escapes are not treated as 848 // If the 'u' flag is present, invalid escapes are not treated as
858 // identity escapes. 849 // identity escapes.
859 ReportError(CStrVector("Invalid escape")); 850 ReportError(CStrVector("Invalid escape"));
860 return 0; 851 return 0;
861 } 852 }
862 case 'u': { 853 case 'u': {
863 Advance(); 854 Advance();
864 uc32 value; 855 uc32 value;
865 if (ParseUnicodeEscape(&value)) { 856 if (ParseUnicodeEscape(&value)) {
866 return value; 857 return value;
867 } 858 }
868 if (!unicode()) { 859 if (!unicode_) {
869 return 'u'; 860 return 'u';
870 } 861 }
871 // If the 'u' flag is present, invalid escapes are not treated as 862 // If the 'u' flag is present, invalid escapes are not treated as
872 // identity escapes. 863 // identity escapes.
873 ReportError(CStrVector("Invalid unicode escape")); 864 ReportError(CStrVector("Invalid unicode escape"));
874 return 0; 865 return 0;
875 } 866 }
876 default: { 867 default: {
877 uc32 result = current(); 868 uc32 result = current();
878 // If the 'u' flag is present, only syntax characters can be escaped, no 869 // If the 'u' flag is present, only syntax characters can be escaped, no
879 // other identity escapes are allowed. If the 'u' flag is not present, all 870 // other identity escapes are allowed. If the 'u' flag is not present, all
880 // identity escapes are allowed. 871 // identity escapes are allowed.
881 if (!unicode() || IsSyntaxCharacter(result)) { 872 if (!unicode_ || IsSyntaxCharacter(result)) {
882 Advance(); 873 Advance();
883 return result; 874 return result;
884 } 875 }
885 ReportError(CStrVector("Invalid escape")); 876 ReportError(CStrVector("Invalid escape"));
886 return 0; 877 return 0;
887 } 878 }
888 } 879 }
889 return 0; 880 return 0;
890 } 881 }
891 882
892 883
893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { 884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) {
894 DCHECK_EQ(0, *char_class); 885 DCHECK_EQ(0, *char_class);
895 uc32 first = current(); 886 uc32 first = current();
896 if (first == '\\') { 887 if (first == '\\') {
897 switch (Next()) { 888 switch (Next()) {
898 case 'w': 889 case 'w':
899 case 'W': 890 case 'W':
900 case 'd': 891 case 'd':
901 case 'D': 892 case 'D':
902 case 's': 893 case 's':
903 case 'S': { 894 case 'S': {
904 *char_class = Next(); 895 *char_class = Next();
905 Advance(2); 896 Advance(2);
906 return CharacterRange::Singleton(0); // Return dummy value. 897 return CharacterRange::Singleton(0); // Return dummy value.
907 } 898 }
908 case kEndMarker: 899 case kEndMarker:
909 return ReportError(CStrVector("\\ at end of pattern")); 900 return ReportError(CStrVector("\\ at end of pattern"));
910 default: 901 default:
911 first = ParseClassCharacterEscape(CHECK_FAILED); 902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED);
903 return CharacterRange::Singleton(c);
912 } 904 }
913 } else { 905 } else {
914 Advance(); 906 Advance();
907 return CharacterRange::Singleton(first);
915 } 908 }
916
917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) {
918 // Combine with possibly following trail surrogate.
919 int start = position();
920 uc32 second = current();
921 if (second == '\\') {
922 second = ParseClassCharacterEscape(CHECK_FAILED);
923 } else {
924 Advance();
925 }
926 if (unibrow::Utf16::IsTrailSurrogate(second)) {
927 first = unibrow::Utf16::CombineSurrogatePair(first, second);
928 } else {
929 Reset(start);
930 }
931 }
932
933 return CharacterRange::Singleton(first);
934 } 909 }
935 910
936 911
937 static const uc16 kNoCharClass = 0; 912 static const uc16 kNoCharClass = 0;
938 913
939 // Adds range or pre-defined character class to character ranges. 914 // Adds range or pre-defined character class to character ranges.
940 // If char_class is not kInvalidClass, it's interpreted as a class 915 // If char_class is not kInvalidClass, it's interpreted as a class
941 // escape (i.e., 's' means whitespace, from '\s'). 916 // escape (i.e., 's' means whitespace, from '\s').
942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, 917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges,
943 uc16 char_class, CharacterRange range, 918 uc16 char_class, CharacterRange range,
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
1003 is_negated = !is_negated; 978 is_negated = !is_negated;
1004 } 979 }
1005 return new (zone()) RegExpCharacterClass(ranges, is_negated); 980 return new (zone()) RegExpCharacterClass(ranges, is_negated);
1006 } 981 }
1007 982
1008 983
1009 #undef CHECK_FAILED 984 #undef CHECK_FAILED
1010 985
1011 986
1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, 987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone,
1013 FlatStringReader* input, JSRegExp::Flags flags, 988 FlatStringReader* input, bool multiline,
1014 RegExpCompileData* result) { 989 bool unicode, RegExpCompileData* result) {
1015 DCHECK(result != NULL); 990 DCHECK(result != NULL);
1016 RegExpParser parser(input, &result->error, flags, isolate, zone); 991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone);
1017 RegExpTree* tree = parser.ParsePattern(); 992 RegExpTree* tree = parser.ParsePattern();
1018 if (parser.failed()) { 993 if (parser.failed()) {
1019 DCHECK(tree == NULL); 994 DCHECK(tree == NULL);
1020 DCHECK(!result->error.is_null()); 995 DCHECK(!result->error.is_null());
1021 } else { 996 } else {
1022 DCHECK(tree != NULL); 997 DCHECK(tree != NULL);
1023 DCHECK(result->error.is_null()); 998 DCHECK(result->error.is_null());
1024 if (FLAG_trace_regexp_parser) { 999 if (FLAG_trace_regexp_parser) {
1025 OFStream os(stdout); 1000 OFStream os(stdout);
1026 tree->Print(os, zone); 1001 tree->Print(os, zone);
1027 os << "\n"; 1002 os << "\n";
1028 } 1003 }
1029 result->tree = tree; 1004 result->tree = tree;
1030 int capture_count = parser.captures_started(); 1005 int capture_count = parser.captures_started();
1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; 1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0;
1032 result->contains_anchor = parser.contains_anchor(); 1007 result->contains_anchor = parser.contains_anchor();
1033 result->capture_count = capture_count; 1008 result->capture_count = capture_count;
1034 } 1009 }
1035 return !parser.failed(); 1010 return !parser.failed();
1036 } 1011 }
1037 1012
1038 1013
1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) 1014 RegExpBuilder::RegExpBuilder(Zone* zone)
1040 : zone_(zone), 1015 : zone_(zone),
1041 pending_empty_(false), 1016 pending_empty_(false),
1042 flags_(flags),
1043 characters_(NULL), 1017 characters_(NULL),
1044 pending_surrogate_(kNoPendingSurrogate),
1045 terms_(), 1018 terms_(),
1046 alternatives_() 1019 alternatives_()
1047 #ifdef DEBUG 1020 #ifdef DEBUG
1048 , 1021 ,
1049 last_added_(ADD_NONE) 1022 last_added_(ADD_NONE)
1050 #endif 1023 #endif
1051 { 1024 {
1052 } 1025 }
1053 1026
1054 1027
1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) {
1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1057 FlushPendingSurrogate();
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1059 pending_surrogate_ = lead_surrogate;
1060 }
1061
1062
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1065 if (pending_surrogate_ != kNoPendingSurrogate) {
1066 uc16 lead_surrogate = pending_surrogate_;
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1068 ZoneList<uc16> surrogate_pair(2, zone());
1069 surrogate_pair.Add(lead_surrogate, zone());
1070 surrogate_pair.Add(trail_surrogate, zone());
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1072 pending_surrogate_ = kNoPendingSurrogate;
1073 AddAtom(atom);
1074 } else {
1075 pending_surrogate_ = trail_surrogate;
1076 FlushPendingSurrogate();
1077 }
1078 }
1079
1080
1081 void RegExpBuilder::FlushPendingSurrogate() {
1082 if (pending_surrogate_ != kNoPendingSurrogate) {
1083 // Use character class to desugar lone surrogate matching.
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
1085 CharacterRange::List(zone(),
1086 CharacterRange::Singleton(pending_surrogate_)),
1087 false);
1088 pending_surrogate_ = kNoPendingSurrogate;
1089 DCHECK(unicode());
1090 AddCharacterClass(cc);
1091 }
1092 }
1093
1094
1095 void RegExpBuilder::FlushCharacters() { 1028 void RegExpBuilder::FlushCharacters() {
1096 FlushPendingSurrogate();
1097 pending_empty_ = false; 1029 pending_empty_ = false;
1098 if (characters_ != NULL) { 1030 if (characters_ != NULL) {
1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1100 characters_ = NULL; 1032 characters_ = NULL;
1101 text_.Add(atom, zone()); 1033 text_.Add(atom, zone());
1102 LAST(ADD_ATOM); 1034 LAST(ADD_ATOM);
1103 } 1035 }
1104 } 1036 }
1105 1037
1106 1038
1107 void RegExpBuilder::FlushText() { 1039 void RegExpBuilder::FlushText() {
1108 FlushCharacters(); 1040 FlushCharacters();
1109 int num_text = text_.length(); 1041 int num_text = text_.length();
1110 if (num_text == 0) { 1042 if (num_text == 0) {
1111 return; 1043 return;
1112 } else if (num_text == 1) { 1044 } else if (num_text == 1) {
1113 terms_.Add(text_.last(), zone()); 1045 terms_.Add(text_.last(), zone());
1114 } else { 1046 } else {
1115 RegExpText* text = new (zone()) RegExpText(zone()); 1047 RegExpText* text = new (zone()) RegExpText(zone());
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1117 terms_.Add(text, zone()); 1049 terms_.Add(text, zone());
1118 } 1050 }
1119 text_.Clear(); 1051 text_.Clear();
1120 } 1052 }
1121 1053
1122 1054
1123 void RegExpBuilder::AddCharacter(uc16 c) { 1055 void RegExpBuilder::AddCharacter(uc16 c) {
1124 FlushPendingSurrogate();
1125 pending_empty_ = false; 1056 pending_empty_ = false;
1126 if (characters_ == NULL) { 1057 if (characters_ == NULL) {
1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1058 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1128 } 1059 }
1129 characters_->Add(c, zone()); 1060 characters_->Add(c, zone());
1130 LAST(ADD_CHAR); 1061 LAST(ADD_CHAR);
1131 } 1062 }
1132 1063
1133 1064
1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { 1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1136 DCHECK(unicode()); 1067 ZoneList<uc16> surrogate_pair(2, zone());
1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone());
1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone());
1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1140 AddLeadSurrogate(c); 1071 AddAtom(atom);
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1142 AddTrailSurrogate(c);
1143 } else { 1072 } else {
1144 AddCharacter(static_cast<uc16>(c)); 1073 AddCharacter(static_cast<uc16>(c));
1145 } 1074 }
1146 } 1075 }
1147 1076
1148 1077
1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1150 1079
1151 1080
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
1154 // In unicode mode, character class needs to be desugared, so it
1155 // must be a standalone term instead of being part of a RegExpText.
1156 AddTerm(cc);
1157 } else {
1158 AddAtom(cc);
1159 }
1160 }
1161
1162
1163 void RegExpBuilder::AddAtom(RegExpTree* term) { 1081 void RegExpBuilder::AddAtom(RegExpTree* term) {
1164 if (term->IsEmpty()) { 1082 if (term->IsEmpty()) {
1165 AddEmpty(); 1083 AddEmpty();
1166 return; 1084 return;
1167 } 1085 }
1168 if (term->IsTextElement()) { 1086 if (term->IsTextElement()) {
1169 FlushCharacters(); 1087 FlushCharacters();
1170 text_.Add(term, zone()); 1088 text_.Add(term, zone());
1171 } else { 1089 } else {
1172 FlushText(); 1090 FlushText();
1173 terms_.Add(term, zone()); 1091 terms_.Add(term, zone());
1174 } 1092 }
1175 LAST(ADD_ATOM); 1093 LAST(ADD_ATOM);
1176 } 1094 }
1177 1095
1178
1179 void RegExpBuilder::AddTerm(RegExpTree* term) {
1180 FlushText();
1181 terms_.Add(term, zone());
1182 LAST(ADD_ATOM);
1183 }
1184
1185 1096
1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) { 1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) {
1187 FlushText(); 1098 FlushText();
1188 terms_.Add(assert, zone()); 1099 terms_.Add(assert, zone());
1189 LAST(ADD_ASSERT); 1100 LAST(ADD_ASSERT);
1190 } 1101 }
1191 1102
1192 1103
1193 void RegExpBuilder::NewAlternative() { FlushTerms(); } 1104 void RegExpBuilder::NewAlternative() { FlushTerms(); }
1194 1105
(...skipping 19 matching lines...) Expand all
1214 FlushTerms(); 1125 FlushTerms();
1215 int num_alternatives = alternatives_.length(); 1126 int num_alternatives = alternatives_.length();
1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1217 if (num_alternatives == 1) return alternatives_.last(); 1128 if (num_alternatives == 1) return alternatives_.last();
1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1219 } 1130 }
1220 1131
1221 1132
1222 void RegExpBuilder::AddQuantifierToAtom( 1133 void RegExpBuilder::AddQuantifierToAtom(
1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { 1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) {
1224 FlushPendingSurrogate();
1225 if (pending_empty_) { 1135 if (pending_empty_) {
1226 pending_empty_ = false; 1136 pending_empty_ = false;
1227 return; 1137 return;
1228 } 1138 }
1229 RegExpTree* atom; 1139 RegExpTree* atom;
1230 if (characters_ != NULL) { 1140 if (characters_ != NULL) {
1231 DCHECK(last_added_ == ADD_CHAR); 1141 DCHECK(last_added_ == ADD_CHAR);
1232 // Last atom was character. 1142 // Last atom was character.
1233 Vector<const uc16> char_vector = characters_->ToConstVector(); 1143 Vector<const uc16> char_vector = characters_->ToConstVector();
1234 int num_chars = char_vector.length(); 1144 int num_chars = char_vector.length();
(...skipping 26 matching lines...) Expand all
1261 UNREACHABLE(); 1171 UNREACHABLE();
1262 return; 1172 return;
1263 } 1173 }
1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1265 zone()); 1175 zone());
1266 LAST(ADD_TERM); 1176 LAST(ADD_TERM);
1267 } 1177 }
1268 1178
1269 } // namespace internal 1179 } // namespace internal
1270 } // namespace v8 1180 } // namespace v8
OLDNEW
« no previous file with comments | « src/regexp/regexp-parser.h ('k') | test/cctest/test-regexp.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698