OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
12 #include "src/utils.h" | 12 #include "src/utils.h" |
13 | 13 |
14 namespace v8 { | 14 namespace v8 { |
15 namespace internal { | 15 namespace internal { |
16 | 16 |
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
18 bool multiline, bool unicode, Isolate* isolate, | 18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
19 Zone* zone) | |
20 : isolate_(isolate), | 19 : isolate_(isolate), |
21 zone_(zone), | 20 zone_(zone), |
22 error_(error), | 21 error_(error), |
23 captures_(NULL), | 22 captures_(NULL), |
24 in_(in), | 23 in_(in), |
25 current_(kEndMarker), | 24 current_(kEndMarker), |
| 25 flags_(flags), |
26 next_pos_(0), | 26 next_pos_(0), |
27 captures_started_(0), | 27 captures_started_(0), |
28 capture_count_(0), | 28 capture_count_(0), |
29 has_more_(true), | 29 has_more_(true), |
30 multiline_(multiline), | |
31 unicode_(unicode), | |
32 simple_(false), | 30 simple_(false), |
33 contains_anchor_(false), | 31 contains_anchor_(false), |
34 is_scanned_for_captures_(false), | 32 is_scanned_for_captures_(false), |
35 failed_(false) { | 33 failed_(false) { |
36 Advance(); | 34 Advance(); |
37 } | 35 } |
38 | 36 |
39 | 37 |
| 38 template <bool update_position> |
| 39 uc32 RegExpParser::ReadNext() { |
| 40 int position = next_pos_; |
| 41 uc32 c0 = in()->Get(position); |
| 42 position++; |
| 43 // Read the whole surrogate pair in case of unicode flag, if possible. |
| 44 if (unicode() && position < in()->length() && |
| 45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { |
| 46 uc16 c1 = in()->Get(position); |
| 47 if (unibrow::Utf16::IsTrailSurrogate(c1)) { |
| 48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); |
| 49 position++; |
| 50 } |
| 51 } |
| 52 if (update_position) next_pos_ = position; |
| 53 return c0; |
| 54 } |
| 55 |
| 56 |
40 uc32 RegExpParser::Next() { | 57 uc32 RegExpParser::Next() { |
41 if (has_next()) { | 58 if (has_next()) { |
42 return in()->Get(next_pos_); | 59 return ReadNext<false>(); |
43 } else { | 60 } else { |
44 return kEndMarker; | 61 return kEndMarker; |
45 } | 62 } |
46 } | 63 } |
47 | 64 |
48 | 65 |
49 void RegExpParser::Advance() { | 66 void RegExpParser::Advance() { |
50 if (next_pos_ < in()->length()) { | 67 if (has_next()) { |
51 StackLimitCheck check(isolate()); | 68 StackLimitCheck check(isolate()); |
52 if (check.HasOverflowed()) { | 69 if (check.HasOverflowed()) { |
53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); | 70 ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
54 } else if (zone()->excess_allocation()) { | 71 } else if (zone()->excess_allocation()) { |
55 ReportError(CStrVector("Regular expression too large")); | 72 ReportError(CStrVector("Regular expression too large")); |
56 } else { | 73 } else { |
57 current_ = in()->Get(next_pos_); | 74 current_ = ReadNext<true>(); |
58 next_pos_++; | |
59 // Read the whole surrogate pair in case of unicode flag, if possible. | |
60 if (unicode_ && next_pos_ < in()->length() && | |
61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { | |
62 uc16 trail = in()->Get(next_pos_); | |
63 if (unibrow::Utf16::IsTrailSurrogate(trail)) { | |
64 current_ = unibrow::Utf16::CombineSurrogatePair( | |
65 static_cast<uc16>(current_), trail); | |
66 next_pos_++; | |
67 } | |
68 } | |
69 } | 75 } |
70 } else { | 76 } else { |
71 current_ = kEndMarker; | 77 current_ = kEndMarker; |
72 // Advance so that position() points to 1-after-the-last-character. This is | 78 // Advance so that position() points to 1-after-the-last-character. This is |
73 // important so that Reset() to this position works correctly. | 79 // important so that Reset() to this position works correctly. |
74 next_pos_ = in()->length() + 1; | 80 next_pos_ = in()->length() + 1; |
75 has_more_ = false; | 81 has_more_ = false; |
76 } | 82 } |
77 } | 83 } |
78 | 84 |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
135 // Alternative :: | 141 // Alternative :: |
136 // [empty] | 142 // [empty] |
137 // Term Alternative | 143 // Term Alternative |
138 // Term :: | 144 // Term :: |
139 // Assertion | 145 // Assertion |
140 // Atom | 146 // Atom |
141 // Atom Quantifier | 147 // Atom Quantifier |
142 RegExpTree* RegExpParser::ParseDisjunction() { | 148 RegExpTree* RegExpParser::ParseDisjunction() { |
143 // Used to store current state while parsing subexpressions. | 149 // Used to store current state while parsing subexpressions. |
144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
145 zone()); | 151 flags_, zone()); |
146 RegExpParserState* state = &initial_state; | 152 RegExpParserState* state = &initial_state; |
147 // Cache the builder in a local variable for quick access. | 153 // Cache the builder in a local variable for quick access. |
148 RegExpBuilder* builder = initial_state.builder(); | 154 RegExpBuilder* builder = initial_state.builder(); |
149 while (true) { | 155 while (true) { |
150 switch (current()) { | 156 switch (current()) { |
151 case kEndMarker: | 157 case kEndMarker: |
152 if (state->IsSubexpression()) { | 158 if (state->IsSubexpression()) { |
153 // Inside a parenthesized group when hitting end of input. | 159 // Inside a parenthesized group when hitting end of input. |
154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | 160 ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
155 } | 161 } |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
199 Advance(); | 205 Advance(); |
200 builder->NewAlternative(); | 206 builder->NewAlternative(); |
201 continue; | 207 continue; |
202 } | 208 } |
203 case '*': | 209 case '*': |
204 case '+': | 210 case '+': |
205 case '?': | 211 case '?': |
206 return ReportError(CStrVector("Nothing to repeat")); | 212 return ReportError(CStrVector("Nothing to repeat")); |
207 case '^': { | 213 case '^': { |
208 Advance(); | 214 Advance(); |
209 if (multiline_) { | 215 if (multiline()) { |
210 builder->AddAssertion( | 216 builder->AddAssertion( |
211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | 217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
212 } else { | 218 } else { |
213 builder->AddAssertion( | 219 builder->AddAssertion( |
214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); | 220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
215 set_contains_anchor(); | 221 set_contains_anchor(); |
216 } | 222 } |
217 continue; | 223 continue; |
218 } | 224 } |
219 case '$': { | 225 case '$': { |
220 Advance(); | 226 Advance(); |
221 RegExpAssertion::AssertionType assertion_type = | 227 RegExpAssertion::AssertionType assertion_type = |
222 multiline_ ? RegExpAssertion::END_OF_LINE | 228 multiline() ? RegExpAssertion::END_OF_LINE |
223 : RegExpAssertion::END_OF_INPUT; | 229 : RegExpAssertion::END_OF_INPUT; |
224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); | 230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
225 continue; | 231 continue; |
226 } | 232 } |
227 case '.': { | 233 case '.': { |
228 Advance(); | 234 Advance(); |
229 // everything except \x0a, \x0d, \u2028 and \u2029 | 235 // everything except \x0a, \x0d, \u2028 and \u2029 |
230 ZoneList<CharacterRange>* ranges = | 236 ZoneList<CharacterRange>* ranges = |
231 new (zone()) ZoneList<CharacterRange>(2, zone()); | 237 new (zone()) ZoneList<CharacterRange>(2, zone()); |
232 CharacterRange::AddClassEscape('.', ranges, zone()); | 238 CharacterRange::AddClassEscape('.', ranges, zone()); |
233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 239 RegExpCharacterClass* cc = |
234 builder->AddAtom(atom); | 240 new (zone()) RegExpCharacterClass(ranges, false); |
| 241 builder->AddCharacterClass(cc); |
235 break; | 242 break; |
236 } | 243 } |
237 case '(': { | 244 case '(': { |
238 SubexpressionType subexpr_type = CAPTURE; | 245 SubexpressionType subexpr_type = CAPTURE; |
239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); | 246 RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
240 Advance(); | 247 Advance(); |
241 if (current() == '?') { | 248 if (current() == '?') { |
242 switch (Next()) { | 249 switch (Next()) { |
243 case ':': | 250 case ':': |
244 subexpr_type = GROUPING; | 251 subexpr_type = GROUPING; |
(...skipping 24 matching lines...) Expand all Loading... |
269 break; | 276 break; |
270 } | 277 } |
271 Advance(2); | 278 Advance(2); |
272 } else { | 279 } else { |
273 if (captures_started_ >= kMaxCaptures) { | 280 if (captures_started_ >= kMaxCaptures) { |
274 ReportError(CStrVector("Too many captures") CHECK_FAILED); | 281 ReportError(CStrVector("Too many captures") CHECK_FAILED); |
275 } | 282 } |
276 captures_started_++; | 283 captures_started_++; |
277 } | 284 } |
278 // Store current state and begin new disjunction parsing. | 285 // Store current state and begin new disjunction parsing. |
279 state = new (zone()) RegExpParserState( | 286 state = |
280 state, subexpr_type, lookaround_type, captures_started_, zone()); | 287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, |
| 288 captures_started_, flags_, zone()); |
281 builder = state->builder(); | 289 builder = state->builder(); |
282 continue; | 290 continue; |
283 } | 291 } |
284 case '[': { | 292 case '[': { |
285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); | 293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); |
286 builder->AddAtom(atom); | 294 builder->AddCharacterClass(cc->AsCharacterClass()); |
287 break; | 295 break; |
288 } | 296 } |
289 // Atom :: | 297 // Atom :: |
290 // \ AtomEscape | 298 // \ AtomEscape |
291 case '\\': | 299 case '\\': |
292 switch (Next()) { | 300 switch (Next()) { |
293 case kEndMarker: | 301 case kEndMarker: |
294 return ReportError(CStrVector("\\ at end of pattern")); | 302 return ReportError(CStrVector("\\ at end of pattern")); |
295 case 'b': | 303 case 'b': |
296 Advance(2); | 304 Advance(2); |
(...skipping 14 matching lines...) Expand all Loading... |
311 case 'D': | 319 case 'D': |
312 case 's': | 320 case 's': |
313 case 'S': | 321 case 'S': |
314 case 'w': | 322 case 'w': |
315 case 'W': { | 323 case 'W': { |
316 uc32 c = Next(); | 324 uc32 c = Next(); |
317 Advance(2); | 325 Advance(2); |
318 ZoneList<CharacterRange>* ranges = | 326 ZoneList<CharacterRange>* ranges = |
319 new (zone()) ZoneList<CharacterRange>(2, zone()); | 327 new (zone()) ZoneList<CharacterRange>(2, zone()); |
320 CharacterRange::AddClassEscape(c, ranges, zone()); | 328 CharacterRange::AddClassEscape(c, ranges, zone()); |
321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); | 329 RegExpCharacterClass* cc = |
322 builder->AddAtom(atom); | 330 new (zone()) RegExpCharacterClass(ranges, false); |
| 331 builder->AddCharacterClass(cc); |
323 break; | 332 break; |
324 } | 333 } |
325 case '1': | 334 case '1': |
326 case '2': | 335 case '2': |
327 case '3': | 336 case '3': |
328 case '4': | 337 case '4': |
329 case '5': | 338 case '5': |
330 case '6': | 339 case '6': |
331 case '7': | 340 case '7': |
332 case '8': | 341 case '8': |
(...skipping 13 matching lines...) Expand all Loading... |
346 builder->AddAtom(atom); | 355 builder->AddAtom(atom); |
347 } | 356 } |
348 break; | 357 break; |
349 } | 358 } |
350 uc32 first_digit = Next(); | 359 uc32 first_digit = Next(); |
351 if (first_digit == '8' || first_digit == '9') { | 360 if (first_digit == '8' || first_digit == '9') { |
352 // If the 'u' flag is present, only syntax characters can be | 361 // If the 'u' flag is present, only syntax characters can be |
353 // escaped, | 362 // escaped, |
354 // no other identity escapes are allowed. If the 'u' flag is not | 363 // no other identity escapes are allowed. If the 'u' flag is not |
355 // present, all identity escapes are allowed. | 364 // present, all identity escapes are allowed. |
356 if (!unicode_) { | 365 if (!unicode()) { |
357 builder->AddCharacter(first_digit); | 366 builder->AddCharacter(first_digit); |
358 Advance(2); | 367 Advance(2); |
359 } else { | 368 } else { |
360 return ReportError(CStrVector("Invalid escape")); | 369 return ReportError(CStrVector("Invalid escape")); |
361 } | 370 } |
362 break; | 371 break; |
363 } | 372 } |
364 } | 373 } |
365 // FALLTHROUGH | 374 // FALLTHROUGH |
366 case '0': { | 375 case '0': { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
407 Advance(2); | 416 Advance(2); |
408 builder->AddCharacter(controlLetter & 0x1f); | 417 builder->AddCharacter(controlLetter & 0x1f); |
409 } | 418 } |
410 break; | 419 break; |
411 } | 420 } |
412 case 'x': { | 421 case 'x': { |
413 Advance(2); | 422 Advance(2); |
414 uc32 value; | 423 uc32 value; |
415 if (ParseHexEscape(2, &value)) { | 424 if (ParseHexEscape(2, &value)) { |
416 builder->AddCharacter(value); | 425 builder->AddCharacter(value); |
417 } else if (!unicode_) { | 426 } else if (!unicode()) { |
418 builder->AddCharacter('x'); | 427 builder->AddCharacter('x'); |
419 } else { | 428 } else { |
420 // If the 'u' flag is present, invalid escapes are not treated as | 429 // If the 'u' flag is present, invalid escapes are not treated as |
421 // identity escapes. | 430 // identity escapes. |
422 return ReportError(CStrVector("Invalid escape")); | 431 return ReportError(CStrVector("Invalid escape")); |
423 } | 432 } |
424 break; | 433 break; |
425 } | 434 } |
426 case 'u': { | 435 case 'u': { |
427 Advance(2); | 436 Advance(2); |
428 uc32 value; | 437 uc32 value; |
429 if (ParseUnicodeEscape(&value)) { | 438 if (ParseUnicodeEscape(&value)) { |
430 builder->AddUnicodeCharacter(value); | 439 builder->AddUnicodeCharacter(value); |
431 } else if (!unicode_) { | 440 } else if (!unicode()) { |
432 builder->AddCharacter('u'); | 441 builder->AddCharacter('u'); |
433 } else { | 442 } else { |
434 // If the 'u' flag is present, invalid escapes are not treated as | 443 // If the 'u' flag is present, invalid escapes are not treated as |
435 // identity escapes. | 444 // identity escapes. |
436 return ReportError(CStrVector("Invalid unicode escape")); | 445 return ReportError(CStrVector("Invalid unicode escape")); |
437 } | 446 } |
438 break; | 447 break; |
439 } | 448 } |
440 default: | 449 default: |
441 Advance(); | 450 Advance(); |
442 // If the 'u' flag is present, only syntax characters can be | 451 // If the 'u' flag is present, only syntax characters can be |
443 // escaped, no | 452 // escaped, no |
444 // other identity escapes are allowed. If the 'u' flag is not | 453 // other identity escapes are allowed. If the 'u' flag is not |
445 // present, | 454 // present, |
446 // all identity escapes are allowed. | 455 // all identity escapes are allowed. |
447 if (!unicode_ || IsSyntaxCharacter(current())) { | 456 if (!unicode() || IsSyntaxCharacter(current())) { |
448 builder->AddCharacter(current()); | 457 builder->AddCharacter(current()); |
449 Advance(); | 458 Advance(); |
450 } else { | 459 } else { |
451 return ReportError(CStrVector("Invalid escape")); | 460 return ReportError(CStrVector("Invalid escape")); |
452 } | 461 } |
453 break; | 462 break; |
454 } | 463 } |
455 break; | 464 break; |
456 case '{': { | 465 case '{': { |
457 int dummy; | 466 int dummy; |
(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
738 } | 747 } |
739 *value = val; | 748 *value = val; |
740 return true; | 749 return true; |
741 } | 750 } |
742 | 751 |
743 | 752 |
744 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 753 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
746 // allowed). In the latter case, the number of hex digits between { } is | 755 // allowed). In the latter case, the number of hex digits between { } is |
747 // arbitrary. \ and u have already been read. | 756 // arbitrary. \ and u have already been read. |
748 if (current() == '{' && unicode_) { | 757 if (current() == '{' && unicode()) { |
749 int start = position(); | 758 int start = position(); |
750 Advance(); | 759 Advance(); |
751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
752 if (current() == '}') { | 761 if (current() == '}') { |
753 Advance(); | 762 Advance(); |
754 return true; | 763 return true; |
755 } | 764 } |
756 } | 765 } |
757 Reset(start); | 766 Reset(start); |
758 return false; | 767 return false; |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
833 // For compatibility, we interpret a decimal escape that isn't | 842 // For compatibility, we interpret a decimal escape that isn't |
834 // a back reference (and therefore either \0 or not valid according | 843 // a back reference (and therefore either \0 or not valid according |
835 // to the specification) as a 1..3 digit octal character code. | 844 // to the specification) as a 1..3 digit octal character code. |
836 return ParseOctalLiteral(); | 845 return ParseOctalLiteral(); |
837 case 'x': { | 846 case 'x': { |
838 Advance(); | 847 Advance(); |
839 uc32 value; | 848 uc32 value; |
840 if (ParseHexEscape(2, &value)) { | 849 if (ParseHexEscape(2, &value)) { |
841 return value; | 850 return value; |
842 } | 851 } |
843 if (!unicode_) { | 852 if (!unicode()) { |
844 // If \x is not followed by a two-digit hexadecimal, treat it | 853 // If \x is not followed by a two-digit hexadecimal, treat it |
845 // as an identity escape. | 854 // as an identity escape. |
846 return 'x'; | 855 return 'x'; |
847 } | 856 } |
848 // If the 'u' flag is present, invalid escapes are not treated as | 857 // If the 'u' flag is present, invalid escapes are not treated as |
849 // identity escapes. | 858 // identity escapes. |
850 ReportError(CStrVector("Invalid escape")); | 859 ReportError(CStrVector("Invalid escape")); |
851 return 0; | 860 return 0; |
852 } | 861 } |
853 case 'u': { | 862 case 'u': { |
854 Advance(); | 863 Advance(); |
855 uc32 value; | 864 uc32 value; |
856 if (ParseUnicodeEscape(&value)) { | 865 if (ParseUnicodeEscape(&value)) { |
857 return value; | 866 return value; |
858 } | 867 } |
859 if (!unicode_) { | 868 if (!unicode()) { |
860 return 'u'; | 869 return 'u'; |
861 } | 870 } |
862 // If the 'u' flag is present, invalid escapes are not treated as | 871 // If the 'u' flag is present, invalid escapes are not treated as |
863 // identity escapes. | 872 // identity escapes. |
864 ReportError(CStrVector("Invalid unicode escape")); | 873 ReportError(CStrVector("Invalid unicode escape")); |
865 return 0; | 874 return 0; |
866 } | 875 } |
867 default: { | 876 default: { |
868 uc32 result = current(); | 877 uc32 result = current(); |
869 // If the 'u' flag is present, only syntax characters can be escaped, no | 878 // If the 'u' flag is present, only syntax characters can be escaped, no |
870 // other identity escapes are allowed. If the 'u' flag is not present, all | 879 // other identity escapes are allowed. If the 'u' flag is not present, all |
871 // identity escapes are allowed. | 880 // identity escapes are allowed. |
872 if (!unicode_ || IsSyntaxCharacter(result)) { | 881 if (!unicode() || IsSyntaxCharacter(result)) { |
873 Advance(); | 882 Advance(); |
874 return result; | 883 return result; |
875 } | 884 } |
876 ReportError(CStrVector("Invalid escape")); | 885 ReportError(CStrVector("Invalid escape")); |
877 return 0; | 886 return 0; |
878 } | 887 } |
879 } | 888 } |
880 return 0; | 889 return 0; |
881 } | 890 } |
882 | 891 |
883 | 892 |
884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | 893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
885 DCHECK_EQ(0, *char_class); | 894 DCHECK_EQ(0, *char_class); |
886 uc32 first = current(); | 895 uc32 first = current(); |
887 if (first == '\\') { | 896 if (first == '\\') { |
888 switch (Next()) { | 897 switch (Next()) { |
889 case 'w': | 898 case 'w': |
890 case 'W': | 899 case 'W': |
891 case 'd': | 900 case 'd': |
892 case 'D': | 901 case 'D': |
893 case 's': | 902 case 's': |
894 case 'S': { | 903 case 'S': { |
895 *char_class = Next(); | 904 *char_class = Next(); |
896 Advance(2); | 905 Advance(2); |
897 return CharacterRange::Singleton(0); // Return dummy value. | 906 return CharacterRange::Singleton(0); // Return dummy value. |
898 } | 907 } |
899 case kEndMarker: | 908 case kEndMarker: |
900 return ReportError(CStrVector("\\ at end of pattern")); | 909 return ReportError(CStrVector("\\ at end of pattern")); |
901 default: | 910 default: |
902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); | 911 first = ParseClassCharacterEscape(CHECK_FAILED); |
903 return CharacterRange::Singleton(c); | |
904 } | 912 } |
905 } else { | 913 } else { |
906 Advance(); | 914 Advance(); |
907 return CharacterRange::Singleton(first); | |
908 } | 915 } |
| 916 |
| 917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { |
| 918 // Combine with possibly following trail surrogate. |
| 919 int start = position(); |
| 920 uc32 second = current(); |
| 921 if (second == '\\') { |
| 922 second = ParseClassCharacterEscape(CHECK_FAILED); |
| 923 } else { |
| 924 Advance(); |
| 925 } |
| 926 if (unibrow::Utf16::IsTrailSurrogate(second)) { |
| 927 first = unibrow::Utf16::CombineSurrogatePair(first, second); |
| 928 } else { |
| 929 Reset(start); |
| 930 } |
| 931 } |
| 932 |
| 933 return CharacterRange::Singleton(first); |
909 } | 934 } |
910 | 935 |
911 | 936 |
912 static const uc16 kNoCharClass = 0; | 937 static const uc16 kNoCharClass = 0; |
913 | 938 |
914 // Adds range or pre-defined character class to character ranges. | 939 // Adds range or pre-defined character class to character ranges. |
915 // If char_class is not kInvalidClass, it's interpreted as a class | 940 // If char_class is not kInvalidClass, it's interpreted as a class |
916 // escape (i.e., 's' means whitespace, from '\s'). | 941 // escape (i.e., 's' means whitespace, from '\s'). |
917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
918 uc16 char_class, CharacterRange range, | 943 uc16 char_class, CharacterRange range, |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
978 is_negated = !is_negated; | 1003 is_negated = !is_negated; |
979 } | 1004 } |
980 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 1005 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
981 } | 1006 } |
982 | 1007 |
983 | 1008 |
984 #undef CHECK_FAILED | 1009 #undef CHECK_FAILED |
985 | 1010 |
986 | 1011 |
987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
988 FlatStringReader* input, bool multiline, | 1013 FlatStringReader* input, JSRegExp::Flags flags, |
989 bool unicode, RegExpCompileData* result) { | 1014 RegExpCompileData* result) { |
990 DCHECK(result != NULL); | 1015 DCHECK(result != NULL); |
991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); | 1016 RegExpParser parser(input, &result->error, flags, isolate, zone); |
992 RegExpTree* tree = parser.ParsePattern(); | 1017 RegExpTree* tree = parser.ParsePattern(); |
993 if (parser.failed()) { | 1018 if (parser.failed()) { |
994 DCHECK(tree == NULL); | 1019 DCHECK(tree == NULL); |
995 DCHECK(!result->error.is_null()); | 1020 DCHECK(!result->error.is_null()); |
996 } else { | 1021 } else { |
997 DCHECK(tree != NULL); | 1022 DCHECK(tree != NULL); |
998 DCHECK(result->error.is_null()); | 1023 DCHECK(result->error.is_null()); |
999 if (FLAG_trace_regexp_parser) { | 1024 if (FLAG_trace_regexp_parser) { |
1000 OFStream os(stdout); | 1025 OFStream os(stdout); |
1001 tree->Print(os, zone); | 1026 tree->Print(os, zone); |
1002 os << "\n"; | 1027 os << "\n"; |
1003 } | 1028 } |
1004 result->tree = tree; | 1029 result->tree = tree; |
1005 int capture_count = parser.captures_started(); | 1030 int capture_count = parser.captures_started(); |
1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; | 1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
1007 result->contains_anchor = parser.contains_anchor(); | 1032 result->contains_anchor = parser.contains_anchor(); |
1008 result->capture_count = capture_count; | 1033 result->capture_count = capture_count; |
1009 } | 1034 } |
1010 return !parser.failed(); | 1035 return !parser.failed(); |
1011 } | 1036 } |
1012 | 1037 |
1013 | 1038 |
1014 RegExpBuilder::RegExpBuilder(Zone* zone) | 1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) |
1015 : zone_(zone), | 1040 : zone_(zone), |
1016 pending_empty_(false), | 1041 pending_empty_(false), |
| 1042 flags_(flags), |
1017 characters_(NULL), | 1043 characters_(NULL), |
| 1044 pending_surrogate_(kNoPendingSurrogate), |
1018 terms_(), | 1045 terms_(), |
1019 alternatives_() | 1046 alternatives_() |
1020 #ifdef DEBUG | 1047 #ifdef DEBUG |
1021 , | 1048 , |
1022 last_added_(ADD_NONE) | 1049 last_added_(ADD_NONE) |
1023 #endif | 1050 #endif |
1024 { | 1051 { |
1025 } | 1052 } |
1026 | 1053 |
1027 | 1054 |
| 1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { |
| 1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1057 FlushPendingSurrogate(); |
| 1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
| 1059 pending_surrogate_ = lead_surrogate; |
| 1060 } |
| 1061 |
| 1062 |
| 1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| 1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| 1065 if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1066 uc16 lead_surrogate = pending_surrogate_; |
| 1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| 1068 ZoneList<uc16> surrogate_pair(2, zone()); |
| 1069 surrogate_pair.Add(lead_surrogate, zone()); |
| 1070 surrogate_pair.Add(trail_surrogate, zone()); |
| 1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| 1072 pending_surrogate_ = kNoPendingSurrogate; |
| 1073 AddAtom(atom); |
| 1074 } else { |
| 1075 pending_surrogate_ = trail_surrogate; |
| 1076 FlushPendingSurrogate(); |
| 1077 } |
| 1078 } |
| 1079 |
| 1080 |
| 1081 void RegExpBuilder::FlushPendingSurrogate() { |
| 1082 if (pending_surrogate_ != kNoPendingSurrogate) { |
| 1083 // Use character class to desugar lone surrogate matching. |
| 1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
| 1085 CharacterRange::List(zone(), |
| 1086 CharacterRange::Singleton(pending_surrogate_)), |
| 1087 false); |
| 1088 pending_surrogate_ = kNoPendingSurrogate; |
| 1089 DCHECK(unicode()); |
| 1090 AddCharacterClass(cc); |
| 1091 } |
| 1092 } |
| 1093 |
| 1094 |
1028 void RegExpBuilder::FlushCharacters() { | 1095 void RegExpBuilder::FlushCharacters() { |
| 1096 FlushPendingSurrogate(); |
1029 pending_empty_ = false; | 1097 pending_empty_ = false; |
1030 if (characters_ != NULL) { | 1098 if (characters_ != NULL) { |
1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
1032 characters_ = NULL; | 1100 characters_ = NULL; |
1033 text_.Add(atom, zone()); | 1101 text_.Add(atom, zone()); |
1034 LAST(ADD_ATOM); | 1102 LAST(ADD_ATOM); |
1035 } | 1103 } |
1036 } | 1104 } |
1037 | 1105 |
1038 | 1106 |
1039 void RegExpBuilder::FlushText() { | 1107 void RegExpBuilder::FlushText() { |
1040 FlushCharacters(); | 1108 FlushCharacters(); |
1041 int num_text = text_.length(); | 1109 int num_text = text_.length(); |
1042 if (num_text == 0) { | 1110 if (num_text == 0) { |
1043 return; | 1111 return; |
1044 } else if (num_text == 1) { | 1112 } else if (num_text == 1) { |
1045 terms_.Add(text_.last(), zone()); | 1113 terms_.Add(text_.last(), zone()); |
1046 } else { | 1114 } else { |
1047 RegExpText* text = new (zone()) RegExpText(zone()); | 1115 RegExpText* text = new (zone()) RegExpText(zone()); |
1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
1049 terms_.Add(text, zone()); | 1117 terms_.Add(text, zone()); |
1050 } | 1118 } |
1051 text_.Clear(); | 1119 text_.Clear(); |
1052 } | 1120 } |
1053 | 1121 |
1054 | 1122 |
1055 void RegExpBuilder::AddCharacter(uc16 c) { | 1123 void RegExpBuilder::AddCharacter(uc16 c) { |
| 1124 FlushPendingSurrogate(); |
1056 pending_empty_ = false; | 1125 pending_empty_ = false; |
1057 if (characters_ == NULL) { | 1126 if (characters_ == NULL) { |
1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
1059 } | 1128 } |
1060 characters_->Add(c, zone()); | 1129 characters_->Add(c, zone()); |
1061 LAST(ADD_CHAR); | 1130 LAST(ADD_CHAR); |
1062 } | 1131 } |
1063 | 1132 |
1064 | 1133 |
1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
1067 ZoneList<uc16> surrogate_pair(2, zone()); | 1136 DCHECK(unicode()); |
1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); | 1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); | 1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1071 AddAtom(atom); | 1140 AddLeadSurrogate(c); |
| 1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
| 1142 AddTrailSurrogate(c); |
1072 } else { | 1143 } else { |
1073 AddCharacter(static_cast<uc16>(c)); | 1144 AddCharacter(static_cast<uc16>(c)); |
1074 } | 1145 } |
1075 } | 1146 } |
1076 | 1147 |
1077 | 1148 |
1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1079 | 1150 |
1080 | 1151 |
| 1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| 1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
| 1154 // In unicode mode, character class needs to be desugared, so it |
| 1155 // must be a standalone term instead of being part of a RegExpText. |
| 1156 AddTerm(cc); |
| 1157 } else { |
| 1158 AddAtom(cc); |
| 1159 } |
| 1160 } |
| 1161 |
| 1162 |
1081 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1163 void RegExpBuilder::AddAtom(RegExpTree* term) { |
1082 if (term->IsEmpty()) { | 1164 if (term->IsEmpty()) { |
1083 AddEmpty(); | 1165 AddEmpty(); |
1084 return; | 1166 return; |
1085 } | 1167 } |
1086 if (term->IsTextElement()) { | 1168 if (term->IsTextElement()) { |
1087 FlushCharacters(); | 1169 FlushCharacters(); |
1088 text_.Add(term, zone()); | 1170 text_.Add(term, zone()); |
1089 } else { | 1171 } else { |
1090 FlushText(); | 1172 FlushText(); |
1091 terms_.Add(term, zone()); | 1173 terms_.Add(term, zone()); |
1092 } | 1174 } |
1093 LAST(ADD_ATOM); | 1175 LAST(ADD_ATOM); |
1094 } | 1176 } |
1095 | 1177 |
1096 | 1178 |
| 1179 void RegExpBuilder::AddTerm(RegExpTree* term) { |
| 1180 FlushText(); |
| 1181 terms_.Add(term, zone()); |
| 1182 LAST(ADD_ATOM); |
| 1183 } |
| 1184 |
| 1185 |
1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | 1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
1098 FlushText(); | 1187 FlushText(); |
1099 terms_.Add(assert, zone()); | 1188 terms_.Add(assert, zone()); |
1100 LAST(ADD_ASSERT); | 1189 LAST(ADD_ASSERT); |
1101 } | 1190 } |
1102 | 1191 |
1103 | 1192 |
1104 void RegExpBuilder::NewAlternative() { FlushTerms(); } | 1193 void RegExpBuilder::NewAlternative() { FlushTerms(); } |
1105 | 1194 |
1106 | 1195 |
(...skipping 18 matching lines...) Expand all Loading... |
1125 FlushTerms(); | 1214 FlushTerms(); |
1126 int num_alternatives = alternatives_.length(); | 1215 int num_alternatives = alternatives_.length(); |
1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1128 if (num_alternatives == 1) return alternatives_.last(); | 1217 if (num_alternatives == 1) return alternatives_.last(); |
1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1130 } | 1219 } |
1131 | 1220 |
1132 | 1221 |
1133 void RegExpBuilder::AddQuantifierToAtom( | 1222 void RegExpBuilder::AddQuantifierToAtom( |
1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
| 1224 FlushPendingSurrogate(); |
1135 if (pending_empty_) { | 1225 if (pending_empty_) { |
1136 pending_empty_ = false; | 1226 pending_empty_ = false; |
1137 return; | 1227 return; |
1138 } | 1228 } |
1139 RegExpTree* atom; | 1229 RegExpTree* atom; |
1140 if (characters_ != NULL) { | 1230 if (characters_ != NULL) { |
1141 DCHECK(last_added_ == ADD_CHAR); | 1231 DCHECK(last_added_ == ADD_CHAR); |
1142 // Last atom was character. | 1232 // Last atom was character. |
1143 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1233 Vector<const uc16> char_vector = characters_->ToConstVector(); |
1144 int num_chars = char_vector.length(); | 1234 int num_chars = char_vector.length(); |
(...skipping 26 matching lines...) Expand all Loading... |
1171 UNREACHABLE(); | 1261 UNREACHABLE(); |
1172 return; | 1262 return; |
1173 } | 1263 } |
1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1175 zone()); | 1265 zone()); |
1176 LAST(ADD_TERM); | 1266 LAST(ADD_TERM); |
1177 } | 1267 } |
1178 | 1268 |
1179 } // namespace internal | 1269 } // namespace internal |
1180 } // namespace v8 | 1270 } // namespace v8 |
OLD | NEW |