OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
12 #include "src/utils.h" | 12 #include "src/utils.h" |
13 | 13 |
14 namespace v8 { | 14 namespace v8 { |
15 namespace internal { | 15 namespace internal { |
16 | 16 |
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) | 18 bool multiline, bool unicode, Isolate* isolate, |
| 19 Zone* zone) |
19 : isolate_(isolate), | 20 : isolate_(isolate), |
20 zone_(zone), | 21 zone_(zone), |
21 error_(error), | 22 error_(error), |
22 captures_(NULL), | 23 captures_(NULL), |
23 in_(in), | 24 in_(in), |
24 current_(kEndMarker), | 25 current_(kEndMarker), |
25 flags_(flags), | |
26 next_pos_(0), | 26 next_pos_(0), |
27 captures_started_(0), | 27 captures_started_(0), |
28 capture_count_(0), | 28 capture_count_(0), |
29 has_more_(true), | 29 has_more_(true), |
| 30 multiline_(multiline), |
| 31 unicode_(unicode), |
30 simple_(false), | 32 simple_(false), |
31 contains_anchor_(false), | 33 contains_anchor_(false), |
32 is_scanned_for_captures_(false), | 34 is_scanned_for_captures_(false), |
33 failed_(false) { | 35 failed_(false) { |
34 Advance(); | 36 Advance(); |
35 } | 37 } |
36 | 38 |
37 | 39 |
38 template <bool update_position> | |
39 uc32 RegExpParser::ReadNext() { | |
40 int position = next_pos_; | |
41 uc32 c0 = in()->Get(position); | |
42 position++; | |
43 // Read the whole surrogate pair in case of unicode flag, if possible. | |
44 if (unicode() && position < in()->length() && | |
45 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(c0))) { | |
46 uc16 c1 = in()->Get(position); | |
47 if (unibrow::Utf16::IsTrailSurrogate(c1)) { | |
48 c0 = unibrow::Utf16::CombineSurrogatePair(static_cast<uc16>(c0), c1); | |
49 position++; | |
50 } | |
51 } | |
52 if (update_position) next_pos_ = position; | |
53 return c0; | |
54 } | |
55 | |
56 | |
57 uc32 RegExpParser::Next() { | 40 uc32 RegExpParser::Next() { |
58 if (has_next()) { | 41 if (has_next()) { |
59 return ReadNext<false>(); | 42 return in()->Get(next_pos_); |
60 } else { | 43 } else { |
61 return kEndMarker; | 44 return kEndMarker; |
62 } | 45 } |
63 } | 46 } |
64 | 47 |
65 | 48 |
66 void RegExpParser::Advance() { | 49 void RegExpParser::Advance() { |
67 if (has_next()) { | 50 if (next_pos_ < in()->length()) { |
68 StackLimitCheck check(isolate()); | 51 StackLimitCheck check(isolate()); |
69 if (check.HasOverflowed()) { | 52 if (check.HasOverflowed()) { |
70 ReportError(CStrVector(Isolate::kStackOverflowMessage)); | 53 ReportError(CStrVector(Isolate::kStackOverflowMessage)); |
71 } else if (zone()->excess_allocation()) { | 54 } else if (zone()->excess_allocation()) { |
72 ReportError(CStrVector("Regular expression too large")); | 55 ReportError(CStrVector("Regular expression too large")); |
73 } else { | 56 } else { |
74 current_ = ReadNext<true>(); | 57 current_ = in()->Get(next_pos_); |
| 58 next_pos_++; |
| 59 // Read the whole surrogate pair in case of unicode flag, if possible. |
| 60 if (unicode_ && next_pos_ < in()->length() && |
| 61 unibrow::Utf16::IsLeadSurrogate(static_cast<uc16>(current_))) { |
| 62 uc16 trail = in()->Get(next_pos_); |
| 63 if (unibrow::Utf16::IsTrailSurrogate(trail)) { |
| 64 current_ = unibrow::Utf16::CombineSurrogatePair( |
| 65 static_cast<uc16>(current_), trail); |
| 66 next_pos_++; |
| 67 } |
| 68 } |
75 } | 69 } |
76 } else { | 70 } else { |
77 current_ = kEndMarker; | 71 current_ = kEndMarker; |
78 // Advance so that position() points to 1-after-the-last-character. This is | 72 // Advance so that position() points to 1-after-the-last-character. This is |
79 // important so that Reset() to this position works correctly. | 73 // important so that Reset() to this position works correctly. |
80 next_pos_ = in()->length() + 1; | 74 next_pos_ = in()->length() + 1; |
81 has_more_ = false; | 75 has_more_ = false; |
82 } | 76 } |
83 } | 77 } |
84 | 78 |
(...skipping 56 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
141 // Alternative :: | 135 // Alternative :: |
142 // [empty] | 136 // [empty] |
143 // Term Alternative | 137 // Term Alternative |
144 // Term :: | 138 // Term :: |
145 // Assertion | 139 // Assertion |
146 // Atom | 140 // Atom |
147 // Atom Quantifier | 141 // Atom Quantifier |
148 RegExpTree* RegExpParser::ParseDisjunction() { | 142 RegExpTree* RegExpParser::ParseDisjunction() { |
149 // Used to store current state while parsing subexpressions. | 143 // Used to store current state while parsing subexpressions. |
150 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, | 144 RegExpParserState initial_state(NULL, INITIAL, RegExpLookaround::LOOKAHEAD, 0, |
151 flags_, zone()); | 145 zone()); |
152 RegExpParserState* state = &initial_state; | 146 RegExpParserState* state = &initial_state; |
153 // Cache the builder in a local variable for quick access. | 147 // Cache the builder in a local variable for quick access. |
154 RegExpBuilder* builder = initial_state.builder(); | 148 RegExpBuilder* builder = initial_state.builder(); |
155 while (true) { | 149 while (true) { |
156 switch (current()) { | 150 switch (current()) { |
157 case kEndMarker: | 151 case kEndMarker: |
158 if (state->IsSubexpression()) { | 152 if (state->IsSubexpression()) { |
159 // Inside a parenthesized group when hitting end of input. | 153 // Inside a parenthesized group when hitting end of input. |
160 ReportError(CStrVector("Unterminated group") CHECK_FAILED); | 154 ReportError(CStrVector("Unterminated group") CHECK_FAILED); |
161 } | 155 } |
(...skipping 43 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
205 Advance(); | 199 Advance(); |
206 builder->NewAlternative(); | 200 builder->NewAlternative(); |
207 continue; | 201 continue; |
208 } | 202 } |
209 case '*': | 203 case '*': |
210 case '+': | 204 case '+': |
211 case '?': | 205 case '?': |
212 return ReportError(CStrVector("Nothing to repeat")); | 206 return ReportError(CStrVector("Nothing to repeat")); |
213 case '^': { | 207 case '^': { |
214 Advance(); | 208 Advance(); |
215 if (multiline()) { | 209 if (multiline_) { |
216 builder->AddAssertion( | 210 builder->AddAssertion( |
217 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); | 211 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_LINE)); |
218 } else { | 212 } else { |
219 builder->AddAssertion( | 213 builder->AddAssertion( |
220 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); | 214 new (zone()) RegExpAssertion(RegExpAssertion::START_OF_INPUT)); |
221 set_contains_anchor(); | 215 set_contains_anchor(); |
222 } | 216 } |
223 continue; | 217 continue; |
224 } | 218 } |
225 case '$': { | 219 case '$': { |
226 Advance(); | 220 Advance(); |
227 RegExpAssertion::AssertionType assertion_type = | 221 RegExpAssertion::AssertionType assertion_type = |
228 multiline() ? RegExpAssertion::END_OF_LINE | 222 multiline_ ? RegExpAssertion::END_OF_LINE |
229 : RegExpAssertion::END_OF_INPUT; | 223 : RegExpAssertion::END_OF_INPUT; |
230 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); | 224 builder->AddAssertion(new (zone()) RegExpAssertion(assertion_type)); |
231 continue; | 225 continue; |
232 } | 226 } |
233 case '.': { | 227 case '.': { |
234 Advance(); | 228 Advance(); |
235 // everything except \x0a, \x0d, \u2028 and \u2029 | 229 // everything except \x0a, \x0d, \u2028 and \u2029 |
236 ZoneList<CharacterRange>* ranges = | 230 ZoneList<CharacterRange>* ranges = |
237 new (zone()) ZoneList<CharacterRange>(2, zone()); | 231 new (zone()) ZoneList<CharacterRange>(2, zone()); |
238 CharacterRange::AddClassEscape('.', ranges, zone()); | 232 CharacterRange::AddClassEscape('.', ranges, zone()); |
239 RegExpCharacterClass* cc = | 233 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
240 new (zone()) RegExpCharacterClass(ranges, false); | 234 builder->AddAtom(atom); |
241 builder->AddCharacterClass(cc); | |
242 break; | 235 break; |
243 } | 236 } |
244 case '(': { | 237 case '(': { |
245 SubexpressionType subexpr_type = CAPTURE; | 238 SubexpressionType subexpr_type = CAPTURE; |
246 RegExpLookaround::Type lookaround_type = state->lookaround_type(); | 239 RegExpLookaround::Type lookaround_type = state->lookaround_type(); |
247 Advance(); | 240 Advance(); |
248 if (current() == '?') { | 241 if (current() == '?') { |
249 switch (Next()) { | 242 switch (Next()) { |
250 case ':': | 243 case ':': |
251 subexpr_type = GROUPING; | 244 subexpr_type = GROUPING; |
(...skipping 24 matching lines...) Expand all Loading... |
276 break; | 269 break; |
277 } | 270 } |
278 Advance(2); | 271 Advance(2); |
279 } else { | 272 } else { |
280 if (captures_started_ >= kMaxCaptures) { | 273 if (captures_started_ >= kMaxCaptures) { |
281 ReportError(CStrVector("Too many captures") CHECK_FAILED); | 274 ReportError(CStrVector("Too many captures") CHECK_FAILED); |
282 } | 275 } |
283 captures_started_++; | 276 captures_started_++; |
284 } | 277 } |
285 // Store current state and begin new disjunction parsing. | 278 // Store current state and begin new disjunction parsing. |
286 state = | 279 state = new (zone()) RegExpParserState( |
287 new (zone()) RegExpParserState(state, subexpr_type, lookaround_type, | 280 state, subexpr_type, lookaround_type, captures_started_, zone()); |
288 captures_started_, flags_, zone()); | |
289 builder = state->builder(); | 281 builder = state->builder(); |
290 continue; | 282 continue; |
291 } | 283 } |
292 case '[': { | 284 case '[': { |
293 RegExpTree* cc = ParseCharacterClass(CHECK_FAILED); | 285 RegExpTree* atom = ParseCharacterClass(CHECK_FAILED); |
294 builder->AddCharacterClass(cc->AsCharacterClass()); | 286 builder->AddAtom(atom); |
295 break; | 287 break; |
296 } | 288 } |
297 // Atom :: | 289 // Atom :: |
298 // \ AtomEscape | 290 // \ AtomEscape |
299 case '\\': | 291 case '\\': |
300 switch (Next()) { | 292 switch (Next()) { |
301 case kEndMarker: | 293 case kEndMarker: |
302 return ReportError(CStrVector("\\ at end of pattern")); | 294 return ReportError(CStrVector("\\ at end of pattern")); |
303 case 'b': | 295 case 'b': |
304 Advance(2); | 296 Advance(2); |
(...skipping 14 matching lines...) Expand all Loading... |
319 case 'D': | 311 case 'D': |
320 case 's': | 312 case 's': |
321 case 'S': | 313 case 'S': |
322 case 'w': | 314 case 'w': |
323 case 'W': { | 315 case 'W': { |
324 uc32 c = Next(); | 316 uc32 c = Next(); |
325 Advance(2); | 317 Advance(2); |
326 ZoneList<CharacterRange>* ranges = | 318 ZoneList<CharacterRange>* ranges = |
327 new (zone()) ZoneList<CharacterRange>(2, zone()); | 319 new (zone()) ZoneList<CharacterRange>(2, zone()); |
328 CharacterRange::AddClassEscape(c, ranges, zone()); | 320 CharacterRange::AddClassEscape(c, ranges, zone()); |
329 RegExpCharacterClass* cc = | 321 RegExpTree* atom = new (zone()) RegExpCharacterClass(ranges, false); |
330 new (zone()) RegExpCharacterClass(ranges, false); | 322 builder->AddAtom(atom); |
331 builder->AddCharacterClass(cc); | |
332 break; | 323 break; |
333 } | 324 } |
334 case '1': | 325 case '1': |
335 case '2': | 326 case '2': |
336 case '3': | 327 case '3': |
337 case '4': | 328 case '4': |
338 case '5': | 329 case '5': |
339 case '6': | 330 case '6': |
340 case '7': | 331 case '7': |
341 case '8': | 332 case '8': |
(...skipping 13 matching lines...) Expand all Loading... |
355 builder->AddAtom(atom); | 346 builder->AddAtom(atom); |
356 } | 347 } |
357 break; | 348 break; |
358 } | 349 } |
359 uc32 first_digit = Next(); | 350 uc32 first_digit = Next(); |
360 if (first_digit == '8' || first_digit == '9') { | 351 if (first_digit == '8' || first_digit == '9') { |
361 // If the 'u' flag is present, only syntax characters can be | 352 // If the 'u' flag is present, only syntax characters can be |
362 // escaped, | 353 // escaped, |
363 // no other identity escapes are allowed. If the 'u' flag is not | 354 // no other identity escapes are allowed. If the 'u' flag is not |
364 // present, all identity escapes are allowed. | 355 // present, all identity escapes are allowed. |
365 if (!unicode()) { | 356 if (!unicode_) { |
366 builder->AddCharacter(first_digit); | 357 builder->AddCharacter(first_digit); |
367 Advance(2); | 358 Advance(2); |
368 } else { | 359 } else { |
369 return ReportError(CStrVector("Invalid escape")); | 360 return ReportError(CStrVector("Invalid escape")); |
370 } | 361 } |
371 break; | 362 break; |
372 } | 363 } |
373 } | 364 } |
374 // FALLTHROUGH | 365 // FALLTHROUGH |
375 case '0': { | 366 case '0': { |
(...skipping 40 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
416 Advance(2); | 407 Advance(2); |
417 builder->AddCharacter(controlLetter & 0x1f); | 408 builder->AddCharacter(controlLetter & 0x1f); |
418 } | 409 } |
419 break; | 410 break; |
420 } | 411 } |
421 case 'x': { | 412 case 'x': { |
422 Advance(2); | 413 Advance(2); |
423 uc32 value; | 414 uc32 value; |
424 if (ParseHexEscape(2, &value)) { | 415 if (ParseHexEscape(2, &value)) { |
425 builder->AddCharacter(value); | 416 builder->AddCharacter(value); |
426 } else if (!unicode()) { | 417 } else if (!unicode_) { |
427 builder->AddCharacter('x'); | 418 builder->AddCharacter('x'); |
428 } else { | 419 } else { |
429 // If the 'u' flag is present, invalid escapes are not treated as | 420 // If the 'u' flag is present, invalid escapes are not treated as |
430 // identity escapes. | 421 // identity escapes. |
431 return ReportError(CStrVector("Invalid escape")); | 422 return ReportError(CStrVector("Invalid escape")); |
432 } | 423 } |
433 break; | 424 break; |
434 } | 425 } |
435 case 'u': { | 426 case 'u': { |
436 Advance(2); | 427 Advance(2); |
437 uc32 value; | 428 uc32 value; |
438 if (ParseUnicodeEscape(&value)) { | 429 if (ParseUnicodeEscape(&value)) { |
439 builder->AddUnicodeCharacter(value); | 430 builder->AddUnicodeCharacter(value); |
440 } else if (!unicode()) { | 431 } else if (!unicode_) { |
441 builder->AddCharacter('u'); | 432 builder->AddCharacter('u'); |
442 } else { | 433 } else { |
443 // If the 'u' flag is present, invalid escapes are not treated as | 434 // If the 'u' flag is present, invalid escapes are not treated as |
444 // identity escapes. | 435 // identity escapes. |
445 return ReportError(CStrVector("Invalid unicode escape")); | 436 return ReportError(CStrVector("Invalid unicode escape")); |
446 } | 437 } |
447 break; | 438 break; |
448 } | 439 } |
449 default: | 440 default: |
450 Advance(); | 441 Advance(); |
451 // If the 'u' flag is present, only syntax characters can be | 442 // If the 'u' flag is present, only syntax characters can be |
452 // escaped, no | 443 // escaped, no |
453 // other identity escapes are allowed. If the 'u' flag is not | 444 // other identity escapes are allowed. If the 'u' flag is not |
454 // present, | 445 // present, |
455 // all identity escapes are allowed. | 446 // all identity escapes are allowed. |
456 if (!unicode() || IsSyntaxCharacter(current())) { | 447 if (!unicode_ || IsSyntaxCharacter(current())) { |
457 builder->AddCharacter(current()); | 448 builder->AddCharacter(current()); |
458 Advance(); | 449 Advance(); |
459 } else { | 450 } else { |
460 return ReportError(CStrVector("Invalid escape")); | 451 return ReportError(CStrVector("Invalid escape")); |
461 } | 452 } |
462 break; | 453 break; |
463 } | 454 } |
464 break; | 455 break; |
465 case '{': { | 456 case '{': { |
466 int dummy; | 457 int dummy; |
(...skipping 280 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
747 } | 738 } |
748 *value = val; | 739 *value = val; |
749 return true; | 740 return true; |
750 } | 741 } |
751 | 742 |
752 | 743 |
753 bool RegExpParser::ParseUnicodeEscape(uc32* value) { | 744 bool RegExpParser::ParseUnicodeEscape(uc32* value) { |
754 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are | 745 // Accept both \uxxxx and \u{xxxxxx} (if harmony unicode escapes are |
755 // allowed). In the latter case, the number of hex digits between { } is | 746 // allowed). In the latter case, the number of hex digits between { } is |
756 // arbitrary. \ and u have already been read. | 747 // arbitrary. \ and u have already been read. |
757 if (current() == '{' && unicode()) { | 748 if (current() == '{' && unicode_) { |
758 int start = position(); | 749 int start = position(); |
759 Advance(); | 750 Advance(); |
760 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { | 751 if (ParseUnlimitedLengthHexNumber(0x10ffff, value)) { |
761 if (current() == '}') { | 752 if (current() == '}') { |
762 Advance(); | 753 Advance(); |
763 return true; | 754 return true; |
764 } | 755 } |
765 } | 756 } |
766 Reset(start); | 757 Reset(start); |
767 return false; | 758 return false; |
(...skipping 74 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
842 // For compatibility, we interpret a decimal escape that isn't | 833 // For compatibility, we interpret a decimal escape that isn't |
843 // a back reference (and therefore either \0 or not valid according | 834 // a back reference (and therefore either \0 or not valid according |
844 // to the specification) as a 1..3 digit octal character code. | 835 // to the specification) as a 1..3 digit octal character code. |
845 return ParseOctalLiteral(); | 836 return ParseOctalLiteral(); |
846 case 'x': { | 837 case 'x': { |
847 Advance(); | 838 Advance(); |
848 uc32 value; | 839 uc32 value; |
849 if (ParseHexEscape(2, &value)) { | 840 if (ParseHexEscape(2, &value)) { |
850 return value; | 841 return value; |
851 } | 842 } |
852 if (!unicode()) { | 843 if (!unicode_) { |
853 // If \x is not followed by a two-digit hexadecimal, treat it | 844 // If \x is not followed by a two-digit hexadecimal, treat it |
854 // as an identity escape. | 845 // as an identity escape. |
855 return 'x'; | 846 return 'x'; |
856 } | 847 } |
857 // If the 'u' flag is present, invalid escapes are not treated as | 848 // If the 'u' flag is present, invalid escapes are not treated as |
858 // identity escapes. | 849 // identity escapes. |
859 ReportError(CStrVector("Invalid escape")); | 850 ReportError(CStrVector("Invalid escape")); |
860 return 0; | 851 return 0; |
861 } | 852 } |
862 case 'u': { | 853 case 'u': { |
863 Advance(); | 854 Advance(); |
864 uc32 value; | 855 uc32 value; |
865 if (ParseUnicodeEscape(&value)) { | 856 if (ParseUnicodeEscape(&value)) { |
866 return value; | 857 return value; |
867 } | 858 } |
868 if (!unicode()) { | 859 if (!unicode_) { |
869 return 'u'; | 860 return 'u'; |
870 } | 861 } |
871 // If the 'u' flag is present, invalid escapes are not treated as | 862 // If the 'u' flag is present, invalid escapes are not treated as |
872 // identity escapes. | 863 // identity escapes. |
873 ReportError(CStrVector("Invalid unicode escape")); | 864 ReportError(CStrVector("Invalid unicode escape")); |
874 return 0; | 865 return 0; |
875 } | 866 } |
876 default: { | 867 default: { |
877 uc32 result = current(); | 868 uc32 result = current(); |
878 // If the 'u' flag is present, only syntax characters can be escaped, no | 869 // If the 'u' flag is present, only syntax characters can be escaped, no |
879 // other identity escapes are allowed. If the 'u' flag is not present, all | 870 // other identity escapes are allowed. If the 'u' flag is not present, all |
880 // identity escapes are allowed. | 871 // identity escapes are allowed. |
881 if (!unicode() || IsSyntaxCharacter(result)) { | 872 if (!unicode_ || IsSyntaxCharacter(result)) { |
882 Advance(); | 873 Advance(); |
883 return result; | 874 return result; |
884 } | 875 } |
885 ReportError(CStrVector("Invalid escape")); | 876 ReportError(CStrVector("Invalid escape")); |
886 return 0; | 877 return 0; |
887 } | 878 } |
888 } | 879 } |
889 return 0; | 880 return 0; |
890 } | 881 } |
891 | 882 |
892 | 883 |
893 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { | 884 CharacterRange RegExpParser::ParseClassAtom(uc16* char_class) { |
894 DCHECK_EQ(0, *char_class); | 885 DCHECK_EQ(0, *char_class); |
895 uc32 first = current(); | 886 uc32 first = current(); |
896 if (first == '\\') { | 887 if (first == '\\') { |
897 switch (Next()) { | 888 switch (Next()) { |
898 case 'w': | 889 case 'w': |
899 case 'W': | 890 case 'W': |
900 case 'd': | 891 case 'd': |
901 case 'D': | 892 case 'D': |
902 case 's': | 893 case 's': |
903 case 'S': { | 894 case 'S': { |
904 *char_class = Next(); | 895 *char_class = Next(); |
905 Advance(2); | 896 Advance(2); |
906 return CharacterRange::Singleton(0); // Return dummy value. | 897 return CharacterRange::Singleton(0); // Return dummy value. |
907 } | 898 } |
908 case kEndMarker: | 899 case kEndMarker: |
909 return ReportError(CStrVector("\\ at end of pattern")); | 900 return ReportError(CStrVector("\\ at end of pattern")); |
910 default: | 901 default: |
911 first = ParseClassCharacterEscape(CHECK_FAILED); | 902 uc32 c = ParseClassCharacterEscape(CHECK_FAILED); |
| 903 return CharacterRange::Singleton(c); |
912 } | 904 } |
913 } else { | 905 } else { |
914 Advance(); | 906 Advance(); |
| 907 return CharacterRange::Singleton(first); |
915 } | 908 } |
916 | |
917 if (unicode() && unibrow::Utf16::IsLeadSurrogate(first)) { | |
918 // Combine with possibly following trail surrogate. | |
919 int start = position(); | |
920 uc32 second = current(); | |
921 if (second == '\\') { | |
922 second = ParseClassCharacterEscape(CHECK_FAILED); | |
923 } else { | |
924 Advance(); | |
925 } | |
926 if (unibrow::Utf16::IsTrailSurrogate(second)) { | |
927 first = unibrow::Utf16::CombineSurrogatePair(first, second); | |
928 } else { | |
929 Reset(start); | |
930 } | |
931 } | |
932 | |
933 return CharacterRange::Singleton(first); | |
934 } | 909 } |
935 | 910 |
936 | 911 |
937 static const uc16 kNoCharClass = 0; | 912 static const uc16 kNoCharClass = 0; |
938 | 913 |
939 // Adds range or pre-defined character class to character ranges. | 914 // Adds range or pre-defined character class to character ranges. |
940 // If char_class is not kInvalidClass, it's interpreted as a class | 915 // If char_class is not kInvalidClass, it's interpreted as a class |
941 // escape (i.e., 's' means whitespace, from '\s'). | 916 // escape (i.e., 's' means whitespace, from '\s'). |
942 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, | 917 static inline void AddRangeOrEscape(ZoneList<CharacterRange>* ranges, |
943 uc16 char_class, CharacterRange range, | 918 uc16 char_class, CharacterRange range, |
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1003 is_negated = !is_negated; | 978 is_negated = !is_negated; |
1004 } | 979 } |
1005 return new (zone()) RegExpCharacterClass(ranges, is_negated); | 980 return new (zone()) RegExpCharacterClass(ranges, is_negated); |
1006 } | 981 } |
1007 | 982 |
1008 | 983 |
1009 #undef CHECK_FAILED | 984 #undef CHECK_FAILED |
1010 | 985 |
1011 | 986 |
1012 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, | 987 bool RegExpParser::ParseRegExp(Isolate* isolate, Zone* zone, |
1013 FlatStringReader* input, JSRegExp::Flags flags, | 988 FlatStringReader* input, bool multiline, |
1014 RegExpCompileData* result) { | 989 bool unicode, RegExpCompileData* result) { |
1015 DCHECK(result != NULL); | 990 DCHECK(result != NULL); |
1016 RegExpParser parser(input, &result->error, flags, isolate, zone); | 991 RegExpParser parser(input, &result->error, multiline, unicode, isolate, zone); |
1017 RegExpTree* tree = parser.ParsePattern(); | 992 RegExpTree* tree = parser.ParsePattern(); |
1018 if (parser.failed()) { | 993 if (parser.failed()) { |
1019 DCHECK(tree == NULL); | 994 DCHECK(tree == NULL); |
1020 DCHECK(!result->error.is_null()); | 995 DCHECK(!result->error.is_null()); |
1021 } else { | 996 } else { |
1022 DCHECK(tree != NULL); | 997 DCHECK(tree != NULL); |
1023 DCHECK(result->error.is_null()); | 998 DCHECK(result->error.is_null()); |
1024 if (FLAG_trace_regexp_parser) { | 999 if (FLAG_trace_regexp_parser) { |
1025 OFStream os(stdout); | 1000 OFStream os(stdout); |
1026 tree->Print(os, zone); | 1001 tree->Print(os, zone); |
1027 os << "\n"; | 1002 os << "\n"; |
1028 } | 1003 } |
1029 result->tree = tree; | 1004 result->tree = tree; |
1030 int capture_count = parser.captures_started(); | 1005 int capture_count = parser.captures_started(); |
1031 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; | 1006 result->simple = tree->IsAtom() && parser.simple() && capture_count == 0; |
1032 result->contains_anchor = parser.contains_anchor(); | 1007 result->contains_anchor = parser.contains_anchor(); |
1033 result->capture_count = capture_count; | 1008 result->capture_count = capture_count; |
1034 } | 1009 } |
1035 return !parser.failed(); | 1010 return !parser.failed(); |
1036 } | 1011 } |
1037 | 1012 |
1038 | 1013 |
1039 RegExpBuilder::RegExpBuilder(Zone* zone, JSRegExp::Flags flags) | 1014 RegExpBuilder::RegExpBuilder(Zone* zone) |
1040 : zone_(zone), | 1015 : zone_(zone), |
1041 pending_empty_(false), | 1016 pending_empty_(false), |
1042 flags_(flags), | |
1043 characters_(NULL), | 1017 characters_(NULL), |
1044 pending_surrogate_(kNoPendingSurrogate), | |
1045 terms_(), | 1018 terms_(), |
1046 alternatives_() | 1019 alternatives_() |
1047 #ifdef DEBUG | 1020 #ifdef DEBUG |
1048 , | 1021 , |
1049 last_added_(ADD_NONE) | 1022 last_added_(ADD_NONE) |
1050 #endif | 1023 #endif |
1051 { | 1024 { |
1052 } | 1025 } |
1053 | 1026 |
1054 | 1027 |
1055 void RegExpBuilder::AddLeadSurrogate(uc16 lead_surrogate) { | |
1056 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | |
1057 FlushPendingSurrogate(); | |
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. | |
1059 pending_surrogate_ = lead_surrogate; | |
1060 } | |
1061 | |
1062 | |
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { | |
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); | |
1065 if (pending_surrogate_ != kNoPendingSurrogate) { | |
1066 uc16 lead_surrogate = pending_surrogate_; | |
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | |
1068 ZoneList<uc16> surrogate_pair(2, zone()); | |
1069 surrogate_pair.Add(lead_surrogate, zone()); | |
1070 surrogate_pair.Add(trail_surrogate, zone()); | |
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | |
1072 pending_surrogate_ = kNoPendingSurrogate; | |
1073 AddAtom(atom); | |
1074 } else { | |
1075 pending_surrogate_ = trail_surrogate; | |
1076 FlushPendingSurrogate(); | |
1077 } | |
1078 } | |
1079 | |
1080 | |
1081 void RegExpBuilder::FlushPendingSurrogate() { | |
1082 if (pending_surrogate_ != kNoPendingSurrogate) { | |
1083 // Use character class to desugar lone surrogate matching. | |
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( | |
1085 CharacterRange::List(zone(), | |
1086 CharacterRange::Singleton(pending_surrogate_)), | |
1087 false); | |
1088 pending_surrogate_ = kNoPendingSurrogate; | |
1089 DCHECK(unicode()); | |
1090 AddCharacterClass(cc); | |
1091 } | |
1092 } | |
1093 | |
1094 | |
1095 void RegExpBuilder::FlushCharacters() { | 1028 void RegExpBuilder::FlushCharacters() { |
1096 FlushPendingSurrogate(); | |
1097 pending_empty_ = false; | 1029 pending_empty_ = false; |
1098 if (characters_ != NULL) { | 1030 if (characters_ != NULL) { |
1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1031 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
1100 characters_ = NULL; | 1032 characters_ = NULL; |
1101 text_.Add(atom, zone()); | 1033 text_.Add(atom, zone()); |
1102 LAST(ADD_ATOM); | 1034 LAST(ADD_ATOM); |
1103 } | 1035 } |
1104 } | 1036 } |
1105 | 1037 |
1106 | 1038 |
1107 void RegExpBuilder::FlushText() { | 1039 void RegExpBuilder::FlushText() { |
1108 FlushCharacters(); | 1040 FlushCharacters(); |
1109 int num_text = text_.length(); | 1041 int num_text = text_.length(); |
1110 if (num_text == 0) { | 1042 if (num_text == 0) { |
1111 return; | 1043 return; |
1112 } else if (num_text == 1) { | 1044 } else if (num_text == 1) { |
1113 terms_.Add(text_.last(), zone()); | 1045 terms_.Add(text_.last(), zone()); |
1114 } else { | 1046 } else { |
1115 RegExpText* text = new (zone()) RegExpText(zone()); | 1047 RegExpText* text = new (zone()) RegExpText(zone()); |
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1048 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
1117 terms_.Add(text, zone()); | 1049 terms_.Add(text, zone()); |
1118 } | 1050 } |
1119 text_.Clear(); | 1051 text_.Clear(); |
1120 } | 1052 } |
1121 | 1053 |
1122 | 1054 |
1123 void RegExpBuilder::AddCharacter(uc16 c) { | 1055 void RegExpBuilder::AddCharacter(uc16 c) { |
1124 FlushPendingSurrogate(); | |
1125 pending_empty_ = false; | 1056 pending_empty_ = false; |
1126 if (characters_ == NULL) { | 1057 if (characters_ == NULL) { |
1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1058 characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
1128 } | 1059 } |
1129 characters_->Add(c, zone()); | 1060 characters_->Add(c, zone()); |
1130 LAST(ADD_CHAR); | 1061 LAST(ADD_CHAR); |
1131 } | 1062 } |
1132 | 1063 |
1133 | 1064 |
1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1065 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1066 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
1136 DCHECK(unicode()); | 1067 ZoneList<uc16> surrogate_pair(2, zone()); |
1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); | 1068 surrogate_pair.Add(unibrow::Utf16::LeadSurrogate(c), zone()); |
1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1069 surrogate_pair.Add(unibrow::Utf16::TrailSurrogate(c), zone()); |
1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1070 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
1140 AddLeadSurrogate(c); | 1071 AddAtom(atom); |
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | |
1142 AddTrailSurrogate(c); | |
1143 } else { | 1072 } else { |
1144 AddCharacter(static_cast<uc16>(c)); | 1073 AddCharacter(static_cast<uc16>(c)); |
1145 } | 1074 } |
1146 } | 1075 } |
1147 | 1076 |
1148 | 1077 |
1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1078 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1150 | 1079 |
1151 | 1080 |
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | |
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { | |
1154 // In unicode mode, character class needs to be desugared, so it | |
1155 // must be a standalone term instead of being part of a RegExpText. | |
1156 AddTerm(cc); | |
1157 } else { | |
1158 AddAtom(cc); | |
1159 } | |
1160 } | |
1161 | |
1162 | |
1163 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1081 void RegExpBuilder::AddAtom(RegExpTree* term) { |
1164 if (term->IsEmpty()) { | 1082 if (term->IsEmpty()) { |
1165 AddEmpty(); | 1083 AddEmpty(); |
1166 return; | 1084 return; |
1167 } | 1085 } |
1168 if (term->IsTextElement()) { | 1086 if (term->IsTextElement()) { |
1169 FlushCharacters(); | 1087 FlushCharacters(); |
1170 text_.Add(term, zone()); | 1088 text_.Add(term, zone()); |
1171 } else { | 1089 } else { |
1172 FlushText(); | 1090 FlushText(); |
1173 terms_.Add(term, zone()); | 1091 terms_.Add(term, zone()); |
1174 } | 1092 } |
1175 LAST(ADD_ATOM); | 1093 LAST(ADD_ATOM); |
1176 } | 1094 } |
1177 | 1095 |
1178 | |
1179 void RegExpBuilder::AddTerm(RegExpTree* term) { | |
1180 FlushText(); | |
1181 terms_.Add(term, zone()); | |
1182 LAST(ADD_ATOM); | |
1183 } | |
1184 | |
1185 | 1096 |
1186 void RegExpBuilder::AddAssertion(RegExpTree* assert) { | 1097 void RegExpBuilder::AddAssertion(RegExpTree* assert) { |
1187 FlushText(); | 1098 FlushText(); |
1188 terms_.Add(assert, zone()); | 1099 terms_.Add(assert, zone()); |
1189 LAST(ADD_ASSERT); | 1100 LAST(ADD_ASSERT); |
1190 } | 1101 } |
1191 | 1102 |
1192 | 1103 |
1193 void RegExpBuilder::NewAlternative() { FlushTerms(); } | 1104 void RegExpBuilder::NewAlternative() { FlushTerms(); } |
1194 | 1105 |
(...skipping 19 matching lines...) Expand all Loading... |
1214 FlushTerms(); | 1125 FlushTerms(); |
1215 int num_alternatives = alternatives_.length(); | 1126 int num_alternatives = alternatives_.length(); |
1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1127 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1217 if (num_alternatives == 1) return alternatives_.last(); | 1128 if (num_alternatives == 1) return alternatives_.last(); |
1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1129 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1219 } | 1130 } |
1220 | 1131 |
1221 | 1132 |
1222 void RegExpBuilder::AddQuantifierToAtom( | 1133 void RegExpBuilder::AddQuantifierToAtom( |
1223 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { | 1134 int min, int max, RegExpQuantifier::QuantifierType quantifier_type) { |
1224 FlushPendingSurrogate(); | |
1225 if (pending_empty_) { | 1135 if (pending_empty_) { |
1226 pending_empty_ = false; | 1136 pending_empty_ = false; |
1227 return; | 1137 return; |
1228 } | 1138 } |
1229 RegExpTree* atom; | 1139 RegExpTree* atom; |
1230 if (characters_ != NULL) { | 1140 if (characters_ != NULL) { |
1231 DCHECK(last_added_ == ADD_CHAR); | 1141 DCHECK(last_added_ == ADD_CHAR); |
1232 // Last atom was character. | 1142 // Last atom was character. |
1233 Vector<const uc16> char_vector = characters_->ToConstVector(); | 1143 Vector<const uc16> char_vector = characters_->ToConstVector(); |
1234 int num_chars = char_vector.length(); | 1144 int num_chars = char_vector.length(); |
(...skipping 26 matching lines...) Expand all Loading... |
1261 UNREACHABLE(); | 1171 UNREACHABLE(); |
1262 return; | 1172 return; |
1263 } | 1173 } |
1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1174 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1265 zone()); | 1175 zone()); |
1266 LAST(ADD_TERM); | 1176 LAST(ADD_TERM); |
1267 } | 1177 } |
1268 | 1178 |
1269 } // namespace internal | 1179 } // namespace internal |
1270 } // namespace v8 | 1180 } // namespace v8 |
OLD | NEW |