Chromium Code Reviews| Index: src/regexp/regexp-parser.cc |
| diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
| index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..6d6aaa57f32a6e5deb5e2c2ad79f21ced751765c 100644 |
| --- a/src/regexp/regexp-parser.cc |
| +++ b/src/regexp/regexp-parser.cc |
| @@ -11,6 +11,10 @@ |
| #include "src/regexp/jsregexp.h" |
| #include "src/utils.h" |
| +#ifdef V8_I18N_SUPPORT |
| +#include "unicode/uset.h" |
| +#endif // V8_I18N_SUPPORT |
| + |
| namespace v8 { |
| namespace internal { |
| @@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| uc16 lead_surrogate = pending_surrogate_; |
| - DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| - ZoneList<uc16> surrogate_pair(2, zone()); |
| - surrogate_pair.Add(lead_surrogate, zone()); |
| - surrogate_pair.Add(trail_surrogate, zone()); |
| - RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| pending_surrogate_ = kNoPendingSurrogate; |
| - AddAtom(atom); |
| + DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
| + uc32 combined = |
| + unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
| + if (NeedsDesugaringForIgnoreCase(combined)) { |
| + AddCharacterClass(combined); |
| + } else { |
| + ZoneList<uc16> surrogate_pair(2, zone()); |
| + surrogate_pair.Add(lead_surrogate, zone()); |
| + surrogate_pair.Add(trail_surrogate, zone()); |
| + RegExpAtom* atom = |
| + new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| + AddAtom(atom); |
| + } |
| } else { |
| pending_surrogate_ = trail_surrogate; |
| FlushPendingSurrogate(); |
| @@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
| void RegExpBuilder::FlushPendingSurrogate() { |
| if (pending_surrogate_ != kNoPendingSurrogate) { |
| - // Use character class to desugar lone surrogate matching. |
| - RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
| - CharacterRange::List(zone(), |
| - CharacterRange::Singleton(pending_surrogate_)), |
| - false); |
| - pending_surrogate_ = kNoPendingSurrogate; |
| DCHECK(unicode()); |
| - AddCharacterClass(cc); |
| + uc32 c = pending_surrogate_; |
| + pending_surrogate_ = kNoPendingSurrogate; |
| + AddCharacterClass(c); |
| } |
| } |
| @@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() { |
| void RegExpBuilder::AddCharacter(uc16 c) { |
| FlushPendingSurrogate(); |
| pending_empty_ = false; |
| - if (characters_ == NULL) { |
| - characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| + if (NeedsDesugaringForIgnoreCase(c)) { |
| + AddCharacterClass(c); |
| + } else { |
| + if (characters_ == NULL) { |
| + characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| + } |
| + characters_->Add(c, zone()); |
| + LAST(ADD_CHAR); |
| } |
| - characters_->Add(c, zone()); |
| - LAST(ADD_CHAR); |
| } |
| @@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
| void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| - if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
| + if (NeedsDesugaringForUnicode(cc->ranges(zone()))) { |
|
erikcorry
2016/01/25 10:26:37
It's a bit unfortunate that for all the standard… [comment truncated in extraction]
Yang
2016/01/25 11:46:37
Fixed.
|
| // In unicode mode, character class needs to be desugared, so it |
| // must be a standalone term instead of being part of a RegExpText. |
| AddTerm(cc); |
| @@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
| } |
| +void RegExpBuilder::AddCharacterClass(uc32 c) { |
| + AddCharacterClass(new (zone()) RegExpCharacterClass( |
| + CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
| +} |
| + |
| + |
| void RegExpBuilder::AddAtom(RegExpTree* term) { |
| if (term->IsEmpty()) { |
| AddEmpty(); |
| @@ -1210,6 +1227,42 @@ void RegExpBuilder::FlushTerms() { |
| } |
| +bool RegExpBuilder::NeedsDesugaringForUnicode( |
| + ZoneList<CharacterRange>* ranges) { |
| + if (!unicode()) return false; |
| + CharacterRange::Canonicalize(ranges); |
| + static const uc32 kLeadSurrogateStart = 0xd800; |
| + static const uc32 kTrailSurrogateEnd = 0xdfff; |
| + static const uc32 kNonBmpStart = 0x10000; |
|
erikcorry
2016/01/25 10:26:37
Don't these constants already exist?
Yang
2016/01/25 11:46:37
Done.
|
| + for (int i = ranges->length() - 1; i >= 0; i--) { |
| + uc32 from = ranges->at(i).from(); |
| + uc32 to = ranges->at(i).to(); |
| + // Check for non-BMP characters. |
| + if (to >= kNonBmpStart) return true; |
| + // Check for lone surrogates. |
| + if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| + } |
| + return false; |
| +} |
| + |
| + |
| +bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { |
| +#ifdef V8_I18N_SUPPORT |
|
erikcorry
2016/01/25 10:26:37
Perhaps a comment explaining what happens in the n… [comment truncated in extraction]
Yang
2016/01/25 11:46:37
Done.
|
| + // Ignore-case for ASCII characters is handled at a lower layer. |
| + if (c < 128) return false; |
|
erikcorry
2016/01/25 10:26:37
There's a named constant for this.
Yang
2016/01/25 11:46:37
Done.
|
| + if (unicode() && ignore_case()) { |
| + USet* set = uset_open(c, c); |
| + uset_closeOver(set, USET_CASE_INSENSITIVE); |
| + uset_removeAllStrings(set); |
| + bool result = uset_size(set) > 1; |
| + uset_close(set); |
| + return result; |
| + } |
| +#endif // V8_I18N_SUPPORT |
| + return false; |
| +} |
| + |
| + |
| RegExpTree* RegExpBuilder::ToRegExp() { |
| FlushTerms(); |
| int num_alternatives = alternatives_.length(); |