Index: src/regexp/regexp-parser.cc |
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc |
index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..be2cf7c42a880671c3f288d2186d4972a52ac9ff 100644 |
--- a/src/regexp/regexp-parser.cc |
+++ b/src/regexp/regexp-parser.cc |
@@ -11,6 +11,10 @@ |
#include "src/regexp/jsregexp.h" |
#include "src/utils.h" |
+#ifdef V8_I18N_SUPPORT |
+#include "unicode/uset.h" |
+#endif // V8_I18N_SUPPORT |
+ |
namespace v8 { |
namespace internal { |
@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
if (pending_surrogate_ != kNoPendingSurrogate) { |
uc16 lead_surrogate = pending_surrogate_; |
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
- ZoneList<uc16> surrogate_pair(2, zone()); |
- surrogate_pair.Add(lead_surrogate, zone()); |
- surrogate_pair.Add(trail_surrogate, zone()); |
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
pending_surrogate_ = kNoPendingSurrogate; |
- AddAtom(atom); |
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
+ uc32 combined = |
+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
+ if (NeedsDesugaringForIgnoreCase(combined)) { |
+ AddCharacterClass(combined); |
+ } else { |
+ ZoneList<uc16> surrogate_pair(2, zone()); |
+ surrogate_pair.Add(lead_surrogate, zone()); |
+ surrogate_pair.Add(trail_surrogate, zone()); |
+ RegExpAtom* atom = |
+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
+ AddAtom(atom); |
+ } |
} else { |
pending_surrogate_ = trail_surrogate; |
FlushPendingSurrogate(); |
@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
void RegExpBuilder::FlushPendingSurrogate() { |
if (pending_surrogate_ != kNoPendingSurrogate) { |
- // Use character class to desugar lone surrogate matching. |
- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( |
- CharacterRange::List(zone(), |
- CharacterRange::Singleton(pending_surrogate_)), |
- false); |
- pending_surrogate_ = kNoPendingSurrogate; |
DCHECK(unicode()); |
- AddCharacterClass(cc); |
+ uc32 c = pending_surrogate_; |
+ pending_surrogate_ = kNoPendingSurrogate; |
+ AddCharacterClass(c); |
} |
} |
@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() { |
void RegExpBuilder::AddCharacter(uc16 c) { |
FlushPendingSurrogate(); |
pending_empty_ = false; |
- if (characters_ == NULL) { |
- characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
+ if (NeedsDesugaringForIgnoreCase(c)) { |
+ AddCharacterClass(c); |
+ } else { |
+ if (characters_ == NULL) { |
+ characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
+ } |
+ characters_->Add(c, zone()); |
+ LAST(ADD_CHAR); |
} |
- characters_->Add(c, zone()); |
- LAST(ADD_CHAR); |
} |
@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { |
+ if (NeedsDesugaringForUnicode(cc)) { |
// In unicode mode, character class needs to be desugared, so it |
// must be a standalone term instead of being part of a RegExpText. |
AddTerm(cc); |
@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
} |
+void RegExpBuilder::AddCharacterClass(uc32 c) { |
+ AddCharacterClass(new (zone()) RegExpCharacterClass( |
+ CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
+} |
+ |
+ |
void RegExpBuilder::AddAtom(RegExpTree* term) { |
if (term->IsEmpty()) { |
AddEmpty(); |
@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() { |
} |
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
+ if (!unicode()) return false; |
+ switch (cc->standard_type()) { |
+ case 's': // white space |
+ case 'w': // ASCII word character |
+ case 'd': // ASCII digit |
+ return false; // These standard classes do not need desugaring. |
+ default: |
+ break; |
+ } |
+ ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
+ CharacterRange::Canonicalize(ranges); |
+ for (int i = ranges->length() - 1; i >= 0; i--) { |
+ uc32 from = ranges->at(i).from(); |
+ uc32 to = ranges->at(i).to(); |
+ // A range ending at or above kNonBmpStart contains non-BMP characters. |
+ if (to >= kNonBmpStart) return true; |
+ // A range overlapping kLeadSurrogateStart..kTrailSurrogateEnd has lone surrogates. |
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
+ } |
+ return false; |
+} |
+ |
+ |
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { |
+#ifdef V8_I18N_SUPPORT |
+ if (unicode() && ignore_case()) { |
+ USet* set = uset_open(c, c); |
+ uset_closeOver(set, USET_CASE_INSENSITIVE); |
+ uset_removeAllStrings(set); |
+ bool result = uset_size(set) > 1; |
+ uset_close(set); |
+ return result; |
+ } |
+ // Reached when the unicode and ignore-case flags are not both set. Without |
+ // ICU we act as if the unicode flag is not set; neither case desugars. |
+#endif // V8_I18N_SUPPORT |
+ return false; |
+} |
+ |
+ |
RegExpTree* RegExpBuilder::ToRegExp() { |
FlushTerms(); |
int num_alternatives = alternatives_.length(); |