Index: src/regexp/jsregexp.cc |
diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc |
index 3559bcd111a4c19eb5755dd264d6a4a8cf39a88a..2229dd8e175cc9483741ad75bff773a0c0a7797f 100644 |
--- a/src/regexp/jsregexp.cc |
+++ b/src/regexp/jsregexp.cc |
@@ -25,6 +25,11 @@ |
#include "src/string-search.h" |
#include "src/unicode-decoder.h" |
+#ifdef V8_I18N_SUPPORT |
+#include "unicode/uset.h" |
+#include "unicode/utypes.h" |
+#endif // V8_I18N_SUPPORT |
+ |
#ifndef V8_INTERPRETED_REGEXP |
#if V8_TARGET_ARCH_IA32 |
#include "src/regexp/ia32/regexp-macro-assembler-ia32.h" |
@@ -4390,8 +4395,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
DCHECK_EQ(start_reg_ + 1, end_reg_); |
if (compiler->ignore_case()) { |
- assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), |
- trace->backtrack()); |
+ assembler->CheckNotBackReferenceIgnoreCase( |
+ start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); |
} else { |
assembler->CheckNotBackReference(start_reg_, read_backward(), |
trace->backtrack()); |
@@ -4861,21 +4866,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) { |
} |
-bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) { |
- ZoneList<CharacterRange>* ranges = this->ranges(zone); |
- CharacterRange::Canonicalize(ranges); |
- for (int i = ranges->length() - 1; i >= 0; i--) { |
- uc32 from = ranges->at(i).from(); |
- uc32 to = ranges->at(i).to(); |
- // Check for non-BMP characters. |
- if (to >= kNonBmpStart) return true; |
- // Check for lone surrogates. |
- if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
- } |
- return false; |
-} |
- |
- |
UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, |
ZoneList<CharacterRange>* base) |
: zone_(zone), |
@@ -5086,11 +5076,49 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, |
} |
+void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
+ ZoneList<CharacterRange>* ranges) { |
+#ifdef V8_I18N_SUPPORT |
+ // Use ICU to replace the ranges, in place, with their case-fold closure. |
+ DCHECK(compiler->unicode()); |
+ DCHECK(compiler->ignore_case()); |
+ USet* set = uset_openEmpty(); |
+ for (int i = 0; i < ranges->length(); i++) { |
+ uset_addRange(set, ranges->at(i).from(), ranges->at(i).to()); |
+ } |
+ ranges->Clear(); |
+ uset_closeOver(set, USET_CASE_INSENSITIVE); |
+ // Full case mappings map single characters to multiple characters. |
+ // Those are represented as strings in the set. Remove them so that |
+ // only the simple and common case mappings (plain ranges) remain. |
+ uset_removeAllStrings(set); |
+ int item_count = uset_getItemCount(set); |
+ int item_result = 0; |
+ UErrorCode ec = U_ZERO_ERROR; |
+ Zone* zone = compiler->zone(); |
+ for (int i = 0; i < item_count; i++) { |
+ uc32 start = 0; |
+ uc32 end = 0; |
+ item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); |
+ ranges->Add(CharacterRange::Range(start, end), zone); |
+ } |
+ // No errors, and every item collected was a range (uset_getItem gave 0). |
+ DCHECK_EQ(U_ZERO_ERROR, ec); |
+ DCHECK_EQ(0, item_result); |
+ uset_close(set); |
+ CharacterRange::Canonicalize(ranges); |
+#endif // V8_I18N_SUPPORT |
+} |
+ |
+ |
RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
RegExpNode* on_success) { |
set_.Canonicalize(); |
Zone* zone = compiler->zone(); |
ZoneList<CharacterRange>* ranges = this->ranges(zone); |
+ if (compiler->unicode() && compiler->ignore_case()) { |
+ AddUnicodeCaseEquivalents(compiler, ranges); |
+ } |
if (compiler->unicode() && !compiler->one_byte()) { |
if (is_negated()) { |
ZoneList<CharacterRange>* negated = |