Chromium Code Reviews| Index: src/regexp/jsregexp.cc |
| diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc |
| index 6235c25c7762f05ef399fc31838e64a10e2b2a39..b0a294bce16e17a100f46149dc5f57bb6cd99e0c 100644 |
| --- a/src/regexp/jsregexp.cc |
| +++ b/src/regexp/jsregexp.cc |
| @@ -25,6 +25,11 @@ |
| #include "src/string-search.h" |
| #include "src/unicode-decoder.h" |
| +#ifdef V8_I18N_SUPPORT |
| +#include "unicode/uset.h" |
| +#include "unicode/utypes.h" |
| +#endif // V8_I18N_SUPPORT |
| + |
| #ifndef V8_INTERPRETED_REGEXP |
| #if V8_TARGET_ARCH_IA32 |
| #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" |
| @@ -3420,10 +3425,7 @@ void TextNode::MakeCaseIndependent(Isolate* isolate, bool is_one_byte) { |
| // independent case and it slows us down if we don't know that. |
| if (cc->is_standard(zone())) continue; |
| ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| - int range_count = ranges->length(); |
| - for (int j = 0; j < range_count; j++) { |
| - ranges->at(j).AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); |
| - } |
| + CharacterRange::AddCaseEquivalents(isolate, zone(), ranges, is_one_byte); |
| } |
| } |
| } |
| @@ -3586,13 +3588,6 @@ class AlternativeGenerationList { |
| AlternativeGeneration a_few_alt_gens_[kAFew]; |
| }; |
| - |
| -static const uc32 kLeadSurrogateStart = 0xd800; |
| -static const uc32 kLeadSurrogateEnd = 0xdbff; |
| -static const uc32 kTrailSurrogateStart = 0xdc00; |
| -static const uc32 kTrailSurrogateEnd = 0xdfff; |
| -static const uc32 kNonBmpStart = 0x10000; |
| -static const uc32 kNonBmpEnd = 0x10ffff; |
| static const uc32 kRangeEndMarker = 0x110000; |
| // The '2' variant is has inclusive from and exclusive to. |
| @@ -4395,8 +4390,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) { |
| DCHECK_EQ(start_reg_ + 1, end_reg_); |
| if (compiler->ignore_case()) { |
| - assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), |
| - trace->backtrack()); |
| + assembler->CheckNotBackReferenceIgnoreCase( |
| + start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); |
| } else { |
| assembler->CheckNotBackReference(start_reg_, read_backward(), |
| trace->backtrack()); |
| @@ -4866,21 +4861,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) { |
| } |
| -bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) { |
| - ZoneList<CharacterRange>* ranges = this->ranges(zone); |
| - CharacterRange::Canonicalize(ranges); |
| - for (int i = ranges->length() - 1; i >= 0; i--) { |
| - uc32 from = ranges->at(i).from(); |
| - uc32 to = ranges->at(i).to(); |
| - // Check for non-BMP characters. |
| - if (to >= kNonBmpStart) return true; |
| - // Check for lone surrogates. |
| - if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| - } |
| - return false; |
| -} |
| - |
| - |
| UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, |
| ZoneList<CharacterRange>* base) |
| : zone_(zone), |
| @@ -5120,11 +5100,53 @@ void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result, |
| } |
| +void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
| + ZoneList<CharacterRange>* ranges) { |
| +#ifdef V8_I18N_SUPPORT |
| + // Use ICU to compute the case fold closure over the ranges. |
| + DCHECK(compiler->unicode()); |
| + DCHECK(compiler->ignore_case()); |
| + USet* set = uset_openEmpty(); |
| + for (int i = 0; i < ranges->length(); i++) { |
| + uset_addRange(set, ranges->at(i).from(), ranges->at(i).to()); |
| + } |
| + ranges->Clear(); |
| + uset_closeOver(set, USET_CASE_INSENSITIVE); |
| + // Full case mapping map single characters to multiple characters. |
| + // Those are represented as strings in the set. Remove them so that |
| + // we end up with only simple and common case mappings. |
| + uset_removeAllStrings(set); |
| + int item_count = uset_getItemCount(set); |
| + int item_result = 0; |
| + UErrorCode ec = U_ZERO_ERROR; |
| + Zone* zone = compiler->zone(); |
| + for (int i = 0; i < item_count; i++) { |
| + uc32 start = 0; |
| + uc32 end = 0; |
| + item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); |
| + ranges->Add(CharacterRange::Range(start, end), zone); |
| + } |
| + // No errors and everything we collected have been ranges. |
| + DCHECK_EQ(U_ZERO_ERROR, ec); |
| + DCHECK_EQ(0, item_result); |
| + uset_close(set); |
| +#else |
| + // Fallback if ICU is not included. |
| + CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), |
| + ranges, compiler->one_byte()); |
| +#endif // V8_I18N_SUPPORT |
| + CharacterRange::Canonicalize(ranges); |
| +} |
| + |
| + |
| RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
| RegExpNode* on_success) { |
| set_.Canonicalize(); |
| Zone* zone = compiler->zone(); |
| ZoneList<CharacterRange>* ranges = this->ranges(zone); |
| + if (compiler->unicode() && compiler->ignore_case()) { |
| + AddUnicodeCaseEquivalents(compiler, ranges); |
| + } |
| if (compiler->unicode() && !compiler->one_byte()) { |
| if (is_negated()) { |
| ZoneList<CharacterRange>* negated = |
| @@ -5853,16 +5875,19 @@ Vector<const int> CharacterRange::GetWordBounds() { |
| void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, |
| ZoneList<CharacterRange>* ranges, |
| bool is_one_byte) { |
| - uc32 bottom = from(); |
| - uc32 top = to(); |
| - // Nothing to be done for surrogates. |
| - if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; |
| - if (is_one_byte && !RangeContainsLatin1Equivalents(*this)) { |
| - if (bottom > String::kMaxOneByteCharCode) return; |
| - if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
| - } |
| - unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| - if (top == bottom) { |
| + int range_count = ranges->length(); |
| + for (int i = 0; i < range_count; i++) { |
| + CharacterRange range = ranges->at(i); |
| + uc32 bottom = range.from(); |
| + uc32 top = range.to(); |
| + // Nothing to be done for surrogates. |
| + if (bottom >= kLeadSurrogateStart && top <= kTrailSurrogateEnd) return; |
| + if (is_one_byte && !RangeContainsLatin1Equivalents(range)) { |
| + if (bottom > String::kMaxOneByteCharCode) return; |
| + if (top > String::kMaxOneByteCharCode) top = String::kMaxOneByteCharCode; |
| + } |
| + unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth]; |
| + if (top == bottom) { |
| // If this is a singleton we just expand the one character. |
|
brucedawson
2016/01/29 18:49:15
The indenting of this line - and the rest of the f
|
| int length = isolate->jsregexp_uncanonicalize()->get(bottom, '\0', chars); |
| for (int i = 0; i < length; i++) { |
| @@ -5914,6 +5939,7 @@ void CharacterRange::AddCaseEquivalents(Isolate* isolate, Zone* zone, |
| pos = end + 1; |
| } |
| } |
| + } |
| } |
| @@ -6284,7 +6310,7 @@ void TextNode::CalculateOffsets() { |
| void Analysis::VisitText(TextNode* that) { |
| - if (ignore_case_) { |
| + if (ignore_case()) { |
| that->MakeCaseIndependent(isolate(), is_one_byte_); |
| } |
| EnsureAnalyzed(that->on_success()); |
| @@ -6649,7 +6675,7 @@ RegExpEngine::CompilationResult RegExpEngine::Compile( |
| if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); |
| data->node = node; |
| - Analysis analysis(isolate, ignore_case, is_one_byte); |
| + Analysis analysis(isolate, flags, is_one_byte); |
| analysis.EnsureAnalyzed(node); |
| if (analysis.has_failed()) { |
| const char* error_message = analysis.error_message(); |