Index: src/regexp/jsregexp.cc |
diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc |
index fbbd3ecda29efcda0e3e72193d9bb39a2dd2e3a3..4e33e4ac7807849d1b137675bb400d88acf9626b 100644 |
--- a/src/regexp/jsregexp.cc |
+++ b/src/regexp/jsregexp.cc |
@@ -1019,6 +1019,11 @@ class RegExpCompiler { |
inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; } |
inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; } |
+ // True when both the unicode and ignore_case flags are set, i.e. when ICU |
+ // is needed to compute the closure over case equivalents. |
+ inline bool needs_unicode_case_equivalents() { |
+ return unicode() && ignore_case(); |
+ } |
inline bool one_byte() { return one_byte_; } |
inline bool optimize() { return optimize_; } |
inline void set_optimize(bool value) { optimize_ = value; } |
@@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, |
return TextNode::CreateForCharacterRanges(zone, range, false, on_success); |
} |
- |
-void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
- ZoneList<CharacterRange>* ranges) { |
+void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { |
#ifdef V8_I18N_SUPPORT |
// Use ICU to compute the case fold closure over the ranges. |
- DCHECK(compiler->unicode()); |
- DCHECK(compiler->ignore_case()); |
icu::UnicodeSet set; |
for (int i = 0; i < ranges->length(); i++) { |
set.add(ranges->at(i).from(), ranges->at(i).to()); |
@@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
// Those are represented as strings in the set. Remove them so that |
// we end up with only simple and common case mappings. |
set.removeAllStrings(); |
- Zone* zone = compiler->zone(); |
for (int i = 0; i < set.getRangeCount(); i++) { |
ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
zone); |
} |
// No errors and everything we collected have been ranges. |
-#else |
- // Fallback if ICU is not included. |
- CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), |
- ranges, compiler->one_byte()); |
-#endif // V8_I18N_SUPPORT |
CharacterRange::Canonicalize(ranges); |
+#endif // V8_I18N_SUPPORT |
} |
@@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
set_.Canonicalize(); |
Zone* zone = compiler->zone(); |
ZoneList<CharacterRange>* ranges = this->ranges(zone); |
- if (compiler->unicode() && compiler->ignore_case()) { |
- AddUnicodeCaseEquivalents(compiler, ranges); |
+ if (compiler->needs_unicode_case_equivalents()) { |
+ AddUnicodeCaseEquivalents(ranges, zone); |
} |
if (compiler->unicode() && !compiler->one_byte()) { |
if (is_negated()) { |
@@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min, |
} |
} |
+namespace { |
+// Desugars \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and \B to (?<=\w)(?=\w)|(?<=\W)(?=\W), where \w is |
+// the word class closed over unicode case equivalents. Returns a two-alternative ChoiceNode. |
+RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, |
+ RegExpNode* on_success, |
+ RegExpAssertion::AssertionType type) { |
+ DCHECK(compiler->needs_unicode_case_equivalents()); |
+ Zone* zone = compiler->zone(); |
+ ZoneList<CharacterRange>* word_range = |
+ new (zone) ZoneList<CharacterRange>(2, zone); |
+ CharacterRange::AddClassEscape('w', word_range, true, zone); |
+ int stack_register = compiler->UnicodeLookaroundStackRegister(); |
+ int position_register = compiler->UnicodeLookaroundPositionRegister(); |
+ ChoiceNode* result = new (zone) ChoiceNode(2, zone); |
+ // Add two alternatives: i == 0 looks behind for a word character, i == 1 for a non-word one. |
+ // XOR-ing with the assertion type makes the two sides differ for \b and agree for \B. |
+ for (int i = 0; i < 2; i++) { |
+ bool lookbehind_for_word = i == 0; |
+ bool lookahead_for_word = |
+ (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; |
+ // Look to the left. Always matches word_range; NOTE(review): the Builder's bool presumably picks positive vs. negative - confirm. |
+ RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, |
+ stack_register, position_register); |
+ RegExpNode* backward = TextNode::CreateForCharacterRanges( |
+ zone, word_range, true, lookbehind.on_match_success()); |
+ // Look to the right. It continues into the lookbehind node on success, so it is the node entered first. |
+ RegExpLookaround::Builder lookahead(lookahead_for_word, |
+ lookbehind.ForMatch(backward), |
+ stack_register, position_register); |
+ RegExpNode* forward = TextNode::CreateForCharacterRanges( |
+ zone, word_range, false, lookahead.on_match_success()); |
+ result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); |
+ } |
+ return result; |
+} |
+} // anonymous namespace |
RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
RegExpNode* on_success) { |
@@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
case START_OF_INPUT: |
return AssertionNode::AtStart(on_success); |
case BOUNDARY: |
- return AssertionNode::AtBoundary(on_success); |
+ return compiler->needs_unicode_case_equivalents() |
+ ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY) |
+ : AssertionNode::AtBoundary(on_success); |
case NON_BOUNDARY: |
- return AssertionNode::AtNonBoundary(on_success); |
+ return compiler->needs_unicode_case_equivalents() |
+ ? BoundaryAssertionAsLookaround(compiler, on_success, |
+ NON_BOUNDARY) |
+ : AssertionNode::AtNonBoundary(on_success); |
case END_OF_INPUT: |
return AssertionNode::AtEnd(on_success); |
case END_OF_LINE: { |
@@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
// Create a newline atom. |
ZoneList<CharacterRange>* newline_ranges = |
new(zone) ZoneList<CharacterRange>(3, zone); |
- CharacterRange::AddClassEscape('n', newline_ranges, zone); |
+ CharacterRange::AddClassEscape('n', newline_ranges, false, zone); |
RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); |
TextNode* newline_matcher = new (zone) TextNode( |
newline_atom, false, ActionNode::PositiveSubmatchSuccess( |
@@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv, |
ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); |
} |
+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, |
+ bool add_unicode_case_equivalents, |
+ Zone* zone) { |
+ if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { |
+ // See spec section #sec-runtime-semantics-wordcharacters-abstract-operation. |
+ // For \w and \W with case equivalents requested, first close the word ranges over their case |
+ // equivalents and only then negate (for \W). Every other case falls through to the 3-argument overload. |
+ ZoneList<CharacterRange>* new_ranges = |
+ new (zone) ZoneList<CharacterRange>(2, zone); |
+ AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); |
+ AddUnicodeCaseEquivalents(new_ranges, zone); |
+ if (type == 'W') { |
+ ZoneList<CharacterRange>* negated = |
+ new (zone) ZoneList<CharacterRange>(2, zone); |
+ CharacterRange::Negate(new_ranges, negated, zone); |
+ new_ranges = negated; |
+ } |
+ ranges->AddAll(*new_ranges, zone); |
+ return; |
+ } |
+ AddClassEscape(type, ranges, zone); |
+} |
-void CharacterRange::AddClassEscape(uc16 type, |
- ZoneList<CharacterRange>* ranges, |
+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, |
Zone* zone) { |
switch (type) { |
case 's': |
@@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { |
if (ranges_ == NULL) { |
ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); |
- CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); |
+ CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); |
} |
return ranges_; |
} |