Chromium Code Reviews| Index: src/regexp/jsregexp.cc |
| diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc |
| index 0ed3086ce674833ef973528d6943fc9f65451c10..bd36f8adf02b89e4f4ccb5df8c82453666d9c7c8 100644 |
| --- a/src/regexp/jsregexp.cc |
| +++ b/src/regexp/jsregexp.cc |
| @@ -5108,13 +5108,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler, |
| return TextNode::CreateForCharacterRanges(zone, range, false, on_success); |
| } |
| - |
| -void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
| - ZoneList<CharacterRange>* ranges) { |
| +void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) { |
| #ifdef V8_I18N_SUPPORT |
| // Use ICU to compute the case fold closure over the ranges. |
| - DCHECK(compiler->unicode()); |
| - DCHECK(compiler->ignore_case()); |
| icu::UnicodeSet set; |
| for (int i = 0; i < ranges->length(); i++) { |
| set.add(ranges->at(i).from(), ranges->at(i).to()); |
| @@ -5125,18 +5121,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
| // Those are represented as strings in the set. Remove them so that |
| // we end up with only simple and common case mappings. |
| set.removeAllStrings(); |
| - Zone* zone = compiler->zone(); |
| for (int i = 0; i < set.getRangeCount(); i++) { |
| ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)), |
| zone); |
| } |
| // No errors and everything we collected have been ranges. |
| -#else |
| - // Fallback if ICU is not included. |
| - CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(), |
| - ranges, compiler->one_byte()); |
| -#endif // V8_I18N_SUPPORT |
| CharacterRange::Canonicalize(ranges); |
| +#endif // V8_I18N_SUPPORT |
| } |
| @@ -5146,7 +5137,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
| Zone* zone = compiler->zone(); |
| ZoneList<CharacterRange>* ranges = this->ranges(zone); |
| if (compiler->unicode() && compiler->ignore_case()) { |
| - AddUnicodeCaseEquivalents(compiler, ranges); |
| + AddUnicodeCaseEquivalents(ranges, zone); |
| } |
| if (compiler->unicode() && !compiler->one_byte()) { |
| if (is_negated()) { |
| @@ -5619,6 +5610,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min, |
| } |
| } |
| +namespace { |
| +// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and |
| +// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W) |
|
jgruber
2017/02/28 13:44:56
Nit: Please swap the group order of \B to make the
Yang
2017/02/28 14:26:11
Done.
|
| +RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler, |
| + RegExpNode* on_success, |
| + RegExpAssertion::AssertionType type) { |
| + DCHECK(compiler->unicode() && compiler->ignore_case()); |
| + Zone* zone = compiler->zone(); |
| + ZoneList<CharacterRange>* word_range = |
| + new (zone) ZoneList<CharacterRange>(2, zone); |
| + CharacterRange::AddClassEscape('w', word_range, true, zone); |
| + int stack_register = compiler->UnicodeLookaroundStackRegister(); |
| + int position_register = compiler->UnicodeLookaroundPositionRegister(); |
| + ChoiceNode* result = new (zone) ChoiceNode(2, zone); |
| + // Add two choices. The (non-)boundary could start with a word character |
| + // or a non-word character. |
| + for (int i = 0; i < 2; i++) { |
| + bool lookbehind_for_word = i == 0; |
| + bool lookahead_for_word = |
| + (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word; |
| + // Look to the left. |
| + RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success, |
| + stack_register, position_register); |
| + RegExpNode* backward = TextNode::CreateForCharacterRanges( |
| + zone, word_range, true, lookbehind.on_match_success()); |
| + // Look to the right. |
| + RegExpLookaround::Builder lookahead(lookahead_for_word, |
| + lookbehind.ForMatch(backward), |
| + stack_register, position_register); |
| + RegExpNode* forward = TextNode::CreateForCharacterRanges( |
| + zone, word_range, false, lookahead.on_match_success()); |
| + result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward))); |
| + } |
| + return result; |
| +} |
| +} // anonymous namespace |
| RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
| RegExpNode* on_success) { |
| @@ -5631,9 +5658,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
| case START_OF_INPUT: |
| return AssertionNode::AtStart(on_success); |
| case BOUNDARY: |
| - return AssertionNode::AtBoundary(on_success); |
| + return compiler->unicode() && compiler->ignore_case() |
|
jgruber
2017/02/28 13:44:56
WDYT about adding compiler->needs_unicode_case_equ
Yang
2017/02/28 14:26:11
Done.
|
| + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY) |
| + : AssertionNode::AtBoundary(on_success); |
| case NON_BOUNDARY: |
| - return AssertionNode::AtNonBoundary(on_success); |
| + return compiler->unicode() && compiler->ignore_case() |
| + ? BoundaryAssertionAsLookaround(compiler, on_success, |
| + NON_BOUNDARY) |
| + : AssertionNode::AtNonBoundary(on_success); |
| case END_OF_INPUT: |
| return AssertionNode::AtEnd(on_success); |
| case END_OF_LINE: { |
| @@ -5647,7 +5679,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler, |
| // Create a newline atom. |
| ZoneList<CharacterRange>* newline_ranges = |
| new(zone) ZoneList<CharacterRange>(3, zone); |
| - CharacterRange::AddClassEscape('n', newline_ranges, zone); |
| + CharacterRange::AddClassEscape('n', newline_ranges, false, zone); |
| RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n'); |
| TextNode* newline_matcher = new (zone) TextNode( |
| newline_atom, false, ActionNode::PositiveSubmatchSuccess( |
| @@ -5821,10 +5853,27 @@ static void AddClassNegated(const int *elmv, |
| ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone); |
| } |
| - |
| -void CharacterRange::AddClassEscape(uc16 type, |
| - ZoneList<CharacterRange>* ranges, |
| +void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges, |
| + bool add_unicode_case_equivalents, |
| Zone* zone) { |
| + if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) { |
| + // See #sec-runtime-semantics-wordcharacters-abstract-operation |
| + // In case of unicode and ignore_case, we need to create the closure over |
| + // case equivalent characters before negating. |
| + ZoneList<CharacterRange>* new_ranges = |
| + new (zone) ZoneList<CharacterRange>(2, zone); |
| + AddClass(kWordRanges, kWordRangeCount, new_ranges, zone); |
| + AddUnicodeCaseEquivalents(new_ranges, zone); |
| + if (type == 'W') { |
| + ZoneList<CharacterRange>* negated = |
| + new (zone) ZoneList<CharacterRange>(2, zone); |
| + CharacterRange::Negate(new_ranges, negated, zone); |
| + new_ranges = negated; |
| + } |
| + ranges->AddAll(*new_ranges, zone); |
| + return; |
| + } |
| + |
| switch (type) { |
| case 's': |
| AddClass(kSpaceRanges, kSpaceRangeCount, ranges, zone); |
| @@ -5965,7 +6014,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) { |
| ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) { |
| if (ranges_ == NULL) { |
| ranges_ = new(zone) ZoneList<CharacterRange>(2, zone); |
| - CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone); |
| + CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone); |
| } |
| return ranges_; |
| } |