| Index: src/regexp/jsregexp.cc
|
| diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc
|
| index fbbd3ecda29efcda0e3e72193d9bb39a2dd2e3a3..4e33e4ac7807849d1b137675bb400d88acf9626b 100644
|
| --- a/src/regexp/jsregexp.cc
|
| +++ b/src/regexp/jsregexp.cc
|
| @@ -1019,6 +1019,11 @@ class RegExpCompiler {
|
|
|
| inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }
|
| inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }
|
| + // True iff both the unicode and ignore_case flags are set, in which case
|
| + // the closure over case equivalents has to be computed using ICU.
|
| + inline bool needs_unicode_case_equivalents() {
|
| + return unicode() && ignore_case();
|
| + }
|
| inline bool one_byte() { return one_byte_; }
|
| inline bool optimize() { return optimize_; }
|
| inline void set_optimize(bool value) { optimize_ = value; }
|
| @@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
|
| return TextNode::CreateForCharacterRanges(zone, range, false, on_success);
|
| }
|
|
|
| -
|
| -void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
|
| - ZoneList<CharacterRange>* ranges) {
|
| +void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {
|
| #ifdef V8_I18N_SUPPORT
|
| // Use ICU to compute the case fold closure over the ranges.
|
| - DCHECK(compiler->unicode());
|
| - DCHECK(compiler->ignore_case());
|
| icu::UnicodeSet set;
|
| for (int i = 0; i < ranges->length(); i++) {
|
| set.add(ranges->at(i).from(), ranges->at(i).to());
|
| @@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
|
| // Those are represented as strings in the set. Remove them so that
|
| // we end up with only simple and common case mappings.
|
| set.removeAllStrings();
|
| - Zone* zone = compiler->zone();
|
| for (int i = 0; i < set.getRangeCount(); i++) {
|
| ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),
|
| zone);
|
| }
|
| // No errors and everything we collected have been ranges.
|
| -#else
|
| - // Fallback if ICU is not included.
|
| - CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),
|
| - ranges, compiler->one_byte());
|
| -#endif // V8_I18N_SUPPORT
|
| CharacterRange::Canonicalize(ranges);
|
| +#endif // V8_I18N_SUPPORT
|
| }
|
|
|
|
|
| @@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
|
| set_.Canonicalize();
|
| Zone* zone = compiler->zone();
|
| ZoneList<CharacterRange>* ranges = this->ranges(zone);
|
| - if (compiler->unicode() && compiler->ignore_case()) {
|
| - AddUnicodeCaseEquivalents(compiler, ranges);
|
| + if (compiler->needs_unicode_case_equivalents()) {
|
| + AddUnicodeCaseEquivalents(ranges, zone);
|
| }
|
| if (compiler->unicode() && !compiler->one_byte()) {
|
| if (is_negated()) {
|
| @@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min,
|
| }
|
| }
|
|
|
| +namespace {
|
| +// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and
|
| +// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W), with \w closed over case equivalents.
|
| +RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,
|
| + RegExpNode* on_success,
|
| + RegExpAssertion::AssertionType type) {
|
| + DCHECK(compiler->needs_unicode_case_equivalents());
|
| + Zone* zone = compiler->zone();
|
| + ZoneList<CharacterRange>* word_range =
|
| + new (zone) ZoneList<CharacterRange>(2, zone);
|
| + CharacterRange::AddClassEscape('w', word_range, true, zone);
|
| + int stack_register = compiler->UnicodeLookaroundStackRegister();
|
| + int position_register = compiler->UnicodeLookaroundPositionRegister();
|
| + ChoiceNode* result = new (zone) ChoiceNode(2, zone);
|
| + // Add two alternatives: look behind for a word character (i == 0) or a
|
| + // non-word one (i == 1); \b then wants the opposite ahead, \B the same.
|
| + for (int i = 0; i < 2; i++) {
|
| + bool lookbehind_for_word = i == 0;
|
| + bool lookahead_for_word =
|
| + (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;
|
| + // Look to the left; on success, continue with on_success.
|
| + RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,
|
| + stack_register, position_register);
|
| + RegExpNode* backward = TextNode::CreateForCharacterRanges(
|
| + zone, word_range, true, lookbehind.on_match_success());
|
| + // Look to the right; on success, continue with the lookbehind above.
|
| + RegExpLookaround::Builder lookahead(lookahead_for_word,
|
| + lookbehind.ForMatch(backward),
|
| + stack_register, position_register);
|
| + RegExpNode* forward = TextNode::CreateForCharacterRanges(
|
| + zone, word_range, false, lookahead.on_match_success());
|
| + result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));
|
| + }
|
| + return result;
|
| +}
|
| +} // anonymous namespace
|
|
|
| RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
|
| RegExpNode* on_success) {
|
| @@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
|
| case START_OF_INPUT:
|
| return AssertionNode::AtStart(on_success);
|
| case BOUNDARY:
|
| - return AssertionNode::AtBoundary(on_success);
|
| + return compiler->needs_unicode_case_equivalents()
|
| + ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)
|
| + : AssertionNode::AtBoundary(on_success);
|
| case NON_BOUNDARY:
|
| - return AssertionNode::AtNonBoundary(on_success);
|
| + return compiler->needs_unicode_case_equivalents()
|
| + ? BoundaryAssertionAsLookaround(compiler, on_success,
|
| + NON_BOUNDARY)
|
| + : AssertionNode::AtNonBoundary(on_success);
|
| case END_OF_INPUT:
|
| return AssertionNode::AtEnd(on_success);
|
| case END_OF_LINE: {
|
| @@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,
|
| // Create a newline atom.
|
| ZoneList<CharacterRange>* newline_ranges =
|
| new(zone) ZoneList<CharacterRange>(3, zone);
|
| - CharacterRange::AddClassEscape('n', newline_ranges, zone);
|
| + CharacterRange::AddClassEscape('n', newline_ranges, false, zone);
|
| RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');
|
| TextNode* newline_matcher = new (zone) TextNode(
|
| newline_atom, false, ActionNode::PositiveSubmatchSuccess(
|
| @@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv,
|
| ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);
|
| }
|
|
|
| +void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
|
| + bool add_unicode_case_equivalents,
|
| + Zone* zone) {
|
| + if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {
|
| + // See #sec-runtime-semantics-wordcharacters-abstract-operation
|
| + // With unicode and ignore_case, the closure over case-equivalent
|
| + // characters must be computed before negating, so \W excludes them too.
|
| + ZoneList<CharacterRange>* new_ranges =
|
| + new (zone) ZoneList<CharacterRange>(2, zone);
|
| + AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);
|
| + AddUnicodeCaseEquivalents(new_ranges, zone);
|
| + if (type == 'W') {
|
| + ZoneList<CharacterRange>* negated =
|
| + new (zone) ZoneList<CharacterRange>(2, zone);
|
| + CharacterRange::Negate(new_ranges, negated, zone);
|
| + new_ranges = negated;
|
| + }
|
| + ranges->AddAll(*new_ranges, zone);
|
| + return;
|
| + }
|
| + AddClassEscape(type, ranges, zone);
|
| +}
|
|
|
| -void CharacterRange::AddClassEscape(uc16 type,
|
| - ZoneList<CharacterRange>* ranges,
|
| +void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,
|
| Zone* zone) {
|
| switch (type) {
|
| case 's':
|
| @@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {
|
| ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {
|
| if (ranges_ == NULL) {
|
| ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);
|
| - CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);
|
| + CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);
|
| }
|
| return ranges_;
|
| }
|
|
|