src/regexp/jsregexp.cc - Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a.

Unified Diff: src/regexp/jsregexp.cc

Issue 2725583002: [regexp] fix /\W/ui wrt \u017f and \u212a. (Closed)

Patch Set: address comments Created 3 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/regexp/jsregexp.cc

diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc

index fbbd3ecda29efcda0e3e72193d9bb39a2dd2e3a3..4e33e4ac7807849d1b137675bb400d88acf9626b 100644

--- a/src/regexp/jsregexp.cc

+++ b/src/regexp/jsregexp.cc

@@ -1019,6 +1019,11 @@ class RegExpCompiler {

inline bool ignore_case() { return (flags_ & JSRegExp::kIgnoreCase) != 0; }

inline bool unicode() { return (flags_ & JSRegExp::kUnicode) != 0; }

+ // Both unicode and ignore_case flags are set. We need to use ICU to find

+ // the closure over case equivalents.

+ inline bool needs_unicode_case_equivalents() {

+ return unicode() && ignore_case();

+ }

inline bool one_byte() { return one_byte_; }

inline bool optimize() { return optimize_; }

inline void set_optimize(bool value) { optimize_ = value; }

@@ -5108,13 +5113,9 @@ RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,

return TextNode::CreateForCharacterRanges(zone, range, false, on_success);

}

-void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,

- ZoneList<CharacterRange>* ranges) {

+void AddUnicodeCaseEquivalents(ZoneList<CharacterRange>* ranges, Zone* zone) {

#ifdef V8_I18N_SUPPORT

// Use ICU to compute the case fold closure over the ranges.

- DCHECK(compiler->unicode());

- DCHECK(compiler->ignore_case());

icu::UnicodeSet set;

for (int i = 0; i < ranges->length(); i++) {

set.add(ranges->at(i).from(), ranges->at(i).to());

@@ -5125,18 +5126,13 @@ void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,

// Those are represented as strings in the set. Remove them so that

// we end up with only simple and common case mappings.

set.removeAllStrings();

- Zone* zone = compiler->zone();

for (int i = 0; i < set.getRangeCount(); i++) {

ranges->Add(CharacterRange::Range(set.getRangeStart(i), set.getRangeEnd(i)),

zone);

}

// No errors and everything we collected have been ranges.

-#else

- // Fallback if ICU is not included.

- CharacterRange::AddCaseEquivalents(compiler->isolate(), compiler->zone(),

- ranges, compiler->one_byte());

-#endif // V8_I18N_SUPPORT

CharacterRange::Canonicalize(ranges);

+#endif // V8_I18N_SUPPORT

}

@@ -5145,8 +5141,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

set_.Canonicalize();

Zone* zone = compiler->zone();

ZoneList<CharacterRange>* ranges = this->ranges(zone);

- if (compiler->unicode() && compiler->ignore_case()) {

- AddUnicodeCaseEquivalents(compiler, ranges);

+ if (compiler->needs_unicode_case_equivalents()) {

+ AddUnicodeCaseEquivalents(ranges, zone);

}

if (compiler->unicode() && !compiler->one_byte()) {

if (is_negated()) {

@@ -5619,6 +5615,42 @@ RegExpNode* RegExpQuantifier::ToNode(int min,

}

+namespace {

+// Desugar \b to (?<=\w)(?=\W)|(?<=\W)(?=\w) and

+// \B to (?<=\w)(?=\w)|(?<=\W)(?=\W)

+RegExpNode* BoundaryAssertionAsLookaround(RegExpCompiler* compiler,

+ RegExpNode* on_success,

+ RegExpAssertion::AssertionType type) {

+ DCHECK(compiler->needs_unicode_case_equivalents());

+ Zone* zone = compiler->zone();

+ ZoneList<CharacterRange>* word_range =

+ new (zone) ZoneList<CharacterRange>(2, zone);

+ CharacterRange::AddClassEscape('w', word_range, true, zone);

+ int stack_register = compiler->UnicodeLookaroundStackRegister();

+ int position_register = compiler->UnicodeLookaroundPositionRegister();

+ ChoiceNode* result = new (zone) ChoiceNode(2, zone);

+ // Add two choices. The (non-)boundary could start with a word or

+ // a non-word-character.

+ for (int i = 0; i < 2; i++) {

+ bool lookbehind_for_word = i == 0;

+ bool lookahead_for_word =

+ (type == RegExpAssertion::BOUNDARY) ^ lookbehind_for_word;

+ // Look to the left.

+ RegExpLookaround::Builder lookbehind(lookbehind_for_word, on_success,

+ stack_register, position_register);

+ RegExpNode* backward = TextNode::CreateForCharacterRanges(

+ zone, word_range, true, lookbehind.on_match_success());

+ // Look to the right.

+ RegExpLookaround::Builder lookahead(lookahead_for_word,

+ lookbehind.ForMatch(backward),

+ stack_register, position_register);

+ RegExpNode* forward = TextNode::CreateForCharacterRanges(

+ zone, word_range, false, lookahead.on_match_success());

+ result->AddAlternative(GuardedAlternative(lookahead.ForMatch(forward)));

+ }

+ return result;

+}

+} // anonymous namespace

RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

RegExpNode* on_success) {

@@ -5631,9 +5663,14 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

case START_OF_INPUT:

return AssertionNode::AtStart(on_success);

case BOUNDARY:

- return AssertionNode::AtBoundary(on_success);

+ return compiler->needs_unicode_case_equivalents()

+ ? BoundaryAssertionAsLookaround(compiler, on_success, BOUNDARY)

+ : AssertionNode::AtBoundary(on_success);

case NON_BOUNDARY:

- return AssertionNode::AtNonBoundary(on_success);

+ return compiler->needs_unicode_case_equivalents()

+ ? BoundaryAssertionAsLookaround(compiler, on_success,

+ NON_BOUNDARY)

+ : AssertionNode::AtNonBoundary(on_success);

case END_OF_INPUT:

return AssertionNode::AtEnd(on_success);

case END_OF_LINE: {

@@ -5647,7 +5684,7 @@ RegExpNode* RegExpAssertion::ToNode(RegExpCompiler* compiler,

// Create a newline atom.

ZoneList<CharacterRange>* newline_ranges =

new(zone) ZoneList<CharacterRange>(3, zone);

- CharacterRange::AddClassEscape('n', newline_ranges, zone);

+ CharacterRange::AddClassEscape('n', newline_ranges, false, zone);

RegExpCharacterClass* newline_atom = new (zone) RegExpCharacterClass('n');

TextNode* newline_matcher = new (zone) TextNode(

newline_atom, false, ActionNode::PositiveSubmatchSuccess(

@@ -5821,9 +5858,30 @@ static void AddClassNegated(const int *elmv,

ranges->Add(CharacterRange::Range(last, String::kMaxCodePoint), zone);

}

+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,

+ bool add_unicode_case_equivalents,

+ Zone* zone) {

+ if (add_unicode_case_equivalents && (type == 'w' || type == 'W')) {

+ // See #sec-runtime-semantics-wordcharacters-abstract-operation

+ // In case of unicode and ignore_case, we need to create the closure over

+ // case equivalent characters before negating.

+ ZoneList<CharacterRange>* new_ranges =

+ new (zone) ZoneList<CharacterRange>(2, zone);

+ AddClass(kWordRanges, kWordRangeCount, new_ranges, zone);

+ AddUnicodeCaseEquivalents(new_ranges, zone);

+ if (type == 'W') {

+ ZoneList<CharacterRange>* negated =

+ new (zone) ZoneList<CharacterRange>(2, zone);

+ CharacterRange::Negate(new_ranges, negated, zone);

+ new_ranges = negated;

+ }

+ ranges->AddAll(*new_ranges, zone);

+ return;

+ }

+ AddClassEscape(type, ranges, zone);

+}

-void CharacterRange::AddClassEscape(uc16 type,

- ZoneList<CharacterRange>* ranges,

+void CharacterRange::AddClassEscape(uc16 type, ZoneList<CharacterRange>* ranges,

Zone* zone) {

switch (type) {

case 's':

@@ -5965,7 +6023,7 @@ bool CharacterRange::IsCanonical(ZoneList<CharacterRange>* ranges) {

ZoneList<CharacterRange>* CharacterSet::ranges(Zone* zone) {

if (ranges_ == NULL) {

ranges_ = new(zone) ZoneList<CharacterRange>(2, zone);

- CharacterRange::AddClassEscape(standard_set_type_, ranges_, zone);

+ CharacterRange::AddClassEscape(standard_set_type_, ranges_, false, zone);

}

return ranges_;

}

« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »