src/regexp/jsregexp.cc - Issue 2813893002: [regexp] Consider surrogate pairs when optimizing disjunctions

Unified Diff: src/regexp/jsregexp.cc

Issue 2813893002: [regexp] Consider surrogate pairs when optimizing disjunctions (Closed)

Patch Set: DCHECK(!IsLeadSurrogate) Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Index: src/regexp/jsregexp.cc

diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc

index 8ab2681dcf0ef6fec4798bca9e58398770257b08..add1d2006cbba69f6172002f71899cc10a619ae4 100644

--- a/src/regexp/jsregexp.cc

+++ b/src/regexp/jsregexp.cc

@@ -3327,9 +3327,8 @@ TextNode* TextNode::CreateForCharacterRanges(Zone* zone,

RegExpNode* on_success) {

DCHECK_NOT_NULL(ranges);

ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);

- elms->Add(

- TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),

- zone);

+ elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),

+ zone);

return new (zone) TextNode(elms, read_backward, on_success);

}

@@ -3341,12 +3340,12 @@ TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,

ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);

ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);

ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);

- elms->Add(TextElement::CharClass(

- new (zone) RegExpCharacterClass(lead_ranges, false)),

- zone);

- elms->Add(TextElement::CharClass(

- new (zone) RegExpCharacterClass(trail_ranges, false)),

- zone);

+ elms->Add(

+ TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),

+ zone);

+ elms->Add(

+ TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),

+ zone);

return new (zone) TextNode(elms, read_backward, on_success);

}

@@ -4851,7 +4850,7 @@ static bool CompareRanges(ZoneList<CharacterRange>* ranges,

bool RegExpCharacterClass::is_standard(Zone* zone) {

// TODO(lrn): Remove need for this function, by not throwing away information

// along the way.

- if (is_negated_) {

+ if (is_negated()) {

return false;

}

if (set_.is_standard()) {

@@ -5144,7 +5143,8 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

if (compiler->needs_unicode_case_equivalents()) {

AddUnicodeCaseEquivalents(ranges, zone);

}

- if (compiler->unicode() && !compiler->one_byte()) {

+ if (compiler->unicode() && !compiler->one_byte() &&

+ !contains_split_surrogate()) {

if (is_negated()) {

ZoneList<CharacterRange>* negated =

new (zone) ZoneList<CharacterRange>(2, zone);

@@ -5154,7 +5154,7 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

if (ranges->length() == 0) {

ranges->Add(CharacterRange::Everything(), zone);

RegExpCharacterClass* fail =

- new (zone) RegExpCharacterClass(ranges, true);

+ new (zone) RegExpCharacterClass(ranges, NEGATED);

return new (zone) TextNode(fail, compiler->read_backward(), on_success);

}

if (standard_type() == '*') {

@@ -5368,6 +5368,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(

i++;

continue;

}

+ DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));

+ bool contains_trail_surrogate =

+ unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));

int first_in_run = i;

i++;

while (i < length) {

@@ -5375,6 +5378,9 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(

if (!alternative->IsAtom()) break;

atom = alternative->AsAtom();

if (atom->length() != 1) break;

+ DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));

+ contains_trail_surrogate |=

+ unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));

i++;

}

if (i > first_in_run + 1) {

@@ -5387,8 +5393,12 @@ void RegExpDisjunction::FixSingleCharacterDisjunctions(

DCHECK_EQ(old_atom->length(), 1);

ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);

}

+ RegExpCharacterClass::Flags flags;

+ if (compiler->unicode() && contains_trail_surrogate) {

+ flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;

+ }

alternatives->at(write_posn++) =

- new (zone) RegExpCharacterClass(ranges, false);

+ new (zone) RegExpCharacterClass(ranges, flags);

} else {

// Just copy any trivial alternatives.

for (int j = first_in_run; j < i; j++) {

« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »