src/jsregexp.cc - Issue 1182783009: Extend big-disjunction optimization to case-independent regexps

Unified Diff: src/jsregexp.cc

Issue 1182783009: Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Fix 80 columns in test (created 5 years, 6 months ago)

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index e284e8cb15f3233eee2533bc9efda8b6735bd1e3..49a2998fed5c7d7222a36c6b9da8ad553921ea90 100644

--- a/src/jsregexp.cc

+++ b/src/jsregexp.cc

@@ -4828,6 +4828,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {

}

+static unibrow::uchar Canonical(

+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

+ unibrow::uchar c) {

+ unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];

+ int length = canonicalize->get(c, '\0', chars);

+ DCHECK_LE(length, 1);

+ unibrow::uchar canonical = c;

+ if (length == 1) canonical = chars[0];

+ return canonical;

+}

+int CompareFirstCharCaseIndependent(

+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

+ RegExpTree* const* a, RegExpTree* const* b) {

+ RegExpAtom* atom1 = (*a)->AsAtom();

+ RegExpAtom* atom2 = (*b)->AsAtom();

+ unibrow::uchar character1 = atom1->data().at(0);

+ unibrow::uchar character2 = atom2->data().at(0);

+ if (character1 == character2) return 0;

+ if (character1 >= 'a' || character2 >= 'a') {

+ character1 = Canonical(canonicalize, character1);

+ character2 = Canonical(canonicalize, character2);

+ }

+ return static_cast<int>(character1) - static_cast<int>(character2);

+}

// We can stable sort runs of atoms, since the order does not matter if they

// start with different characters.

// Returns true if any consecutive atoms were found.

@@ -4851,15 +4879,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

i++;

}

// Sort atoms to get ones with common prefixes together.

- // This step is not valid if we are in a case-independent regexp,

+ // This step is more tricky if we are in a case-independent regexp,

// because it would change /is|I/ to /I|is/, and order matters when

// the regexp parts don't match only disjoint starting points. To fix

- // this would need a version of CompareFirstChar that uses case-

+ // this we have a version of CompareFirstChar that uses case-

// independent character classes for comparison.

- if (!compiler->ignore_case()) {

- DCHECK_LT(first_atom, alternatives->length());

- DCHECK_LE(i, alternatives->length());

- DCHECK_LE(first_atom, i);

+ DCHECK_LT(first_atom, alternatives->length());

+ DCHECK_LE(i, alternatives->length());

+ DCHECK_LE(first_atom, i);

+ if (compiler->ignore_case()) {

+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

+ compiler->isolate()->regexp_macro_assembler_canonicalize();

+ auto compare_closure =

+ [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {

+ return CompareFirstCharCaseIndependent(canonicalize, a, b);

+ };

+ alternatives->StableSort(compare_closure, first_atom, i - first_atom);

+ } else {

alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

}

if (i - first_atom > 1) found_consecutive_atoms = true;

@@ -4884,7 +4920,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

continue;

}

RegExpAtom* atom = alternative->AsAtom();

- uc16 common_prefix = atom->data().at(0);

+ unibrow::uchar common_prefix = atom->data().at(0);

int first_with_prefix = i;

int prefix_length = atom->length();

i++;

@@ -4892,7 +4928,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

alternative = alternatives->at(i);

if (!alternative->IsAtom()) break;

atom = alternative->AsAtom();

- if (atom->data().at(0) != common_prefix) break;

+ unibrow::uchar new_prefix = atom->data().at(0);

+ if (new_prefix != common_prefix) {

+ if (!compiler->ignore_case()) break;

+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

+ compiler->isolate()->regexp_macro_assembler_canonicalize();

+ new_prefix = Canonical(canonicalize, new_prefix);

+ common_prefix = Canonical(canonicalize, common_prefix);

+ if (new_prefix != common_prefix) break;

+ }

prefix_length = Min(prefix_length, atom->length());

i++;

}

@@ -4908,7 +4952,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

RegExpAtom* old_atom =

alternatives->at(j + first_with_prefix)->AsAtom();

for (int k = 1; k < prefix_length; k++) {

- if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;

+ if (atom->data().at(k) != old_atom->data().at(k)) {

+ prefix_length = k;

+ break;

+ }

}

RegExpAtom* prefix =

« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »