| Index: src/jsregexp.cc
|
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc
|
| index e284e8cb15f3233eee2533bc9efda8b6735bd1e3..49a2998fed5c7d7222a36c6b9da8ad553921ea90 100644
|
| --- a/src/jsregexp.cc
|
| +++ b/src/jsregexp.cc
|
| @@ -4828,6 +4828,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
|
| }
|
|
|
|
|
| +static unibrow::uchar Canonical(
|
| + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
|
| + unibrow::uchar c) {
|
| + unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
|
| + int length = canonicalize->get(c, '\0', chars);
|
| + DCHECK_LE(length, 1);
|
| + unibrow::uchar canonical = c;
|
| + if (length == 1) canonical = chars[0];
|
| + return canonical;
|
| +}
|
| +
|
| +
|
| +int CompareFirstCharCaseIndependent(
|
| + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
|
| + RegExpTree* const* a, RegExpTree* const* b) {
|
| + RegExpAtom* atom1 = (*a)->AsAtom();
|
| + RegExpAtom* atom2 = (*b)->AsAtom();
|
| + unibrow::uchar character1 = atom1->data().at(0);
|
| + unibrow::uchar character2 = atom2->data().at(0);
|
| + if (character1 == character2) return 0;
|
| + if (character1 >= 'a' || character2 >= 'a') {
|
| + character1 = Canonical(canonicalize, character1);
|
| + character2 = Canonical(canonicalize, character2);
|
| + }
|
| + return static_cast<int>(character1) - static_cast<int>(character2);
|
| +}
|
| +
|
| +
|
| // We can stable sort runs of atoms, since the order does not matter if they
|
| // start with different characters.
|
| // Returns true if any consecutive atoms were found.
|
| @@ -4851,15 +4879,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
|
| i++;
|
| }
|
| // Sort atoms to get ones with common prefixes together.
|
| - // This step is not valid if we are in a case-independent regexp,
|
| + // This step is more tricky if we are in a case-independent regexp,
|
| // because it would change /is|I/ to /I|is/, and order matters when
|
| // the regexp parts don't match only disjoint starting points. To fix
|
| - // this would need a version of CompareFirstChar that uses case-
|
| + // this we have a version of CompareFirstChar that uses case-
|
| // independent character classes for comparison.
|
| - if (!compiler->ignore_case()) {
|
| - DCHECK_LT(first_atom, alternatives->length());
|
| - DCHECK_LE(i, alternatives->length());
|
| - DCHECK_LE(first_atom, i);
|
| + DCHECK_LT(first_atom, alternatives->length());
|
| + DCHECK_LE(i, alternatives->length());
|
| + DCHECK_LE(first_atom, i);
|
| + if (compiler->ignore_case()) {
|
| + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
|
| + compiler->isolate()->regexp_macro_assembler_canonicalize();
|
| + auto compare_closure =
|
| + [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
|
| + return CompareFirstCharCaseIndependent(canonicalize, a, b);
|
| + };
|
| + alternatives->StableSort(compare_closure, first_atom, i - first_atom);
|
| + } else {
|
| alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
|
| }
|
| if (i - first_atom > 1) found_consecutive_atoms = true;
|
| @@ -4884,7 +4920,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
|
| continue;
|
| }
|
| RegExpAtom* atom = alternative->AsAtom();
|
| - uc16 common_prefix = atom->data().at(0);
|
| + unibrow::uchar common_prefix = atom->data().at(0);
|
| int first_with_prefix = i;
|
| int prefix_length = atom->length();
|
| i++;
|
| @@ -4892,7 +4928,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
|
| alternative = alternatives->at(i);
|
| if (!alternative->IsAtom()) break;
|
| atom = alternative->AsAtom();
|
| - if (atom->data().at(0) != common_prefix) break;
|
| + unibrow::uchar new_prefix = atom->data().at(0);
|
| + if (new_prefix != common_prefix) {
|
| + if (!compiler->ignore_case()) break;
|
| + unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
|
| + compiler->isolate()->regexp_macro_assembler_canonicalize();
|
| + new_prefix = Canonical(canonicalize, new_prefix);
|
| + common_prefix = Canonical(canonicalize, common_prefix);
|
| + if (new_prefix != common_prefix) break;
|
| + }
|
| prefix_length = Min(prefix_length, atom->length());
|
| i++;
|
| }
|
| @@ -4908,7 +4952,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
|
| RegExpAtom* old_atom =
|
| alternatives->at(j + first_with_prefix)->AsAtom();
|
| for (int k = 1; k < prefix_length; k++) {
|
| - if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
|
| + if (atom->data().at(k) != old_atom->data().at(k)) {
|
| + prefix_length = k;
|
| + break;
|
| + }
|
| }
|
| }
|
| RegExpAtom* prefix =
|
|
|