Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(380)

Unified Diff: src/jsregexp.cc

Issue 1182783009: Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Fix 80 columns in test Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index e284e8cb15f3233eee2533bc9efda8b6735bd1e3..49a2998fed5c7d7222a36c6b9da8ad553921ea90 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -4828,6 +4828,34 @@ int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
}
+static unibrow::uchar Canonical(
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ unibrow::uchar c) {
+ unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
+ int length = canonicalize->get(c, '\0', chars);
+ DCHECK_LE(length, 1);
+ unibrow::uchar canonical = c;
+ if (length == 1) canonical = chars[0];
+ return canonical;
+}
+
+
+int CompareFirstCharCaseIndependent(
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
+ RegExpTree* const* a, RegExpTree* const* b) {
+ RegExpAtom* atom1 = (*a)->AsAtom();
+ RegExpAtom* atom2 = (*b)->AsAtom();
+ unibrow::uchar character1 = atom1->data().at(0);
+ unibrow::uchar character2 = atom2->data().at(0);
+ if (character1 == character2) return 0;
+ if (character1 >= 'a' || character2 >= 'a') {
+ character1 = Canonical(canonicalize, character1);
+ character2 = Canonical(canonicalize, character2);
+ }
+ return static_cast<int>(character1) - static_cast<int>(character2);
+}
+
+
// We can stable sort runs of atoms, since the order does not matter if they
// start with different characters.
// Returns true if any consecutive atoms were found.
@@ -4851,15 +4879,23 @@ bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
i++;
}
// Sort atoms to get ones with common prefixes together.
- // This step is not valid if we are in a case-independent regexp,
+ // This step is more tricky if we are in a case-independent regexp,
// because it would change /is|I/ to /I|is/, and order matters when
// the regexp parts don't match only disjoint starting points. To fix
- // this would need a version of CompareFirstChar that uses case-
+ // this we have a version of CompareFirstChar that uses case-
// independent character classes for comparison.
- if (!compiler->ignore_case()) {
- DCHECK_LT(first_atom, alternatives->length());
- DCHECK_LE(i, alternatives->length());
- DCHECK_LE(first_atom, i);
+ DCHECK_LT(first_atom, alternatives->length());
+ DCHECK_LE(i, alternatives->length());
+ DCHECK_LE(first_atom, i);
+ if (compiler->ignore_case()) {
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ auto compare_closure =
+ [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
+ return CompareFirstCharCaseIndependent(canonicalize, a, b);
+ };
+ alternatives->StableSort(compare_closure, first_atom, i - first_atom);
+ } else {
alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
}
if (i - first_atom > 1) found_consecutive_atoms = true;
@@ -4884,7 +4920,7 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
continue;
}
RegExpAtom* atom = alternative->AsAtom();
- uc16 common_prefix = atom->data().at(0);
+ unibrow::uchar common_prefix = atom->data().at(0);
int first_with_prefix = i;
int prefix_length = atom->length();
i++;
@@ -4892,7 +4928,15 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
alternative = alternatives->at(i);
if (!alternative->IsAtom()) break;
atom = alternative->AsAtom();
- if (atom->data().at(0) != common_prefix) break;
+ unibrow::uchar new_prefix = atom->data().at(0);
+ if (new_prefix != common_prefix) {
+ if (!compiler->ignore_case()) break;
+ unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
+ compiler->isolate()->regexp_macro_assembler_canonicalize();
+ new_prefix = Canonical(canonicalize, new_prefix);
+ common_prefix = Canonical(canonicalize, common_prefix);
+ if (new_prefix != common_prefix) break;
+ }
prefix_length = Min(prefix_length, atom->length());
i++;
}
@@ -4908,7 +4952,10 @@ void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
RegExpAtom* old_atom =
alternatives->at(j + first_with_prefix)->AsAtom();
for (int k = 1; k < prefix_length; k++) {
- if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
+ if (atom->data().at(k) != old_atom->data().at(k)) {
+ prefix_length = k;
+ break;
+ }
}
}
RegExpAtom* prefix =
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698