Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(474)

Unified Diff: src/jsregexp.cc

Issue 1176453002: Optimize trivial regexp disjunctions (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
« no previous file with comments | « src/ast.h ('k') | src/list.h » ('j') | no next file with comments »
Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
Index: src/jsregexp.cc
diff --git a/src/jsregexp.cc b/src/jsregexp.cc
index b410d47b3d1fd41d93dd5d26cf8386050375745e..edfd065437ee7434486dd77437551d38eb0f3a78 100644
--- a/src/jsregexp.cc
+++ b/src/jsregexp.cc
@@ -4817,10 +4817,192 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
}
+int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
+ RegExpAtom* atom1 = (*a)->AsAtom();
+ RegExpAtom* atom2 = (*b)->AsAtom();
+ uc16 character1 = atom1->data().at(0);
+ uc16 character2 = atom2->data().at(0);
+ if (character1 < character2) return -1;
+ if (character1 > character2) return 1;
+ return 0;
+}
+
+
+// We can stable sort runs of atoms of at least length 0, since the order
+// does not matter.
+// Returns true if any consecutive atoms were found.
+bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+ bool found_consecutive_atoms = false;
+ for (int i = 0; i < length; i++) {
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (alternative->IsAtom() && alternative->AsAtom()->length() > 0) break;
+ i++;
+ }
+ // i is length or it is the index of an atom.
+ int first_atom = i;
+ i++;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ if (alternative->AsAtom()->length() == 0) break;
+ i++;
+ }
+ // Sort atoms to get ones with common prefixes together.
+ // This step is not valid if we are in a case-independent regexp,
+ // because it would change /is|I/ to /I|is/, and order matters when
+ // the regexp parts don't match only disjoint starting points.
+ if (!compiler->ignore_case()) {
+ alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
+ }
+ if (i - first_atom > 1) found_consecutive_atoms = true;
+ }
+ return found_consecutive_atoms;
+}
+
+
+// Optimizes ab|ac|az to a(?:b|c|d).
+void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
+ Zone* zone = compiler->zone();
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+
+ int write_posn = 0;
+ int i = 0;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ RegExpAtom* atom = alternative->AsAtom();
+ if (atom->length() < 1) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ uc16 common_prefix = atom->data().at(0);
+ int first_with_prefix = i;
+ i++;
+ while (i < length) {
+ alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ atom = alternative->AsAtom();
+ if (atom->length() < 1) break;
+ if (atom->data().at(0) != common_prefix) break;
+ i++;
+ }
+ if (i > first_with_prefix + 2) {
+ // Found worthwhile run of alternatives with common prefix.
+ int run_length = i - first_with_prefix;
+ ZoneList<uc16>* prefix_list = new (zone) ZoneList<uc16>(1, zone);
+ prefix_list->Add(common_prefix, zone);
+ RegExpAtom* prefix = new (zone) RegExpAtom(prefix_list->ToConstVector());
+ ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
+ pair->Add(prefix, zone);
+ ZoneList<RegExpTree*>* suffixes =
+ new (zone) ZoneList<RegExpTree*>(run_length, zone);
+ for (int j = 0; j < run_length; j++) {
+ RegExpAtom* old_atom =
+ alternatives->at(j + first_with_prefix)->AsAtom();
+ int len = old_atom->length();
+ if (len == 0) {
+ suffixes->Add(new (zone) RegExpEmpty(), zone);
+ } else {
+ ZoneList<uc16>* suffix_list =
+ new (zone) ZoneList<uc16>(len - 1, zone);
+ for (int k = 1; k < len; k++)
+ suffix_list->Add(old_atom->data().at(k), zone);
+ RegExpTree* suffix =
+ new (zone) RegExpAtom(suffix_list->ToConstVector());
Yang 2015/06/09 10:59:35 We could just use old_atom->data()->SubVector(1, l
+ suffixes->Add(suffix, zone);
+ }
+ }
+ pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
+ alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
+ } else {
+ // Just copy any non-worthwhile alternatives.
+ for (int j = first_with_prefix; j < i; j++) {
+ alternatives->at(write_posn++) = alternatives->at(j);
+ }
+ }
+ }
+ alternatives->Rewind(write_posn); // Trim end of array.
+}
+
+
+// Optimizes b|c|z to [bcz].
+void RegExpDisjunction::FixSingleCharacterDisjunctions(
+ RegExpCompiler* compiler) {
+ Zone* zone = compiler->zone();
+ ZoneList<RegExpTree*>* alternatives = this->alternatives();
+ int length = alternatives->length();
+
+ int write_posn = 0;
+ int i = 0;
+ while (i < length) {
+ RegExpTree* alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ RegExpAtom* atom = alternative->AsAtom();
+ if (atom->length() != 1) {
+ alternatives->at(write_posn++) = alternatives->at(i);
+ i++;
+ continue;
+ }
+ int first_in_run = i;
+ i++;
+ while (i < length) {
+ alternative = alternatives->at(i);
+ if (!alternative->IsAtom()) break;
+ atom = alternative->AsAtom();
+ if (atom->length() != 1) break;
+ i++;
+ }
+ if (i > first_in_run + 1) {
+ // Found non-trivial run of single-character alternatives.
+ int run_length = i - first_in_run;
+ ZoneList<CharacterRange>* ranges =
+ new (zone) ZoneList<CharacterRange>(2, zone);
+ for (int j = 0; j < run_length; j++) {
+ RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
+ DCHECK_EQ(old_atom->length(), 1);
+ ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
+ }
+ alternatives->at(write_posn++) =
+ new (zone) RegExpCharacterClass(ranges, false);
+ } else {
+ // Just copy any trivial alternatives.
+ for (int j = first_in_run; j < i; j++) {
+ alternatives->at(write_posn++) = alternatives->at(j);
+ }
+ }
+ }
+ alternatives->Rewind(write_posn); // Trim end of array.
+}
+
+
RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
RegExpNode* on_success) {
ZoneList<RegExpTree*>* alternatives = this->alternatives();
+
+ if (alternatives->length() > 2) {
+ bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
+ if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
+ FixSingleCharacterDisjunctions(compiler);
+ if (alternatives->length() == 1) {
+ return alternatives->at(0)->ToNode(compiler, on_success);
Yang 2015/06/09 10:59:35 I guess for something like aaaaaaaaaaaaaaab|aaaaaa
+ }
+ }
+
int length = alternatives->length();
+
ChoiceNode* result =
new(compiler->zone()) ChoiceNode(length, compiler->zone());
for (int i = 0; i < length; i++) {
« no previous file with comments | « src/ast.h ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698