src/jsregexp.cc - Issue 1172893002: Revert of Reland of 'Optimize trivial regexp disjunctions' CL 1176453002

Unified Diff: src/jsregexp.cc

Issue 1172893002: Revert of Reland of 'Optimize trivial regexp disjunctions' CL 1176453002 (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/jsregexp.cc

diff --git a/src/jsregexp.cc b/src/jsregexp.cc

index e284e8cb15f3233eee2533bc9efda8b6735bd1e3..b410d47b3d1fd41d93dd5d26cf8386050375745e 100644

--- a/src/jsregexp.cc

+++ b/src/jsregexp.cc

@@ -4817,200 +4817,10 @@

}

-int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {

- RegExpAtom* atom1 = (*a)->AsAtom();

- RegExpAtom* atom2 = (*b)->AsAtom();

- uc16 character1 = atom1->data().at(0);

- uc16 character2 = atom2->data().at(0);

- if (character1 < character2) return -1;

- if (character1 > character2) return 1;

- return 0;

-// We can stable sort runs of atoms, since the order does not matter if they

-// start with different characters.

-// Returns true if any consecutive atoms were found.

-bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

- ZoneList<RegExpTree*>* alternatives = this->alternatives();

- int length = alternatives->length();

- bool found_consecutive_atoms = false;

- for (int i = 0; i < length; i++) {

- while (i < length) {

- RegExpTree* alternative = alternatives->at(i);

- if (alternative->IsAtom()) break;

- i++;

- }

- // i is length or it is the index of an atom.

- if (i == length) break;

- int first_atom = i;

- i++;

- while (i < length) {

- RegExpTree* alternative = alternatives->at(i);

- if (!alternative->IsAtom()) break;

- i++;

- }

- // Sort atoms to get ones with common prefixes together.

- // This step is not valid if we are in a case-independent regexp,

- // because it would change /is|I/ to /I|is/, and order matters when

- // the regexp parts don't match only disjoint starting points. To fix

- // this would need a version of CompareFirstChar that uses case-

- // independent character classes for comparison.

- if (!compiler->ignore_case()) {

- DCHECK_LT(first_atom, alternatives->length());

- DCHECK_LE(i, alternatives->length());

- DCHECK_LE(first_atom, i);

- alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

- }

- if (i - first_atom > 1) found_consecutive_atoms = true;

- }

- return found_consecutive_atoms;

-// Optimizes ab|ac|az to a(?:b|c|d).

-void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

- Zone* zone = compiler->zone();

- ZoneList<RegExpTree*>* alternatives = this->alternatives();

- int length = alternatives->length();

- int write_posn = 0;

- int i = 0;

- while (i < length) {

- RegExpTree* alternative = alternatives->at(i);

- if (!alternative->IsAtom()) {

- alternatives->at(write_posn++) = alternatives->at(i);

- i++;

- continue;

- }

- RegExpAtom* atom = alternative->AsAtom();

- uc16 common_prefix = atom->data().at(0);

- int first_with_prefix = i;

- int prefix_length = atom->length();

- i++;

- while (i < length) {

- alternative = alternatives->at(i);

- if (!alternative->IsAtom()) break;

- atom = alternative->AsAtom();

- if (atom->data().at(0) != common_prefix) break;

- prefix_length = Min(prefix_length, atom->length());

- i++;

- }

- if (i > first_with_prefix + 2) {

- // Found worthwhile run of alternatives with common prefix of at least one

- // character. The sorting function above did not sort on more than one

- // character for reasons of correctness, but there may still be a longer

- // common prefix if the terms were similar or presorted in the input.

- // Find out how long the common prefix is.

- int run_length = i - first_with_prefix;

- atom = alternatives->at(first_with_prefix)->AsAtom();

- for (int j = 1; j < run_length && prefix_length > 1; j++) {

- RegExpAtom* old_atom =

- alternatives->at(j + first_with_prefix)->AsAtom();

- for (int k = 1; k < prefix_length; k++) {

- if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;

- }

- RegExpAtom* prefix =

- new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

- ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);

- pair->Add(prefix, zone);

- ZoneList<RegExpTree*>* suffixes =

- new (zone) ZoneList<RegExpTree*>(run_length, zone);

- for (int j = 0; j < run_length; j++) {

- RegExpAtom* old_atom =

- alternatives->at(j + first_with_prefix)->AsAtom();

- int len = old_atom->length();

- if (len == prefix_length) {

- suffixes->Add(new (zone) RegExpEmpty(), zone);

- } else {

- RegExpTree* suffix = new (zone) RegExpAtom(

- old_atom->data().SubVector(prefix_length, old_atom->length()));

- suffixes->Add(suffix, zone);

- }

- pair->Add(new (zone) RegExpDisjunction(suffixes), zone);

- alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);

- } else {

- // Just copy any non-worthwhile alternatives.

- for (int j = first_with_prefix; j < i; j++) {

- alternatives->at(write_posn++) = alternatives->at(j);

- }

- alternatives->Rewind(write_posn); // Trim end of array.

-// Optimizes b|c|z to [bcz].

-void RegExpDisjunction::FixSingleCharacterDisjunctions(

- RegExpCompiler* compiler) {

- Zone* zone = compiler->zone();

- ZoneList<RegExpTree*>* alternatives = this->alternatives();

- int length = alternatives->length();

- int write_posn = 0;

- int i = 0;

- while (i < length) {

- RegExpTree* alternative = alternatives->at(i);

- if (!alternative->IsAtom()) {

- alternatives->at(write_posn++) = alternatives->at(i);

- i++;

- continue;

- }

- RegExpAtom* atom = alternative->AsAtom();

- if (atom->length() != 1) {

- alternatives->at(write_posn++) = alternatives->at(i);

- i++;

- continue;

- }

- int first_in_run = i;

- i++;

- while (i < length) {

- alternative = alternatives->at(i);

- if (!alternative->IsAtom()) break;

- atom = alternative->AsAtom();

- if (atom->length() != 1) break;

- i++;

- }

- if (i > first_in_run + 1) {

- // Found non-trivial run of single-character alternatives.

- int run_length = i - first_in_run;

- ZoneList<CharacterRange>* ranges =

- new (zone) ZoneList<CharacterRange>(2, zone);

- for (int j = 0; j < run_length; j++) {

- RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();

- DCHECK_EQ(old_atom->length(), 1);

- ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);

- }

- alternatives->at(write_posn++) =

- new (zone) RegExpCharacterClass(ranges, false);

- } else {

- // Just copy any trivial alternatives.

- for (int j = first_in_run; j < i; j++) {

- alternatives->at(write_posn++) = alternatives->at(j);

- }

- alternatives->Rewind(write_posn); // Trim end of array.

RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

RegExpNode* on_success) {

ZoneList<RegExpTree*>* alternatives = this->alternatives();

- if (alternatives->length() > 2) {

- bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);

- if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);

- FixSingleCharacterDisjunctions(compiler);

- if (alternatives->length() == 1) {

- return alternatives->at(0)->ToNode(compiler, on_success);

- }

int length = alternatives->length();

ChoiceNode* result =

new(compiler->zone()) ChoiceNode(length, compiler->zone());

for (int i = 0; i < length; i++) {

« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »