Chromium Code Reviews| Index: src/jsregexp.cc |
| diff --git a/src/jsregexp.cc b/src/jsregexp.cc |
| index b410d47b3d1fd41d93dd5d26cf8386050375745e..edfd065437ee7434486dd77437551d38eb0f3a78 100644 |
| --- a/src/jsregexp.cc |
| +++ b/src/jsregexp.cc |
| @@ -4817,10 +4817,192 @@ RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
| } |
| +int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { |
| + RegExpAtom* atom1 = (*a)->AsAtom(); |
| + RegExpAtom* atom2 = (*b)->AsAtom(); |
| + uc16 character1 = atom1->data().at(0); |
| + uc16 character2 = atom2->data().at(0); |
| + if (character1 < character2) return -1; |
| + if (character1 > character2) return 1; |
| + return 0; |
| +} |
| + |
| + |
| +// We can stable sort runs of atoms of at least length 0, since the order |
| +// does not matter. |
| +// Returns true if any consecutive atoms were found. |
| +bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { |
| + ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
| + int length = alternatives->length(); |
| + bool found_consecutive_atoms = false; |
| + for (int i = 0; i < length; i++) { |
| + while (i < length) { |
| + RegExpTree* alternative = alternatives->at(i); |
| + if (alternative->IsAtom() && alternative->AsAtom()->length() > 0) break; |
| + i++; |
| + } |
| + // i is length or it is the index of an atom. |
| + int first_atom = i; |
| + i++; |
| + while (i < length) { |
| + RegExpTree* alternative = alternatives->at(i); |
| + if (!alternative->IsAtom()) break; |
| + if (alternative->AsAtom()->length() == 0) break; |
| + i++; |
| + } |
| + // Sort atoms to get ones with common prefixes together. |
| + // This step is not valid if we are in a case-independent regexp, |
| + // because it would change /is|I/ to /I|is/, and order matters when |
| + // the regexp parts don't match only disjoint starting points. |
| + if (!compiler->ignore_case()) { |
| + alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); |
| + } |
| + if (i - first_atom > 1) found_consecutive_atoms = true; |
| + } |
| + return found_consecutive_atoms; |
| +} |
| + |
| + |
| +// Optimizes ab|ac|az to a(?:b|c|d). |
| +void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { |
| + Zone* zone = compiler->zone(); |
| + ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
| + int length = alternatives->length(); |
| + |
| + int write_posn = 0; |
| + int i = 0; |
| + while (i < length) { |
| + RegExpTree* alternative = alternatives->at(i); |
| + if (!alternative->IsAtom()) { |
| + alternatives->at(write_posn++) = alternatives->at(i); |
| + i++; |
| + continue; |
| + } |
| + RegExpAtom* atom = alternative->AsAtom(); |
| + if (atom->length() < 1) { |
| + alternatives->at(write_posn++) = alternatives->at(i); |
| + i++; |
| + continue; |
| + } |
| + uc16 common_prefix = atom->data().at(0); |
| + int first_with_prefix = i; |
| + i++; |
| + while (i < length) { |
| + alternative = alternatives->at(i); |
| + if (!alternative->IsAtom()) break; |
| + atom = alternative->AsAtom(); |
| + if (atom->length() < 1) break; |
| + if (atom->data().at(0) != common_prefix) break; |
| + i++; |
| + } |
| + if (i > first_with_prefix + 2) { |
| + // Found worthwhile run of alternatives with common prefix. |
| + int run_length = i - first_with_prefix; |
| + ZoneList<uc16>* prefix_list = new (zone) ZoneList<uc16>(1, zone); |
| + prefix_list->Add(common_prefix, zone); |
| + RegExpAtom* prefix = new (zone) RegExpAtom(prefix_list->ToConstVector()); |
| + ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); |
| + pair->Add(prefix, zone); |
| + ZoneList<RegExpTree*>* suffixes = |
| + new (zone) ZoneList<RegExpTree*>(run_length, zone); |
| + for (int j = 0; j < run_length; j++) { |
| + RegExpAtom* old_atom = |
| + alternatives->at(j + first_with_prefix)->AsAtom(); |
| + int len = old_atom->length(); |
| + if (len == 0) { |
| + suffixes->Add(new (zone) RegExpEmpty(), zone); |
| + } else { |
| + ZoneList<uc16>* suffix_list = |
| + new (zone) ZoneList<uc16>(len - 1, zone); |
| + for (int k = 1; k < len; k++) |
| + suffix_list->Add(old_atom->data().at(k), zone); |
| + RegExpTree* suffix = |
| + new (zone) RegExpAtom(suffix_list->ToConstVector()); |
|
Yang
2015/06/09 10:59:35
We could just use old_atom->data()->SubVector(1, l
|
| + suffixes->Add(suffix, zone); |
| + } |
| + } |
| + pair->Add(new (zone) RegExpDisjunction(suffixes), zone); |
| + alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair); |
| + } else { |
| + // Just copy any non-worthwhile alternatives. |
| + for (int j = first_with_prefix; j < i; j++) { |
| + alternatives->at(write_posn++) = alternatives->at(j); |
| + } |
| + } |
| + } |
| + alternatives->Rewind(write_posn); // Trim end of array. |
| +} |
| + |
| + |
| +// Optimizes b|c|z to [bcz]. |
| +void RegExpDisjunction::FixSingleCharacterDisjunctions( |
| + RegExpCompiler* compiler) { |
| + Zone* zone = compiler->zone(); |
| + ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
| + int length = alternatives->length(); |
| + |
| + int write_posn = 0; |
| + int i = 0; |
| + while (i < length) { |
| + RegExpTree* alternative = alternatives->at(i); |
| + if (!alternative->IsAtom()) { |
| + alternatives->at(write_posn++) = alternatives->at(i); |
| + i++; |
| + continue; |
| + } |
| + RegExpAtom* atom = alternative->AsAtom(); |
| + if (atom->length() != 1) { |
| + alternatives->at(write_posn++) = alternatives->at(i); |
| + i++; |
| + continue; |
| + } |
| + int first_in_run = i; |
| + i++; |
| + while (i < length) { |
| + alternative = alternatives->at(i); |
| + if (!alternative->IsAtom()) break; |
| + atom = alternative->AsAtom(); |
| + if (atom->length() != 1) break; |
| + i++; |
| + } |
| + if (i > first_in_run + 1) { |
| + // Found non-trivial run of single-character alternatives. |
| + int run_length = i - first_in_run; |
| + ZoneList<CharacterRange>* ranges = |
| + new (zone) ZoneList<CharacterRange>(2, zone); |
| + for (int j = 0; j < run_length; j++) { |
| + RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); |
| + DCHECK_EQ(old_atom->length(), 1); |
| + ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); |
| + } |
| + alternatives->at(write_posn++) = |
| + new (zone) RegExpCharacterClass(ranges, false); |
| + } else { |
| + // Just copy any trivial alternatives. |
| + for (int j = first_in_run; j < i; j++) { |
| + alternatives->at(write_posn++) = alternatives->at(j); |
| + } |
| + } |
| + } |
| + alternatives->Rewind(write_posn); // Trim end of array. |
| +} |
| + |
| + |
| RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, |
| RegExpNode* on_success) { |
| ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
| + |
| + if (alternatives->length() > 2) { |
| + bool found_consecutive_atoms = SortConsecutiveAtoms(compiler); |
| + if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler); |
| + FixSingleCharacterDisjunctions(compiler); |
| + if (alternatives->length() == 1) { |
| + return alternatives->at(0)->ToNode(compiler, on_success); |
|
Yang
2015/06/09 10:59:35
I guess for something like aaaaaaaaaaaaaaab|aaaaaa
|
| + } |
| + } |
| + |
| int length = alternatives->length(); |
| + |
| ChoiceNode* result = |
| new(compiler->zone()) ChoiceNode(length, compiler->zone()); |
| for (int i = 0; i < length; i++) { |