Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(179)

Side by Side Diff: src/jsregexp.cc

Issue 1176453002: Optimize trivial regexp disjunctions (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Make test faster Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 4799 matching lines...) Expand 10 before | Expand all | Expand 10 after
4810 return false; 4810 return false;
4811 } 4811 }
4812 4812
4813 4813
4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4815 RegExpNode* on_success) { 4815 RegExpNode* on_success) {
4816 return new(compiler->zone()) TextNode(this, on_success); 4816 return new(compiler->zone()) TextNode(this, on_success);
4817 } 4817 }
4818 4818
4819 4819
4820 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4821 RegExpAtom* atom1 = (*a)->AsAtom();
4822 RegExpAtom* atom2 = (*b)->AsAtom();
4823 uc16 character1 = atom1->data().at(0);
4824 uc16 character2 = atom2->data().at(0);
4825 if (character1 < character2) return -1;
4826 if (character1 > character2) return 1;
4827 return 0;
4828 }
4829
4830
4831 // We can stable sort runs of atoms, since the order does not matter if they
4832 // start with different characters.
4833 // Returns true if any consecutive atoms were found.
4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4835 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4836 int length = alternatives->length();
4837 bool found_consecutive_atoms = false;
4838 for (int i = 0; i < length; i++) {
4839 while (i < length) {
4840 RegExpTree* alternative = alternatives->at(i);
4841 if (alternative->IsAtom()) break;
4842 i++;
4843 }
4844 // i is length or it is the index of an atom.
4845 int first_atom = i;
4846 i++;
4847 while (i < length) {
4848 RegExpTree* alternative = alternatives->at(i);
4849 if (!alternative->IsAtom()) break;
4850 i++;
4851 }
4852 // Sort atoms to get ones with common prefixes together.
4853 // This step is not valid if we are in a case-independent regexp,
4854 // because it would change /is|I/ to /I|is/, and order matters when
4855 // the regexp parts don't match only disjoint starting points.
4856 if (!compiler->ignore_case()) {
4857 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4858 }
4859 if (i - first_atom > 1) found_consecutive_atoms = true;
4860 }
4861 return found_consecutive_atoms;
4862 }
4863
4864
4865 // Optimizes ab|ac|az to a(?:b|c|d).
4866 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4867 Zone* zone = compiler->zone();
4868 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4869 int length = alternatives->length();
4870
4871 int write_posn = 0;
4872 int i = 0;
4873 while (i < length) {
4874 RegExpTree* alternative = alternatives->at(i);
4875 if (!alternative->IsAtom()) {
4876 alternatives->at(write_posn++) = alternatives->at(i);
4877 i++;
4878 continue;
4879 }
4880 RegExpAtom* atom = alternative->AsAtom();
4881 uc16 common_prefix = atom->data().at(0);
4882 int first_with_prefix = i;
4883 int prefix_length = atom->length();
4884 i++;
4885 while (i < length) {
4886 alternative = alternatives->at(i);
4887 if (!alternative->IsAtom()) break;
4888 atom = alternative->AsAtom();
4889 if (atom->data().at(0) != common_prefix) break;
4890 prefix_length = Min(prefix_length, atom->length());
4891 i++;
4892 }
4893 if (i > first_with_prefix + 2) {
4894 // Found worthwhile run of alternatives with common prefix of at least one
4895 // character. Find out how long the common prefix is.
4896 int run_length = i - first_with_prefix;
4897 atom = alternatives->at(first_with_prefix)->AsAtom();
4898 for (int j = 1; j < run_length && prefix_length > 1; j++) {
4899 RegExpAtom* old_atom =
4900 alternatives->at(j + first_with_prefix)->AsAtom();
4901 for (int k = 1; k < prefix_length; k++) {
4902 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
4903 }
4904 }
4905 RegExpAtom* prefix =
4906 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4907 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4908 pair->Add(prefix, zone);
4909 ZoneList<RegExpTree*>* suffixes =
4910 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4911 for (int j = 0; j < run_length; j++) {
4912 RegExpAtom* old_atom =
4913 alternatives->at(j + first_with_prefix)->AsAtom();
4914 int len = old_atom->length();
4915 if (len == prefix_length) {
4916 suffixes->Add(new (zone) RegExpEmpty(), zone);
4917 } else {
4918 RegExpTree* suffix = new (zone) RegExpAtom(
4919 old_atom->data().SubVector(prefix_length, old_atom->length()));
4920 suffixes->Add(suffix, zone);
4921 }
4922 }
4923 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
4924 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
4925 } else {
4926 // Just copy any non-worthwhile alternatives.
4927 for (int j = first_with_prefix; j < i; j++) {
4928 alternatives->at(write_posn++) = alternatives->at(j);
4929 }
4930 }
4931 }
4932 alternatives->Rewind(write_posn); // Trim end of array.
4933 }
4934
4935
4936 // Optimizes b|c|z to [bcz].
4937 void RegExpDisjunction::FixSingleCharacterDisjunctions(
4938 RegExpCompiler* compiler) {
4939 Zone* zone = compiler->zone();
4940 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4941 int length = alternatives->length();
4942
4943 int write_posn = 0;
4944 int i = 0;
4945 while (i < length) {
4946 RegExpTree* alternative = alternatives->at(i);
4947 if (!alternative->IsAtom()) {
4948 alternatives->at(write_posn++) = alternatives->at(i);
4949 i++;
4950 continue;
4951 }
4952 RegExpAtom* atom = alternative->AsAtom();
4953 if (atom->length() != 1) {
4954 alternatives->at(write_posn++) = alternatives->at(i);
4955 i++;
4956 continue;
4957 }
4958 int first_in_run = i;
4959 i++;
4960 while (i < length) {
4961 alternative = alternatives->at(i);
4962 if (!alternative->IsAtom()) break;
4963 atom = alternative->AsAtom();
4964 if (atom->length() != 1) break;
4965 i++;
4966 }
4967 if (i > first_in_run + 1) {
4968 // Found non-trivial run of single-character alternatives.
4969 int run_length = i - first_in_run;
4970 ZoneList<CharacterRange>* ranges =
4971 new (zone) ZoneList<CharacterRange>(2, zone);
4972 for (int j = 0; j < run_length; j++) {
4973 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
4974 DCHECK_EQ(old_atom->length(), 1);
4975 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
4976 }
4977 alternatives->at(write_posn++) =
4978 new (zone) RegExpCharacterClass(ranges, false);
4979 } else {
4980 // Just copy any trivial alternatives.
4981 for (int j = first_in_run; j < i; j++) {
4982 alternatives->at(write_posn++) = alternatives->at(j);
4983 }
4984 }
4985 }
4986 alternatives->Rewind(write_posn); // Trim end of array.
4987 }
4988
4989
4820 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4990 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4821 RegExpNode* on_success) { 4991 RegExpNode* on_success) {
4822 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4992 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4993
4994 if (alternatives->length() > 2) {
4995 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
4996 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
4997 FixSingleCharacterDisjunctions(compiler);
4998 if (alternatives->length() == 1) {
4999 return alternatives->at(0)->ToNode(compiler, on_success);
5000 }
5001 }
5002
4823 int length = alternatives->length(); 5003 int length = alternatives->length();
5004
4824 ChoiceNode* result = 5005 ChoiceNode* result =
4825 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5006 new(compiler->zone()) ChoiceNode(length, compiler->zone());
4826 for (int i = 0; i < length; i++) { 5007 for (int i = 0; i < length; i++) {
4827 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5008 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4828 on_success)); 5009 on_success));
4829 result->AddAlternative(alternative); 5010 result->AddAlternative(alternative);
4830 } 5011 }
4831 return result; 5012 return result;
4832 } 5013 }
4833 5014
(...skipping 1322 matching lines...) Expand 10 before | Expand all | Expand 10 after
6156 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6337 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6157 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6338 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6158 heap->isolate()->memory_allocator()->SizeExecutable() > 6339 heap->isolate()->memory_allocator()->SizeExecutable() >
6159 RegExpImpl::kRegExpExecutableMemoryLimit) { 6340 RegExpImpl::kRegExpExecutableMemoryLimit) {
6160 too_much = true; 6341 too_much = true;
6161 } 6342 }
6162 return too_much; 6343 return too_much;
6163 } 6344 }
6164 } // namespace internal 6345 } // namespace internal
6165 } // namespace v8 6346 } // namespace v8
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698