Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(353)

Side by Side Diff: src/jsregexp.cc

Issue 1176453002: Optimize trivial regexp disjunctions (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/ast.h ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 4799 matching lines...) Expand 10 before | Expand all | Expand 10 after
4810 return false; 4810 return false;
4811 } 4811 }
4812 4812
4813 4813
4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4815 RegExpNode* on_success) { 4815 RegExpNode* on_success) {
4816 return new(compiler->zone()) TextNode(this, on_success); 4816 return new(compiler->zone()) TextNode(this, on_success);
4817 } 4817 }
4818 4818
4819 4819
4820 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4821 RegExpAtom* atom1 = (*a)->AsAtom();
4822 RegExpAtom* atom2 = (*b)->AsAtom();
4823 uc16 character1 = atom1->data().at(0);
4824 uc16 character2 = atom2->data().at(0);
4825 if (character1 < character2) return -1;
4826 if (character1 > character2) return 1;
4827 return 0;
4828 }
4829
4830
4831 // We can stable sort runs of atoms of at least length 0, since the order
4832 // does not matter.
4833 // Returns true if any consecutive atoms were found.
4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4835 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4836 int length = alternatives->length();
4837 bool found_consecutive_atoms = false;
4838 for (int i = 0; i < length; i++) {
4839 while (i < length) {
4840 RegExpTree* alternative = alternatives->at(i);
4841 if (alternative->IsAtom() && alternative->AsAtom()->length() > 0) break;
4842 i++;
4843 }
4844 // i is length or it is the index of an atom.
4845 int first_atom = i;
4846 i++;
4847 while (i < length) {
4848 RegExpTree* alternative = alternatives->at(i);
4849 if (!alternative->IsAtom()) break;
4850 if (alternative->AsAtom()->length() == 0) break;
4851 i++;
4852 }
4853 // Sort atoms to get ones with common prefixes together.
4854 // This step is not valid if we are in a case-independent regexp,
4855 // because it would change /is|I/ to /I|is/, and order matters when
4856 // the regexp parts don't match only disjoint starting points.
4857 if (!compiler->ignore_case()) {
4858 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4859 }
4860 if (i - first_atom > 1) found_consecutive_atoms = true;
4861 }
4862 return found_consecutive_atoms;
4863 }
4864
4865
4866 // Optimizes ab|ac|az to a(?:b|c|d).
4867 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4868 Zone* zone = compiler->zone();
4869 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4870 int length = alternatives->length();
4871
4872 int write_posn = 0;
4873 int i = 0;
4874 while (i < length) {
4875 RegExpTree* alternative = alternatives->at(i);
4876 if (!alternative->IsAtom()) {
4877 alternatives->at(write_posn++) = alternatives->at(i);
4878 i++;
4879 continue;
4880 }
4881 RegExpAtom* atom = alternative->AsAtom();
4882 if (atom->length() < 1) {
4883 alternatives->at(write_posn++) = alternatives->at(i);
4884 i++;
4885 continue;
4886 }
4887 uc16 common_prefix = atom->data().at(0);
4888 int first_with_prefix = i;
4889 i++;
4890 while (i < length) {
4891 alternative = alternatives->at(i);
4892 if (!alternative->IsAtom()) break;
4893 atom = alternative->AsAtom();
4894 if (atom->length() < 1) break;
4895 if (atom->data().at(0) != common_prefix) break;
4896 i++;
4897 }
4898 if (i > first_with_prefix + 2) {
4899 // Found worthwhile run of alternatives with common prefix.
4900 int run_length = i - first_with_prefix;
4901 ZoneList<uc16>* prefix_list = new (zone) ZoneList<uc16>(1, zone);
4902 prefix_list->Add(common_prefix, zone);
4903 RegExpAtom* prefix = new (zone) RegExpAtom(prefix_list->ToConstVector());
4904 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4905 pair->Add(prefix, zone);
4906 ZoneList<RegExpTree*>* suffixes =
4907 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4908 for (int j = 0; j < run_length; j++) {
4909 RegExpAtom* old_atom =
4910 alternatives->at(j + first_with_prefix)->AsAtom();
4911 int len = old_atom->length();
4912 if (len == 0) {
4913 suffixes->Add(new (zone) RegExpEmpty(), zone);
4914 } else {
4915 ZoneList<uc16>* suffix_list =
4916 new (zone) ZoneList<uc16>(len - 1, zone);
4917 for (int k = 1; k < len; k++)
4918 suffix_list->Add(old_atom->data().at(k), zone);
4919 RegExpTree* suffix =
4920 new (zone) RegExpAtom(suffix_list->ToConstVector());
Yang 2015/06/09 10:59:35 We could just use old_atom->data()->SubVector(1, l
4921 suffixes->Add(suffix, zone);
4922 }
4923 }
4924 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
4925 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
4926 } else {
4927 // Just copy any non-worthwhile alternatives.
4928 for (int j = first_with_prefix; j < i; j++) {
4929 alternatives->at(write_posn++) = alternatives->at(j);
4930 }
4931 }
4932 }
4933 alternatives->Rewind(write_posn); // Trim end of array.
4934 }
4935
4936
4937 // Optimizes b|c|z to [bcz].
4938 void RegExpDisjunction::FixSingleCharacterDisjunctions(
4939 RegExpCompiler* compiler) {
4940 Zone* zone = compiler->zone();
4941 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4942 int length = alternatives->length();
4943
4944 int write_posn = 0;
4945 int i = 0;
4946 while (i < length) {
4947 RegExpTree* alternative = alternatives->at(i);
4948 if (!alternative->IsAtom()) {
4949 alternatives->at(write_posn++) = alternatives->at(i);
4950 i++;
4951 continue;
4952 }
4953 RegExpAtom* atom = alternative->AsAtom();
4954 if (atom->length() != 1) {
4955 alternatives->at(write_posn++) = alternatives->at(i);
4956 i++;
4957 continue;
4958 }
4959 int first_in_run = i;
4960 i++;
4961 while (i < length) {
4962 alternative = alternatives->at(i);
4963 if (!alternative->IsAtom()) break;
4964 atom = alternative->AsAtom();
4965 if (atom->length() != 1) break;
4966 i++;
4967 }
4968 if (i > first_in_run + 1) {
4969 // Found non-trivial run of single-character alternatives.
4970 int run_length = i - first_in_run;
4971 ZoneList<CharacterRange>* ranges =
4972 new (zone) ZoneList<CharacterRange>(2, zone);
4973 for (int j = 0; j < run_length; j++) {
4974 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
4975 DCHECK_EQ(old_atom->length(), 1);
4976 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
4977 }
4978 alternatives->at(write_posn++) =
4979 new (zone) RegExpCharacterClass(ranges, false);
4980 } else {
4981 // Just copy any trivial alternatives.
4982 for (int j = first_in_run; j < i; j++) {
4983 alternatives->at(write_posn++) = alternatives->at(j);
4984 }
4985 }
4986 }
4987 alternatives->Rewind(write_posn); // Trim end of array.
4988 }
4989
4990
4820 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4991 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4821 RegExpNode* on_success) { 4992 RegExpNode* on_success) {
4822 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4993 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4994
4995 if (alternatives->length() > 2) {
4996 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
4997 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
4998 FixSingleCharacterDisjunctions(compiler);
4999 if (alternatives->length() == 1) {
5000 return alternatives->at(0)->ToNode(compiler, on_success);
Yang 2015/06/09 10:59:35 I guess for something like aaaaaaaaaaaaaaab|aaaaaa
5001 }
5002 }
5003
4823 int length = alternatives->length(); 5004 int length = alternatives->length();
5005
4824 ChoiceNode* result = 5006 ChoiceNode* result =
4825 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5007 new(compiler->zone()) ChoiceNode(length, compiler->zone());
4826 for (int i = 0; i < length; i++) { 5008 for (int i = 0; i < length; i++) {
4827 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5009 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4828 on_success)); 5010 on_success));
4829 result->AddAlternative(alternative); 5011 result->AddAlternative(alternative);
4830 } 5012 }
4831 return result; 5013 return result;
4832 } 5014 }
4833 5015
(...skipping 1322 matching lines...) Expand 10 before | Expand all | Expand 10 after
6156 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6338 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6157 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6339 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6158 heap->isolate()->memory_allocator()->SizeExecutable() > 6340 heap->isolate()->memory_allocator()->SizeExecutable() >
6159 RegExpImpl::kRegExpExecutableMemoryLimit) { 6341 RegExpImpl::kRegExpExecutableMemoryLimit) {
6160 too_much = true; 6342 too_much = true;
6161 } 6343 }
6162 return too_much; 6344 return too_much;
6163 } 6345 }
6164 } // namespace internal 6346 } // namespace internal
6165 } // namespace v8 6347 } // namespace v8
OLDNEW
« no previous file with comments | « src/ast.h ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698