Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(283)

Side by Side Diff: src/jsregexp.cc

Issue 1180433003: Reland II of 'Optimize trivial regexp disjunctions' CL 1176453002 (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Re-add test Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 4799 matching lines...) Expand 10 before | Expand all | Expand 10 after
4810 return false; 4810 return false;
4811 } 4811 }
4812 4812
4813 4813
4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
4815 RegExpNode* on_success) { 4815 RegExpNode* on_success) {
4816 return new(compiler->zone()) TextNode(this, on_success); 4816 return new(compiler->zone()) TextNode(this, on_success);
4817 } 4817 }
4818 4818
4819 4819
4820 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
4821 RegExpAtom* atom1 = (*a)->AsAtom();
4822 RegExpAtom* atom2 = (*b)->AsAtom();
4823 uc16 character1 = atom1->data().at(0);
4824 uc16 character2 = atom2->data().at(0);
4825 if (character1 < character2) return -1;
4826 if (character1 > character2) return 1;
4827 return 0;
4828 }
4829
4830
4831 // We can stable sort runs of atoms, since the order does not matter if they
4832 // start with different characters.
4833 // Returns true if any consecutive atoms were found.
4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4835 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4836 int length = alternatives->length();
4837 bool found_consecutive_atoms = false;
4838 for (int i = 0; i < length; i++) {
4839 while (i < length) {
4840 RegExpTree* alternative = alternatives->at(i);
4841 if (alternative->IsAtom()) break;
4842 i++;
4843 }
4844 // i is length or it is the index of an atom.
4845 if (i == length) break;
4846 int first_atom = i;
4847 i++;
4848 while (i < length) {
4849 RegExpTree* alternative = alternatives->at(i);
4850 if (!alternative->IsAtom()) break;
4851 i++;
4852 }
4853 // Sort atoms to get ones with common prefixes together.
4854 // This step is not valid if we are in a case-independent regexp,
4855 // because it would change /is|I/ to /I|is/, and order matters when
4856 // the regexp parts don't match only disjoint starting points. To fix
4857 // this would need a version of CompareFirstChar that uses case-
4858 // independent character classes for comparison.
4859 if (!compiler->ignore_case()) {
4860 DCHECK_LT(first_atom, alternatives->length());
4861 DCHECK_LE(i, alternatives->length());
4862 DCHECK_LE(first_atom, i);
4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4864 }
4865 if (i - first_atom > 1) found_consecutive_atoms = true;
4866 }
4867 return found_consecutive_atoms;
4868 }
4869
4870
4871 // Optimizes ab|ac|az to a(?:b|c|d).
4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4873 Zone* zone = compiler->zone();
4874 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4875 int length = alternatives->length();
4876
4877 int write_posn = 0;
4878 int i = 0;
4879 while (i < length) {
4880 RegExpTree* alternative = alternatives->at(i);
4881 if (!alternative->IsAtom()) {
4882 alternatives->at(write_posn++) = alternatives->at(i);
4883 i++;
4884 continue;
4885 }
4886 RegExpAtom* atom = alternative->AsAtom();
4887 uc16 common_prefix = atom->data().at(0);
4888 int first_with_prefix = i;
4889 int prefix_length = atom->length();
4890 i++;
4891 while (i < length) {
4892 alternative = alternatives->at(i);
4893 if (!alternative->IsAtom()) break;
4894 atom = alternative->AsAtom();
4895 if (atom->data().at(0) != common_prefix) break;
4896 prefix_length = Min(prefix_length, atom->length());
4897 i++;
4898 }
4899 if (i > first_with_prefix + 2) {
4900 // Found worthwhile run of alternatives with common prefix of at least one
4901 // character. The sorting function above did not sort on more than one
4902 // character for reasons of correctness, but there may still be a longer
4903 // common prefix if the terms were similar or presorted in the input.
4904 // Find out how long the common prefix is.
4905 int run_length = i - first_with_prefix;
4906 atom = alternatives->at(first_with_prefix)->AsAtom();
4907 for (int j = 1; j < run_length && prefix_length > 1; j++) {
4908 RegExpAtom* old_atom =
4909 alternatives->at(j + first_with_prefix)->AsAtom();
4910 for (int k = 1; k < prefix_length; k++) {
4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;
4912 }
4913 }
4914 RegExpAtom* prefix =
4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4916 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4917 pair->Add(prefix, zone);
4918 ZoneList<RegExpTree*>* suffixes =
4919 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4920 for (int j = 0; j < run_length; j++) {
4921 RegExpAtom* old_atom =
4922 alternatives->at(j + first_with_prefix)->AsAtom();
4923 int len = old_atom->length();
4924 if (len == prefix_length) {
4925 suffixes->Add(new (zone) RegExpEmpty(), zone);
4926 } else {
4927 RegExpTree* suffix = new (zone) RegExpAtom(
4928 old_atom->data().SubVector(prefix_length, old_atom->length()));
4929 suffixes->Add(suffix, zone);
4930 }
4931 }
4932 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);
4933 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);
4934 } else {
4935 // Just copy any non-worthwhile alternatives.
4936 for (int j = first_with_prefix; j < i; j++) {
4937 alternatives->at(write_posn++) = alternatives->at(j);
4938 }
4939 }
4940 }
4941 alternatives->Rewind(write_posn); // Trim end of array.
4942 }
4943
4944
4945 // Optimizes b|c|z to [bcz].
4946 void RegExpDisjunction::FixSingleCharacterDisjunctions(
4947 RegExpCompiler* compiler) {
4948 Zone* zone = compiler->zone();
4949 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4950 int length = alternatives->length();
4951
4952 int write_posn = 0;
4953 int i = 0;
4954 while (i < length) {
4955 RegExpTree* alternative = alternatives->at(i);
4956 if (!alternative->IsAtom()) {
4957 alternatives->at(write_posn++) = alternatives->at(i);
4958 i++;
4959 continue;
4960 }
4961 RegExpAtom* atom = alternative->AsAtom();
4962 if (atom->length() != 1) {
4963 alternatives->at(write_posn++) = alternatives->at(i);
4964 i++;
4965 continue;
4966 }
4967 int first_in_run = i;
4968 i++;
4969 while (i < length) {
4970 alternative = alternatives->at(i);
4971 if (!alternative->IsAtom()) break;
4972 atom = alternative->AsAtom();
4973 if (atom->length() != 1) break;
4974 i++;
4975 }
4976 if (i > first_in_run + 1) {
4977 // Found non-trivial run of single-character alternatives.
4978 int run_length = i - first_in_run;
4979 ZoneList<CharacterRange>* ranges =
4980 new (zone) ZoneList<CharacterRange>(2, zone);
4981 for (int j = 0; j < run_length; j++) {
4982 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
4983 DCHECK_EQ(old_atom->length(), 1);
4984 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
4985 }
4986 alternatives->at(write_posn++) =
4987 new (zone) RegExpCharacterClass(ranges, false);
4988 } else {
4989 // Just copy any trivial alternatives.
4990 for (int j = first_in_run; j < i; j++) {
4991 alternatives->at(write_posn++) = alternatives->at(j);
4992 }
4993 }
4994 }
4995 alternatives->Rewind(write_posn); // Trim end of array.
4996 }
4997
4998
4820 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler, 4999 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,
4821 RegExpNode* on_success) { 5000 RegExpNode* on_success) {
4822 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 5001 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4823 int length = alternatives->length(); 5002
5003 if (alternatives->length() > 2) {
5004 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);
5005 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);
5006 FixSingleCharacterDisjunctions(compiler);
5007 if (alternatives->length() == 1) {
5008 return alternatives->at(0)->ToNode(compiler, on_success);
5009 }
5010 }
5011
5012 int length = alternatives->length();
5013
4824 ChoiceNode* result = 5014 ChoiceNode* result =
4825 new(compiler->zone()) ChoiceNode(length, compiler->zone()); 5015 new(compiler->zone()) ChoiceNode(length, compiler->zone());
4826 for (int i = 0; i < length; i++) { 5016 for (int i = 0; i < length; i++) {
4827 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler, 5017 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,
4828 on_success)); 5018 on_success));
4829 result->AddAlternative(alternative); 5019 result->AddAlternative(alternative);
4830 } 5020 }
4831 return result; 5021 return result;
4832 } 5022 }
4833 5023
(...skipping 1322 matching lines...) Expand 10 before | Expand all | Expand 10 after
6156 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6157 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6158 heap->isolate()->memory_allocator()->SizeExecutable() > 6348 heap->isolate()->memory_allocator()->SizeExecutable() >
6159 RegExpImpl::kRegExpExecutableMemoryLimit) { 6349 RegExpImpl::kRegExpExecutableMemoryLimit) {
6160 too_much = true; 6350 too_much = true;
6161 } 6351 }
6162 return too_much; 6352 return too_much;
6163 } 6353 }
6164 } // namespace internal 6354 } // namespace internal
6165 } // namespace v8 6355 } // namespace v8
OLDNEW
« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698