src/jsregexp.cc - Issue 1174713002: Reland of 'Optimize trivial regexp disjunctions' CL 1176453002

Side by Side Diff: src/jsregexp.cc

Issue 1174713002: Reland of 'Optimize trivial regexp disjunctions' CL 1176453002 (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Fix for the out-of-bounds sort that caused the revert Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/ast.h"	7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 4799 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4810 return false;	4810 return false;

4811 }	4811 }

4812	4812

4813	4813

4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	4814 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

4815 RegExpNode* on_success) {	4815 RegExpNode* on_success) {

4816 return new(compiler->zone()) TextNode(this, on_success);	4816 return new(compiler->zone()) TextNode(this, on_success);

4817 }	4817 }

4818	4818

4819	4819

	4820 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {

	4821 RegExpAtom* atom1 = (*a)->AsAtom();

	4822 RegExpAtom* atom2 = (*b)->AsAtom();

	4823 uc16 character1 = atom1->data().at(0);

	4824 uc16 character2 = atom2->data().at(0);

	4825 if (character1 < character2) return -1;

	4826 if (character1 > character2) return 1;

	4827 return 0;

	4828 }

	4829

	4830

	4831 // We can stable sort runs of atoms, since the order does not matter if they

	4832 // start with different characters.

	4833 // Returns true if any consecutive atoms were found.

	4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

	4835 ZoneList<RegExpTree> alternatives = this->alternatives();

	4836 int length = alternatives->length();

	4837 bool found_consecutive_atoms = false;

	4838 for (int i = 0; i < length; i++) {

	4839 while (i < length) {

	4840 RegExpTree* alternative = alternatives->at(i);

	4841 if (alternative->IsAtom()) break;

	4842 i++;

	4843 }

	4844 // i is length or it is the index of an atom.

	4845 if (i == length) break;
	Erik Corry Chromium.org 2015/06/09 20:27:11 This is the fix. This is the fix.
	4846 int first_atom = i;

	4847 i++;

	4848 while (i < length) {

	4849 RegExpTree* alternative = alternatives->at(i);

	4850 if (!alternative->IsAtom()) break;

	4851 i++;

	4852 }

	4853 // Sort atoms to get ones with common prefixes together.

	4854 // This step is not valid if we are in a case-independent regexp,

	4855 // because it would change /is\|I/ to /I\|is/, and order matters when

	4856 // the regexp parts don't match only disjoint starting points. To fix

	4857 // this would need a version of CompareFirstChar that uses case-

	4858 // independent character classes for comparison.

	4859 if (!compiler->ignore_case()) {

	4860 DCHECK_LT(first_atom, alternatives->length());

	4861 DCHECK_LE(i, alternatives->length());

	4862 DCHECK_LE(first_atom, i);

	4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

	4864 }

	4865 if (i - first_atom > 1) found_consecutive_atoms = true;

	4866 }

	4867 return found_consecutive_atoms;

	4868 }

	4869

	4870

	4871 // Optimizes ab\|ac\|az to a(?:b\|c\|d).

	4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

	4873 Zone* zone = compiler->zone();

	4874 ZoneList<RegExpTree> alternatives = this->alternatives();

	4875 int length = alternatives->length();

	4876

	4877 int write_posn = 0;

	4878 int i = 0;

	4879 while (i < length) {

	4880 RegExpTree* alternative = alternatives->at(i);

	4881 if (!alternative->IsAtom()) {

	4882 alternatives->at(write_posn++) = alternatives->at(i);

	4883 i++;

	4884 continue;

	4885 }

	4886 RegExpAtom* atom = alternative->AsAtom();

	4887 uc16 common_prefix = atom->data().at(0);

	4888 int first_with_prefix = i;

	4889 int prefix_length = atom->length();

	4890 i++;

	4891 while (i < length) {

	4892 alternative = alternatives->at(i);

	4893 if (!alternative->IsAtom()) break;

	4894 atom = alternative->AsAtom();

	4895 if (atom->data().at(0) != common_prefix) break;

	4896 prefix_length = Min(prefix_length, atom->length());

	4897 i++;

	4898 }

	4899 if (i > first_with_prefix + 2) {

	4900 // Found worthwhile run of alternatives with common prefix of at least one

	4901 // character. The sorting function above did not sort on more than one

	4902 // character for reasons of correctness, but there may still be a longer

	4903 // common prefix if the terms were similar or presorted in the input.

	4904 // Find out how long the common prefix is.

	4905 int run_length = i - first_with_prefix;

	4906 atom = alternatives->at(first_with_prefix)->AsAtom();

	4907 for (int j = 1; j < run_length && prefix_length > 1; j++) {

	4908 RegExpAtom* old_atom =

	4909 alternatives->at(j + first_with_prefix)->AsAtom();

	4910 for (int k = 1; k < prefix_length; k++) {

	4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;

	4912 }

	4913 }

	4914 RegExpAtom* prefix =

	4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

	4916 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);

	4917 pair->Add(prefix, zone);

	4918 ZoneList<RegExpTree> suffixes =

	4919 new (zone) ZoneList<RegExpTree*>(run_length, zone);

	4920 for (int j = 0; j < run_length; j++) {

	4921 RegExpAtom* old_atom =

	4922 alternatives->at(j + first_with_prefix)->AsAtom();

	4923 int len = old_atom->length();

	4924 if (len == prefix_length) {

	4925 suffixes->Add(new (zone) RegExpEmpty(), zone);

	4926 } else {

	4927 RegExpTree* suffix = new (zone) RegExpAtom(

	4928 old_atom->data().SubVector(prefix_length, old_atom->length()));

	4929 suffixes->Add(suffix, zone);

	4930 }

	4931 }

	4932 pair->Add(new (zone) RegExpDisjunction(suffixes), zone);

	4933 alternatives->at(write_posn++) = new (zone) RegExpAlternative(pair);

	4934 } else {

	4935 // Just copy any non-worthwhile alternatives.

	4936 for (int j = first_with_prefix; j < i; j++) {

	4937 alternatives->at(write_posn++) = alternatives->at(j);

	4938 }

	4939 }

	4940 }

	4941 alternatives->Rewind(write_posn); // Trim end of array.

	4942 }

	4943

	4944

	4945 // Optimizes b\|c\|z to [bcz].

	4946 void RegExpDisjunction::FixSingleCharacterDisjunctions(

	4947 RegExpCompiler* compiler) {

	4948 Zone* zone = compiler->zone();

	4949 ZoneList<RegExpTree> alternatives = this->alternatives();

	4950 int length = alternatives->length();

	4951

	4952 int write_posn = 0;

	4953 int i = 0;

	4954 while (i < length) {

	4955 RegExpTree* alternative = alternatives->at(i);

	4956 if (!alternative->IsAtom()) {

	4957 alternatives->at(write_posn++) = alternatives->at(i);

	4958 i++;

	4959 continue;

	4960 }

	4961 RegExpAtom* atom = alternative->AsAtom();

	4962 if (atom->length() != 1) {

	4963 alternatives->at(write_posn++) = alternatives->at(i);

	4964 i++;

	4965 continue;

	4966 }

	4967 int first_in_run = i;

	4968 i++;

	4969 while (i < length) {

	4970 alternative = alternatives->at(i);

	4971 if (!alternative->IsAtom()) break;

	4972 atom = alternative->AsAtom();

	4973 if (atom->length() != 1) break;

	4974 i++;

	4975 }

	4976 if (i > first_in_run + 1) {

	4977 // Found non-trivial run of single-character alternatives.

	4978 int run_length = i - first_in_run;

	4979 ZoneList<CharacterRange>* ranges =

	4980 new (zone) ZoneList<CharacterRange>(2, zone);

	4981 for (int j = 0; j < run_length; j++) {

	4982 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();

	4983 DCHECK_EQ(old_atom->length(), 1);

	4984 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);

	4985 }

	4986 alternatives->at(write_posn++) =

	4987 new (zone) RegExpCharacterClass(ranges, false);

	4988 } else {

	4989 // Just copy any trivial alternatives.

	4990 for (int j = first_in_run; j < i; j++) {

	4991 alternatives->at(write_posn++) = alternatives->at(j);

	4992 }

	4993 }

	4994 }

	4995 alternatives->Rewind(write_posn); // Trim end of array.

	4996 }

	4997

	4998

4820 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,	4999 RegExpNode* RegExpDisjunction::ToNode(RegExpCompiler* compiler,

4821 RegExpNode* on_success) {	5000 RegExpNode* on_success) {

4822 ZoneList<RegExpTree> alternatives = this->alternatives();	5001 ZoneList<RegExpTree> alternatives = this->alternatives();

4823 int length = alternatives->length();	5002

	5003 if (alternatives->length() > 2) {

	5004 bool found_consecutive_atoms = SortConsecutiveAtoms(compiler);

	5005 if (found_consecutive_atoms) RationalizeConsecutiveAtoms(compiler);

	5006 FixSingleCharacterDisjunctions(compiler);

	5007 if (alternatives->length() == 1) {

	5008 return alternatives->at(0)->ToNode(compiler, on_success);

	5009 }

	5010 }

	5011

	5012 int length = alternatives->length();

	5013

4824 ChoiceNode* result =	5014 ChoiceNode* result =

4825 new(compiler->zone()) ChoiceNode(length, compiler->zone());	5015 new(compiler->zone()) ChoiceNode(length, compiler->zone());

4826 for (int i = 0; i < length; i++) {	5016 for (int i = 0; i < length; i++) {

4827 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,	5017 GuardedAlternative alternative(alternatives->at(i)->ToNode(compiler,

4828 on_success));	5018 on_success));

4829 result->AddAlternative(alternative);	5019 result->AddAlternative(alternative);

4830 }	5020 }

4831 return result;	5021 return result;

4832 }	5022 }

4833	5023

(...skipping 1322 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6156 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;	6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6157 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&	6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6158 heap->isolate()->memory_allocator()->SizeExecutable() >	6348 heap->isolate()->memory_allocator()->SizeExecutable() >

6159 RegExpImpl::kRegExpExecutableMemoryLimit) {	6349 RegExpImpl::kRegExpExecutableMemoryLimit) {

6160 too_much = true;	6350 too_much = true;

6161 }	6351 }

6162 return too_much;	6352 return too_much;

6163 }	6353 }

6164 } // namespace internal	6354 } // namespace internal

6165 } // namespace v8	6355 } // namespace v8

OLD	NEW

« no previous file with comments | « src/jsregexp.h ('k') | src/list.h » ('j') | no next file with comments »