src/jsregexp.cc - Issue 1204123003: Reland Extend big-disjunction optimization to case-independent regexps

Side by Side Diff: src/jsregexp.cc

Issue 1204123003: Reland Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Skip slow architecture-independent tests on slower CPUs and simulators Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/ast.h"	7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 4819 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4830 RegExpAtom* atom1 = (*a)->AsAtom();	4830 RegExpAtom* atom1 = (*a)->AsAtom();

4831 RegExpAtom* atom2 = (*b)->AsAtom();	4831 RegExpAtom* atom2 = (*b)->AsAtom();

4832 uc16 character1 = atom1->data().at(0);	4832 uc16 character1 = atom1->data().at(0);

4833 uc16 character2 = atom2->data().at(0);	4833 uc16 character2 = atom2->data().at(0);

4834 if (character1 < character2) return -1;	4834 if (character1 < character2) return -1;

4835 if (character1 > character2) return 1;	4835 if (character1 > character2) return 1;

4836 return 0;	4836 return 0;

4837 }	4837 }

4838	4838

4839	4839

	4840 static unibrow::uchar Canonical(

	4841 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

	4842 unibrow::uchar c) {

	4843 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];

	4844 int length = canonicalize->get(c, '\0', chars);

	4845 DCHECK_LE(length, 1);

	4846 unibrow::uchar canonical = c;

	4847 if (length == 1) canonical = chars[0];

	4848 return canonical;

	4849 }

	4850

	4851

	4852 int CompareFirstCharCaseIndependent(

	4853 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

	4854 RegExpTree* const* a, RegExpTree* const* b) {

	4855 RegExpAtom* atom1 = (*a)->AsAtom();

	4856 RegExpAtom* atom2 = (*b)->AsAtom();

	4857 unibrow::uchar character1 = atom1->data().at(0);

	4858 unibrow::uchar character2 = atom2->data().at(0);

	4859 if (character1 == character2) return 0;

	4860 if (character1 >= 'a' \|\| character2 >= 'a') {

	4861 character1 = Canonical(canonicalize, character1);

	4862 character2 = Canonical(canonicalize, character2);

	4863 }

	4864 return static_cast<int>(character1) - static_cast<int>(character2);

	4865 }

	4866

	4867

4840 // We can stable sort runs of atoms, since the order does not matter if they	4868 // We can stable sort runs of atoms, since the order does not matter if they

4841 // start with different characters.	4869 // start with different characters.

4842 // Returns true if any consecutive atoms were found.	4870 // Returns true if any consecutive atoms were found.

4843 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {	4871 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

4844 ZoneList<RegExpTree> alternatives = this->alternatives();	4872 ZoneList<RegExpTree> alternatives = this->alternatives();

4845 int length = alternatives->length();	4873 int length = alternatives->length();

4846 bool found_consecutive_atoms = false;	4874 bool found_consecutive_atoms = false;

4847 for (int i = 0; i < length; i++) {	4875 for (int i = 0; i < length; i++) {

4848 while (i < length) {	4876 while (i < length) {

4849 RegExpTree* alternative = alternatives->at(i);	4877 RegExpTree* alternative = alternatives->at(i);

4850 if (alternative->IsAtom()) break;	4878 if (alternative->IsAtom()) break;

4851 i++;	4879 i++;

4852 }	4880 }

4853 // i is length or it is the index of an atom.	4881 // i is length or it is the index of an atom.

4854 if (i == length) break;	4882 if (i == length) break;

4855 int first_atom = i;	4883 int first_atom = i;

4856 i++;	4884 i++;

4857 while (i < length) {	4885 while (i < length) {

4858 RegExpTree* alternative = alternatives->at(i);	4886 RegExpTree* alternative = alternatives->at(i);

4859 if (!alternative->IsAtom()) break;	4887 if (!alternative->IsAtom()) break;

4860 i++;	4888 i++;

4861 }	4889 }

4862 // Sort atoms to get ones with common prefixes together.	4890 // Sort atoms to get ones with common prefixes together.

4863 // This step is not valid if we are in a case-independent regexp,	4891 // This step is more tricky if we are in a case-independent regexp,

4864 // because it would change /is\|I/ to /I\|is/, and order matters when	4892 // because it would change /is\|I/ to /I\|is/, and order matters when

4865 // the regexp parts don't match only disjoint starting points. To fix	4893 // the regexp parts don't match only disjoint starting points. To fix

4866 // this would need a version of CompareFirstChar that uses case-	4894 // this we have a version of CompareFirstChar that uses case-

4867 // independent character classes for comparison.	4895 // independent character classes for comparison.

4868 if (!compiler->ignore_case()) {	4896 DCHECK_LT(first_atom, alternatives->length());

4869 DCHECK_LT(first_atom, alternatives->length());	4897 DCHECK_LE(i, alternatives->length());

4870 DCHECK_LE(i, alternatives->length());	4898 DCHECK_LE(first_atom, i);

4871 DCHECK_LE(first_atom, i);	4899 if (compiler->ignore_case()) {

	4900 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

	4901 compiler->isolate()->regexp_macro_assembler_canonicalize();

	4902 auto compare_closure =

	4903 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {

	4904 return CompareFirstCharCaseIndependent(canonicalize, a, b);

	4905 };

	4906 alternatives->StableSort(compare_closure, first_atom, i - first_atom);

	4907 } else {

4872 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);	4908 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

4873 }	4909 }

4874 if (i - first_atom > 1) found_consecutive_atoms = true;	4910 if (i - first_atom > 1) found_consecutive_atoms = true;

4875 }	4911 }

4876 return found_consecutive_atoms;	4912 return found_consecutive_atoms;

4877 }	4913 }

4878	4914

4879	4915

4880 // Optimizes ab\|ac\|az to a(?:b\|c\|d).	4916 // Optimizes ab\|ac\|az to a(?:b\|c\|d).

4881 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {	4917 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

4882 Zone* zone = compiler->zone();	4918 Zone* zone = compiler->zone();

4883 ZoneList<RegExpTree> alternatives = this->alternatives();	4919 ZoneList<RegExpTree> alternatives = this->alternatives();

4884 int length = alternatives->length();	4920 int length = alternatives->length();

4885	4921

4886 int write_posn = 0;	4922 int write_posn = 0;

4887 int i = 0;	4923 int i = 0;

4888 while (i < length) {	4924 while (i < length) {

4889 RegExpTree* alternative = alternatives->at(i);	4925 RegExpTree* alternative = alternatives->at(i);

4890 if (!alternative->IsAtom()) {	4926 if (!alternative->IsAtom()) {

4891 alternatives->at(write_posn++) = alternatives->at(i);	4927 alternatives->at(write_posn++) = alternatives->at(i);

4892 i++;	4928 i++;

4893 continue;	4929 continue;

4894 }	4930 }

4895 RegExpAtom* atom = alternative->AsAtom();	4931 RegExpAtom* atom = alternative->AsAtom();

4896 uc16 common_prefix = atom->data().at(0);	4932 unibrow::uchar common_prefix = atom->data().at(0);

4897 int first_with_prefix = i;	4933 int first_with_prefix = i;

4898 int prefix_length = atom->length();	4934 int prefix_length = atom->length();

4899 i++;	4935 i++;

4900 while (i < length) {	4936 while (i < length) {

4901 alternative = alternatives->at(i);	4937 alternative = alternatives->at(i);

4902 if (!alternative->IsAtom()) break;	4938 if (!alternative->IsAtom()) break;

4903 atom = alternative->AsAtom();	4939 atom = alternative->AsAtom();

4904 if (atom->data().at(0) != common_prefix) break;	4940 unibrow::uchar new_prefix = atom->data().at(0);

	4941 if (new_prefix != common_prefix) {

	4942 if (!compiler->ignore_case()) break;

	4943 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

	4944 compiler->isolate()->regexp_macro_assembler_canonicalize();

	4945 new_prefix = Canonical(canonicalize, new_prefix);

	4946 common_prefix = Canonical(canonicalize, common_prefix);

	4947 if (new_prefix != common_prefix) break;

	4948 }

4905 prefix_length = Min(prefix_length, atom->length());	4949 prefix_length = Min(prefix_length, atom->length());

4906 i++;	4950 i++;

4907 }	4951 }

4908 if (i > first_with_prefix + 2) {	4952 if (i > first_with_prefix + 2) {

4909 // Found worthwhile run of alternatives with common prefix of at least one	4953 // Found worthwhile run of alternatives with common prefix of at least one

4910 // character. The sorting function above did not sort on more than one	4954 // character. The sorting function above did not sort on more than one

4911 // character for reasons of correctness, but there may still be a longer	4955 // character for reasons of correctness, but there may still be a longer

4912 // common prefix if the terms were similar or presorted in the input.	4956 // common prefix if the terms were similar or presorted in the input.

4913 // Find out how long the common prefix is.	4957 // Find out how long the common prefix is.

4914 int run_length = i - first_with_prefix;	4958 int run_length = i - first_with_prefix;

4915 atom = alternatives->at(first_with_prefix)->AsAtom();	4959 atom = alternatives->at(first_with_prefix)->AsAtom();

4916 for (int j = 1; j < run_length && prefix_length > 1; j++) {	4960 for (int j = 1; j < run_length && prefix_length > 1; j++) {

4917 RegExpAtom* old_atom =	4961 RegExpAtom* old_atom =

4918 alternatives->at(j + first_with_prefix)->AsAtom();	4962 alternatives->at(j + first_with_prefix)->AsAtom();

4919 for (int k = 1; k < prefix_length; k++) {	4963 for (int k = 1; k < prefix_length; k++) {

4920 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;	4964 if (atom->data().at(k) != old_atom->data().at(k)) {

	4965 prefix_length = k;

	4966 break;

	4967 }

4921 }	4968 }

4922 }	4969 }

4923 RegExpAtom* prefix =	4970 RegExpAtom* prefix =

4924 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));	4971 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

4925 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);	4972 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);

4926 pair->Add(prefix, zone);	4973 pair->Add(prefix, zone);

4927 ZoneList<RegExpTree> suffixes =	4974 ZoneList<RegExpTree> suffixes =

4928 new (zone) ZoneList<RegExpTree*>(run_length, zone);	4975 new (zone) ZoneList<RegExpTree*>(run_length, zone);

4929 for (int j = 0; j < run_length; j++) {	4976 for (int j = 0; j < run_length; j++) {

4930 RegExpAtom* old_atom =	4977 RegExpAtom* old_atom =

(...skipping 1424 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6355 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;	6402 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6356 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&	6403 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6357 heap->isolate()->memory_allocator()->SizeExecutable() >	6404 heap->isolate()->memory_allocator()->SizeExecutable() >

6358 RegExpImpl::kRegExpExecutableMemoryLimit) {	6405 RegExpImpl::kRegExpExecutableMemoryLimit) {

6359 too_much = true;	6406 too_much = true;

6360 }	6407 }

6361 return too_much;	6408 return too_much;

6362 }	6409 }

6363 } // namespace internal	6410 } // namespace internal

6364 } // namespace v8	6411 } // namespace v8

OLD	NEW

« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »