src/jsregexp.cc - Issue 1182783009: Extend big-disjunction optimization to case-independent regexps

Side by Side Diff: src/jsregexp.cc

Issue 1182783009: Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Fix 80 columns in test Created 5 years, 6 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/ast.h"	7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 4810 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4821 RegExpAtom* atom1 = (*a)->AsAtom();	4821 RegExpAtom* atom1 = (*a)->AsAtom();

4822 RegExpAtom* atom2 = (*b)->AsAtom();	4822 RegExpAtom* atom2 = (*b)->AsAtom();

4823 uc16 character1 = atom1->data().at(0);	4823 uc16 character1 = atom1->data().at(0);

4824 uc16 character2 = atom2->data().at(0);	4824 uc16 character2 = atom2->data().at(0);

4825 if (character1 < character2) return -1;	4825 if (character1 < character2) return -1;

4826 if (character1 > character2) return 1;	4826 if (character1 > character2) return 1;

4827 return 0;	4827 return 0;

4828 }	4828 }

4829	4829

4830	4830

	4831 static unibrow::uchar Canonical(

	4832 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

	4833 unibrow::uchar c) {

	4834 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];

	4835 int length = canonicalize->get(c, '\0', chars);

	4836 DCHECK_LE(length, 1);

	4837 unibrow::uchar canonical = c;

	4838 if (length == 1) canonical = chars[0];

	4839 return canonical;

	4840 }

	4841

	4842

	4843 int CompareFirstCharCaseIndependent(

	4844 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

	4845 RegExpTree* const* a, RegExpTree* const* b) {

	4846 RegExpAtom* atom1 = (*a)->AsAtom();

	4847 RegExpAtom* atom2 = (*b)->AsAtom();

	4848 unibrow::uchar character1 = atom1->data().at(0);

	4849 unibrow::uchar character2 = atom2->data().at(0);

	4850 if (character1 == character2) return 0;

	4851 if (character1 >= 'a' \|\| character2 >= 'a') {

	4852 character1 = Canonical(canonicalize, character1);

	4853 character2 = Canonical(canonicalize, character2);

	4854 }

	4855 return static_cast<int>(character1) - static_cast<int>(character2);

	4856 }

	4857

	4858

4831 // We can stable sort runs of atoms, since the order does not matter if they	4859 // We can stable sort runs of atoms, since the order does not matter if they

4832 // start with different characters.	4860 // start with different characters.

4833 // Returns true if any consecutive atoms were found.	4861 // Returns true if any consecutive atoms were found.

4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {	4862 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

4835 ZoneList<RegExpTree> alternatives = this->alternatives();	4863 ZoneList<RegExpTree> alternatives = this->alternatives();

4836 int length = alternatives->length();	4864 int length = alternatives->length();

4837 bool found_consecutive_atoms = false;	4865 bool found_consecutive_atoms = false;

4838 for (int i = 0; i < length; i++) {	4866 for (int i = 0; i < length; i++) {

4839 while (i < length) {	4867 while (i < length) {

4840 RegExpTree* alternative = alternatives->at(i);	4868 RegExpTree* alternative = alternatives->at(i);

4841 if (alternative->IsAtom()) break;	4869 if (alternative->IsAtom()) break;

4842 i++;	4870 i++;

4843 }	4871 }

4844 // i is length or it is the index of an atom.	4872 // i is length or it is the index of an atom.

4845 if (i == length) break;	4873 if (i == length) break;

4846 int first_atom = i;	4874 int first_atom = i;

4847 i++;	4875 i++;

4848 while (i < length) {	4876 while (i < length) {

4849 RegExpTree* alternative = alternatives->at(i);	4877 RegExpTree* alternative = alternatives->at(i);

4850 if (!alternative->IsAtom()) break;	4878 if (!alternative->IsAtom()) break;

4851 i++;	4879 i++;

4852 }	4880 }

4853 // Sort atoms to get ones with common prefixes together.	4881 // Sort atoms to get ones with common prefixes together.

4854 // This step is not valid if we are in a case-independent regexp,	4882 // This step is more tricky if we are in a case-independent regexp,

4855 // because it would change /is\|I/ to /I\|is/, and order matters when	4883 // because it would change /is\|I/ to /I\|is/, and order matters when

4856 // the regexp parts don't match only disjoint starting points. To fix	4884 // the regexp parts don't match only disjoint starting points. To fix

4857 // this would need a version of CompareFirstChar that uses case-	4885 // this we have a version of CompareFirstChar that uses case-

4858 // independent character classes for comparison.	4886 // independent character classes for comparison.

4859 if (!compiler->ignore_case()) {	4887 DCHECK_LT(first_atom, alternatives->length());

4860 DCHECK_LT(first_atom, alternatives->length());	4888 DCHECK_LE(i, alternatives->length());

4861 DCHECK_LE(i, alternatives->length());	4889 DCHECK_LE(first_atom, i);

4862 DCHECK_LE(first_atom, i);	4890 if (compiler->ignore_case()) {

	4891 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

	4892 compiler->isolate()->regexp_macro_assembler_canonicalize();

	4893 auto compare_closure =

	4894 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {

	4895 return CompareFirstCharCaseIndependent(canonicalize, a, b);

	4896 };

	4897 alternatives->StableSort(compare_closure, first_atom, i - first_atom);

	4898 } else {

4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);	4899 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

4864 }	4900 }

4865 if (i - first_atom > 1) found_consecutive_atoms = true;	4901 if (i - first_atom > 1) found_consecutive_atoms = true;

4866 }	4902 }

4867 return found_consecutive_atoms;	4903 return found_consecutive_atoms;

4868 }	4904 }

4869	4905

4870	4906

4871 // Optimizes ab\|ac\|az to a(?:b\|c\|d).	4907 // Optimizes ab\|ac\|az to a(?:b\|c\|d).

4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {	4908 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

4873 Zone* zone = compiler->zone();	4909 Zone* zone = compiler->zone();

4874 ZoneList<RegExpTree> alternatives = this->alternatives();	4910 ZoneList<RegExpTree> alternatives = this->alternatives();

4875 int length = alternatives->length();	4911 int length = alternatives->length();

4876	4912

4877 int write_posn = 0;	4913 int write_posn = 0;

4878 int i = 0;	4914 int i = 0;

4879 while (i < length) {	4915 while (i < length) {

4880 RegExpTree* alternative = alternatives->at(i);	4916 RegExpTree* alternative = alternatives->at(i);

4881 if (!alternative->IsAtom()) {	4917 if (!alternative->IsAtom()) {

4882 alternatives->at(write_posn++) = alternatives->at(i);	4918 alternatives->at(write_posn++) = alternatives->at(i);

4883 i++;	4919 i++;

4884 continue;	4920 continue;

4885 }	4921 }

4886 RegExpAtom* atom = alternative->AsAtom();	4922 RegExpAtom* atom = alternative->AsAtom();

4887 uc16 common_prefix = atom->data().at(0);	4923 unibrow::uchar common_prefix = atom->data().at(0);

4888 int first_with_prefix = i;	4924 int first_with_prefix = i;

4889 int prefix_length = atom->length();	4925 int prefix_length = atom->length();

4890 i++;	4926 i++;

4891 while (i < length) {	4927 while (i < length) {

4892 alternative = alternatives->at(i);	4928 alternative = alternatives->at(i);

4893 if (!alternative->IsAtom()) break;	4929 if (!alternative->IsAtom()) break;

4894 atom = alternative->AsAtom();	4930 atom = alternative->AsAtom();

4895 if (atom->data().at(0) != common_prefix) break;	4931 unibrow::uchar new_prefix = atom->data().at(0);

	4932 if (new_prefix != common_prefix) {

	4933 if (!compiler->ignore_case()) break;

	4934 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

	4935 compiler->isolate()->regexp_macro_assembler_canonicalize();

	4936 new_prefix = Canonical(canonicalize, new_prefix);

	4937 common_prefix = Canonical(canonicalize, common_prefix);

	4938 if (new_prefix != common_prefix) break;

	4939 }

4896 prefix_length = Min(prefix_length, atom->length());	4940 prefix_length = Min(prefix_length, atom->length());

4897 i++;	4941 i++;

4898 }	4942 }

4899 if (i > first_with_prefix + 2) {	4943 if (i > first_with_prefix + 2) {

4900 // Found worthwhile run of alternatives with common prefix of at least one	4944 // Found worthwhile run of alternatives with common prefix of at least one

4901 // character. The sorting function above did not sort on more than one	4945 // character. The sorting function above did not sort on more than one

4902 // character for reasons of correctness, but there may still be a longer	4946 // character for reasons of correctness, but there may still be a longer

4903 // common prefix if the terms were similar or presorted in the input.	4947 // common prefix if the terms were similar or presorted in the input.

4904 // Find out how long the common prefix is.	4948 // Find out how long the common prefix is.

4905 int run_length = i - first_with_prefix;	4949 int run_length = i - first_with_prefix;

4906 atom = alternatives->at(first_with_prefix)->AsAtom();	4950 atom = alternatives->at(first_with_prefix)->AsAtom();

4907 for (int j = 1; j < run_length && prefix_length > 1; j++) {	4951 for (int j = 1; j < run_length && prefix_length > 1; j++) {

4908 RegExpAtom* old_atom =	4952 RegExpAtom* old_atom =

4909 alternatives->at(j + first_with_prefix)->AsAtom();	4953 alternatives->at(j + first_with_prefix)->AsAtom();

4910 for (int k = 1; k < prefix_length; k++) {	4954 for (int k = 1; k < prefix_length; k++) {

4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;	4955 if (atom->data().at(k) != old_atom->data().at(k)) {

	4956 prefix_length = k;

	4957 break;

	4958 }

4912 }	4959 }

4913 }	4960 }

4914 RegExpAtom* prefix =	4961 RegExpAtom* prefix =

4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));	4962 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

4916 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);	4963 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);

4917 pair->Add(prefix, zone);	4964 pair->Add(prefix, zone);

4918 ZoneList<RegExpTree> suffixes =	4965 ZoneList<RegExpTree> suffixes =

4919 new (zone) ZoneList<RegExpTree*>(run_length, zone);	4966 new (zone) ZoneList<RegExpTree*>(run_length, zone);

4920 for (int j = 0; j < run_length; j++) {	4967 for (int j = 0; j < run_length; j++) {

4921 RegExpAtom* old_atom =	4968 RegExpAtom* old_atom =

(...skipping 1424 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;	6393 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&	6394 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6348 heap->isolate()->memory_allocator()->SizeExecutable() >	6395 heap->isolate()->memory_allocator()->SizeExecutable() >

6349 RegExpImpl::kRegExpExecutableMemoryLimit) {	6396 RegExpImpl::kRegExpExecutableMemoryLimit) {

6350 too_much = true;	6397 too_much = true;

6351 }	6398 }

6352 return too_much;	6399 return too_much;

6353 }	6400 }

6354 } // namespace internal	6401 } // namespace internal

6355 } // namespace v8	6402 } // namespace v8

OLD	NEW

« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »