src/jsregexp.cc - Issue 1204013003: Revert of Extend big-disjunction optimization to case-independent regexps

Side by Side Diff: src/jsregexp.cc

Issue 1204013003: Revert of Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Created 5 years, 5 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/v8.h"	5 #include "src/v8.h"

6	6

7 #include "src/ast.h"	7 #include "src/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

(...skipping 4810 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4821 RegExpAtom* atom1 = (*a)->AsAtom();	4821 RegExpAtom* atom1 = (*a)->AsAtom();

4822 RegExpAtom* atom2 = (*b)->AsAtom();	4822 RegExpAtom* atom2 = (*b)->AsAtom();

4823 uc16 character1 = atom1->data().at(0);	4823 uc16 character1 = atom1->data().at(0);

4824 uc16 character2 = atom2->data().at(0);	4824 uc16 character2 = atom2->data().at(0);

4825 if (character1 < character2) return -1;	4825 if (character1 < character2) return -1;

4826 if (character1 > character2) return 1;	4826 if (character1 > character2) return 1;

4827 return 0;	4827 return 0;

4828 }	4828 }

4829	4829

4830	4830

4831 static unibrow::uchar Canonical(

4832 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

4833 unibrow::uchar c) {

4834 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];

4835 int length = canonicalize->get(c, '\0', chars);

4836 DCHECK_LE(length, 1);

4837 unibrow::uchar canonical = c;

4838 if (length == 1) canonical = chars[0];

4839 return canonical;

4840 }

4841

4842

4843 int CompareFirstCharCaseIndependent(

4844 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,

4845 RegExpTree* const* a, RegExpTree* const* b) {

4846 RegExpAtom* atom1 = (*a)->AsAtom();

4847 RegExpAtom* atom2 = (*b)->AsAtom();

4848 unibrow::uchar character1 = atom1->data().at(0);

4849 unibrow::uchar character2 = atom2->data().at(0);

4850 if (character1 == character2) return 0;

4851 if (character1 >= 'a' \|\| character2 >= 'a') {

4852 character1 = Canonical(canonicalize, character1);

4853 character2 = Canonical(canonicalize, character2);

4854 }

4855 return static_cast<int>(character1) - static_cast<int>(character2);

4856 }

4857

4858

4859 // We can stable sort runs of atoms, since the order does not matter if they	4831 // We can stable sort runs of atoms, since the order does not matter if they

4860 // start with different characters.	4832 // start with different characters.

4861 // Returns true if any consecutive atoms were found.	4833 // Returns true if any consecutive atoms were found.

4862 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {	4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {

4863 ZoneList<RegExpTree> alternatives = this->alternatives();	4835 ZoneList<RegExpTree> alternatives = this->alternatives();

4864 int length = alternatives->length();	4836 int length = alternatives->length();

4865 bool found_consecutive_atoms = false;	4837 bool found_consecutive_atoms = false;

4866 for (int i = 0; i < length; i++) {	4838 for (int i = 0; i < length; i++) {

4867 while (i < length) {	4839 while (i < length) {

4868 RegExpTree* alternative = alternatives->at(i);	4840 RegExpTree* alternative = alternatives->at(i);

4869 if (alternative->IsAtom()) break;	4841 if (alternative->IsAtom()) break;

4870 i++;	4842 i++;

4871 }	4843 }

4872 // i is length or it is the index of an atom.	4844 // i is length or it is the index of an atom.

4873 if (i == length) break;	4845 if (i == length) break;

4874 int first_atom = i;	4846 int first_atom = i;

4875 i++;	4847 i++;

4876 while (i < length) {	4848 while (i < length) {

4877 RegExpTree* alternative = alternatives->at(i);	4849 RegExpTree* alternative = alternatives->at(i);

4878 if (!alternative->IsAtom()) break;	4850 if (!alternative->IsAtom()) break;

4879 i++;	4851 i++;

4880 }	4852 }

4881 // Sort atoms to get ones with common prefixes together.	4853 // Sort atoms to get ones with common prefixes together.

4882 // This step is more tricky if we are in a case-independent regexp,	4854 // This step is not valid if we are in a case-independent regexp,

4883 // because it would change /is\|I/ to /I\|is/, and order matters when	4855 // because it would change /is\|I/ to /I\|is/, and order matters when

4884 // the regexp parts don't match only disjoint starting points. To fix	4856 // the regexp parts don't match only disjoint starting points. To fix

4885 // this we have a version of CompareFirstChar that uses case-	4857 // this would need a version of CompareFirstChar that uses case-

4886 // independent character classes for comparison.	4858 // independent character classes for comparison.

4887 DCHECK_LT(first_atom, alternatives->length());	4859 if (!compiler->ignore_case()) {

4888 DCHECK_LE(i, alternatives->length());	4860 DCHECK_LT(first_atom, alternatives->length());

4889 DCHECK_LE(first_atom, i);	4861 DCHECK_LE(i, alternatives->length());

4890 if (compiler->ignore_case()) {	4862 DCHECK_LE(first_atom, i);

4891 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

4892 compiler->isolate()->regexp_macro_assembler_canonicalize();

4893 auto compare_closure =

4894 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {

4895 return CompareFirstCharCaseIndependent(canonicalize, a, b);

4896 };

4897 alternatives->StableSort(compare_closure, first_atom, i - first_atom);

4898 } else {

4899 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);	4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);

4900 }	4864 }

4901 if (i - first_atom > 1) found_consecutive_atoms = true;	4865 if (i - first_atom > 1) found_consecutive_atoms = true;

4902 }	4866 }

4903 return found_consecutive_atoms;	4867 return found_consecutive_atoms;

4904 }	4868 }

4905	4869

4906	4870

4907 // Optimizes ab\|ac\|az to a(?:b\|c\|d).	4871 // Optimizes ab\|ac\|az to a(?:b\|c\|d).

4908 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {	4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {

4909 Zone* zone = compiler->zone();	4873 Zone* zone = compiler->zone();

4910 ZoneList<RegExpTree> alternatives = this->alternatives();	4874 ZoneList<RegExpTree> alternatives = this->alternatives();

4911 int length = alternatives->length();	4875 int length = alternatives->length();

4912	4876

4913 int write_posn = 0;	4877 int write_posn = 0;

4914 int i = 0;	4878 int i = 0;

4915 while (i < length) {	4879 while (i < length) {

4916 RegExpTree* alternative = alternatives->at(i);	4880 RegExpTree* alternative = alternatives->at(i);

4917 if (!alternative->IsAtom()) {	4881 if (!alternative->IsAtom()) {

4918 alternatives->at(write_posn++) = alternatives->at(i);	4882 alternatives->at(write_posn++) = alternatives->at(i);

4919 i++;	4883 i++;

4920 continue;	4884 continue;

4921 }	4885 }

4922 RegExpAtom* atom = alternative->AsAtom();	4886 RegExpAtom* atom = alternative->AsAtom();

4923 unibrow::uchar common_prefix = atom->data().at(0);	4887 uc16 common_prefix = atom->data().at(0);

4924 int first_with_prefix = i;	4888 int first_with_prefix = i;

4925 int prefix_length = atom->length();	4889 int prefix_length = atom->length();

4926 i++;	4890 i++;

4927 while (i < length) {	4891 while (i < length) {

4928 alternative = alternatives->at(i);	4892 alternative = alternatives->at(i);

4929 if (!alternative->IsAtom()) break;	4893 if (!alternative->IsAtom()) break;

4930 atom = alternative->AsAtom();	4894 atom = alternative->AsAtom();

4931 unibrow::uchar new_prefix = atom->data().at(0);	4895 if (atom->data().at(0) != common_prefix) break;

4932 if (new_prefix != common_prefix) {

4933 if (!compiler->ignore_case()) break;

4934 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =

4935 compiler->isolate()->regexp_macro_assembler_canonicalize();

4936 new_prefix = Canonical(canonicalize, new_prefix);

4937 common_prefix = Canonical(canonicalize, common_prefix);

4938 if (new_prefix != common_prefix) break;

4939 }

4940 prefix_length = Min(prefix_length, atom->length());	4896 prefix_length = Min(prefix_length, atom->length());

4941 i++;	4897 i++;

4942 }	4898 }

4943 if (i > first_with_prefix + 2) {	4899 if (i > first_with_prefix + 2) {

4944 // Found worthwhile run of alternatives with common prefix of at least one	4900 // Found worthwhile run of alternatives with common prefix of at least one

4945 // character. The sorting function above did not sort on more than one	4901 // character. The sorting function above did not sort on more than one

4946 // character for reasons of correctness, but there may still be a longer	4902 // character for reasons of correctness, but there may still be a longer

4947 // common prefix if the terms were similar or presorted in the input.	4903 // common prefix if the terms were similar or presorted in the input.

4948 // Find out how long the common prefix is.	4904 // Find out how long the common prefix is.

4949 int run_length = i - first_with_prefix;	4905 int run_length = i - first_with_prefix;

4950 atom = alternatives->at(first_with_prefix)->AsAtom();	4906 atom = alternatives->at(first_with_prefix)->AsAtom();

4951 for (int j = 1; j < run_length && prefix_length > 1; j++) {	4907 for (int j = 1; j < run_length && prefix_length > 1; j++) {

4952 RegExpAtom* old_atom =	4908 RegExpAtom* old_atom =

4953 alternatives->at(j + first_with_prefix)->AsAtom();	4909 alternatives->at(j + first_with_prefix)->AsAtom();

4954 for (int k = 1; k < prefix_length; k++) {	4910 for (int k = 1; k < prefix_length; k++) {

4955 if (atom->data().at(k) != old_atom->data().at(k)) {	4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k;

4956 prefix_length = k;

4957 break;

4958 }

4959 }	4912 }

4960 }	4913 }

4961 RegExpAtom* prefix =	4914 RegExpAtom* prefix =

4962 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));	4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));

4963 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);	4916 ZoneList<RegExpTree> pair = new (zone) ZoneList<RegExpTree*>(2, zone);

4964 pair->Add(prefix, zone);	4917 pair->Add(prefix, zone);

4965 ZoneList<RegExpTree> suffixes =	4918 ZoneList<RegExpTree> suffixes =

4966 new (zone) ZoneList<RegExpTree*>(run_length, zone);	4919 new (zone) ZoneList<RegExpTree*>(run_length, zone);

4967 for (int j = 0; j < run_length; j++) {	4920 for (int j = 0; j < run_length; j++) {

4968 RegExpAtom* old_atom =	4921 RegExpAtom* old_atom =

(...skipping 1424 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6393 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;	6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;

6394 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&	6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&

6395 heap->isolate()->memory_allocator()->SizeExecutable() >	6348 heap->isolate()->memory_allocator()->SizeExecutable() >

6396 RegExpImpl::kRegExpExecutableMemoryLimit) {	6349 RegExpImpl::kRegExpExecutableMemoryLimit) {

6397 too_much = true;	6350 too_much = true;

6398 }	6351 }

6399 return too_much;	6352 return too_much;

6400 }	6353 }

6401 } // namespace internal	6354 } // namespace internal

6402 } // namespace v8	6355 } // namespace v8

OLD	NEW

« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »