OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/ast.h" | 7 #include "src/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
(...skipping 4819 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4830 RegExpAtom* atom1 = (*a)->AsAtom(); | 4830 RegExpAtom* atom1 = (*a)->AsAtom(); |
4831 RegExpAtom* atom2 = (*b)->AsAtom(); | 4831 RegExpAtom* atom2 = (*b)->AsAtom(); |
4832 uc16 character1 = atom1->data().at(0); | 4832 uc16 character1 = atom1->data().at(0); |
4833 uc16 character2 = atom2->data().at(0); | 4833 uc16 character2 = atom2->data().at(0); |
4834 if (character1 < character2) return -1; | 4834 if (character1 < character2) return -1; |
4835 if (character1 > character2) return 1; | 4835 if (character1 > character2) return 1; |
4836 return 0; | 4836 return 0; |
4837 } | 4837 } |
4838 | 4838 |
4839 | 4839 |
| 4840 static unibrow::uchar Canonical( |
| 4841 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, |
| 4842 unibrow::uchar c) { |
| 4843 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; |
| 4844 int length = canonicalize->get(c, '\0', chars); |
| 4845 DCHECK_LE(length, 1); |
| 4846 unibrow::uchar canonical = c; |
| 4847 if (length == 1) canonical = chars[0]; |
| 4848 return canonical; |
| 4849 } |
| 4850 |
| 4851 |
| 4852 int CompareFirstCharCaseIndependent( |
| 4853 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, |
| 4854 RegExpTree* const* a, RegExpTree* const* b) { |
| 4855 RegExpAtom* atom1 = (*a)->AsAtom(); |
| 4856 RegExpAtom* atom2 = (*b)->AsAtom(); |
| 4857 unibrow::uchar character1 = atom1->data().at(0); |
| 4858 unibrow::uchar character2 = atom2->data().at(0); |
| 4859 if (character1 == character2) return 0; |
| 4860 if (character1 >= 'a' || character2 >= 'a') { |
| 4861 character1 = Canonical(canonicalize, character1); |
| 4862 character2 = Canonical(canonicalize, character2); |
| 4863 } |
| 4864 return static_cast<int>(character1) - static_cast<int>(character2); |
| 4865 } |
| 4866 |
| 4867 |
4840 // We can stable sort runs of atoms, since the order does not matter if they | 4868 // We can stable sort runs of atoms, since the order does not matter if they |
4841 // start with different characters. | 4869 // start with different characters. |
4842 // Returns true if any consecutive atoms were found. | 4870 // Returns true if any consecutive atoms were found. |
4843 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { | 4871 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { |
4844 ZoneList<RegExpTree*>* alternatives = this->alternatives(); | 4872 ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
4845 int length = alternatives->length(); | 4873 int length = alternatives->length(); |
4846 bool found_consecutive_atoms = false; | 4874 bool found_consecutive_atoms = false; |
4847 for (int i = 0; i < length; i++) { | 4875 for (int i = 0; i < length; i++) { |
4848 while (i < length) { | 4876 while (i < length) { |
4849 RegExpTree* alternative = alternatives->at(i); | 4877 RegExpTree* alternative = alternatives->at(i); |
4850 if (alternative->IsAtom()) break; | 4878 if (alternative->IsAtom()) break; |
4851 i++; | 4879 i++; |
4852 } | 4880 } |
4853 // i is length or it is the index of an atom. | 4881 // i is length or it is the index of an atom. |
4854 if (i == length) break; | 4882 if (i == length) break; |
4855 int first_atom = i; | 4883 int first_atom = i; |
4856 i++; | 4884 i++; |
4857 while (i < length) { | 4885 while (i < length) { |
4858 RegExpTree* alternative = alternatives->at(i); | 4886 RegExpTree* alternative = alternatives->at(i); |
4859 if (!alternative->IsAtom()) break; | 4887 if (!alternative->IsAtom()) break; |
4860 i++; | 4888 i++; |
4861 } | 4889 } |
4862 // Sort atoms to get ones with common prefixes together. | 4890 // Sort atoms to get ones with common prefixes together. |
4863 // This step is not valid if we are in a case-independent regexp, | 4891 // This step is more tricky if we are in a case-independent regexp, |
4864 // because it would change /is|I/ to /I|is/, and order matters when | 4892 // because it would change /is|I/ to /I|is/, and order matters when |
4865 // the regexp parts don't match only disjoint starting points. To fix | 4893 // the regexp parts don't match only disjoint starting points. To fix |
4866 // this would need a version of CompareFirstChar that uses case- | 4894 // this we have a version of CompareFirstChar that uses case- |
4867 // independent character classes for comparison. | 4895 // independent character classes for comparison. |
4868 if (!compiler->ignore_case()) { | 4896 DCHECK_LT(first_atom, alternatives->length()); |
4869 DCHECK_LT(first_atom, alternatives->length()); | 4897 DCHECK_LE(i, alternatives->length()); |
4870 DCHECK_LE(i, alternatives->length()); | 4898 DCHECK_LE(first_atom, i); |
4871 DCHECK_LE(first_atom, i); | 4899 if (compiler->ignore_case()) { |
| 4900 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = |
| 4901 compiler->isolate()->regexp_macro_assembler_canonicalize(); |
| 4902 auto compare_closure = |
| 4903 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { |
| 4904 return CompareFirstCharCaseIndependent(canonicalize, a, b); |
| 4905 }; |
| 4906 alternatives->StableSort(compare_closure, first_atom, i - first_atom); |
| 4907 } else { |
4872 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); | 4908 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); |
4873 } | 4909 } |
4874 if (i - first_atom > 1) found_consecutive_atoms = true; | 4910 if (i - first_atom > 1) found_consecutive_atoms = true; |
4875 } | 4911 } |
4876 return found_consecutive_atoms; | 4912 return found_consecutive_atoms; |
4877 } | 4913 } |
4878 | 4914 |
4879 | 4915 |
4880 // Optimizes ab|ac|az to a(?:b|c|d). | 4916 // Optimizes ab|ac|az to a(?:b|c|d). |
4881 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { | 4917 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { |
4882 Zone* zone = compiler->zone(); | 4918 Zone* zone = compiler->zone(); |
4883 ZoneList<RegExpTree*>* alternatives = this->alternatives(); | 4919 ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
4884 int length = alternatives->length(); | 4920 int length = alternatives->length(); |
4885 | 4921 |
4886 int write_posn = 0; | 4922 int write_posn = 0; |
4887 int i = 0; | 4923 int i = 0; |
4888 while (i < length) { | 4924 while (i < length) { |
4889 RegExpTree* alternative = alternatives->at(i); | 4925 RegExpTree* alternative = alternatives->at(i); |
4890 if (!alternative->IsAtom()) { | 4926 if (!alternative->IsAtom()) { |
4891 alternatives->at(write_posn++) = alternatives->at(i); | 4927 alternatives->at(write_posn++) = alternatives->at(i); |
4892 i++; | 4928 i++; |
4893 continue; | 4929 continue; |
4894 } | 4930 } |
4895 RegExpAtom* atom = alternative->AsAtom(); | 4931 RegExpAtom* atom = alternative->AsAtom(); |
4896 uc16 common_prefix = atom->data().at(0); | 4932 unibrow::uchar common_prefix = atom->data().at(0); |
4897 int first_with_prefix = i; | 4933 int first_with_prefix = i; |
4898 int prefix_length = atom->length(); | 4934 int prefix_length = atom->length(); |
4899 i++; | 4935 i++; |
4900 while (i < length) { | 4936 while (i < length) { |
4901 alternative = alternatives->at(i); | 4937 alternative = alternatives->at(i); |
4902 if (!alternative->IsAtom()) break; | 4938 if (!alternative->IsAtom()) break; |
4903 atom = alternative->AsAtom(); | 4939 atom = alternative->AsAtom(); |
4904 if (atom->data().at(0) != common_prefix) break; | 4940 unibrow::uchar new_prefix = atom->data().at(0); |
| 4941 if (new_prefix != common_prefix) { |
| 4942 if (!compiler->ignore_case()) break; |
| 4943 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = |
| 4944 compiler->isolate()->regexp_macro_assembler_canonicalize(); |
| 4945 new_prefix = Canonical(canonicalize, new_prefix); |
| 4946 common_prefix = Canonical(canonicalize, common_prefix); |
| 4947 if (new_prefix != common_prefix) break; |
| 4948 } |
4905 prefix_length = Min(prefix_length, atom->length()); | 4949 prefix_length = Min(prefix_length, atom->length()); |
4906 i++; | 4950 i++; |
4907 } | 4951 } |
4908 if (i > first_with_prefix + 2) { | 4952 if (i > first_with_prefix + 2) { |
4909 // Found worthwhile run of alternatives with common prefix of at least one | 4953 // Found worthwhile run of alternatives with common prefix of at least one |
4910 // character. The sorting function above did not sort on more than one | 4954 // character. The sorting function above did not sort on more than one |
4911 // character for reasons of correctness, but there may still be a longer | 4955 // character for reasons of correctness, but there may still be a longer |
4912 // common prefix if the terms were similar or presorted in the input. | 4956 // common prefix if the terms were similar or presorted in the input. |
4913 // Find out how long the common prefix is. | 4957 // Find out how long the common prefix is. |
4914 int run_length = i - first_with_prefix; | 4958 int run_length = i - first_with_prefix; |
4915 atom = alternatives->at(first_with_prefix)->AsAtom(); | 4959 atom = alternatives->at(first_with_prefix)->AsAtom(); |
4916 for (int j = 1; j < run_length && prefix_length > 1; j++) { | 4960 for (int j = 1; j < run_length && prefix_length > 1; j++) { |
4917 RegExpAtom* old_atom = | 4961 RegExpAtom* old_atom = |
4918 alternatives->at(j + first_with_prefix)->AsAtom(); | 4962 alternatives->at(j + first_with_prefix)->AsAtom(); |
4919 for (int k = 1; k < prefix_length; k++) { | 4963 for (int k = 1; k < prefix_length; k++) { |
4920 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; | 4964 if (atom->data().at(k) != old_atom->data().at(k)) { |
| 4965 prefix_length = k; |
| 4966 break; |
| 4967 } |
4921 } | 4968 } |
4922 } | 4969 } |
4923 RegExpAtom* prefix = | 4970 RegExpAtom* prefix = |
4924 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); | 4971 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); |
4925 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); | 4972 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); |
4926 pair->Add(prefix, zone); | 4973 pair->Add(prefix, zone); |
4927 ZoneList<RegExpTree*>* suffixes = | 4974 ZoneList<RegExpTree*>* suffixes = |
4928 new (zone) ZoneList<RegExpTree*>(run_length, zone); | 4975 new (zone) ZoneList<RegExpTree*>(run_length, zone); |
4929 for (int j = 0; j < run_length; j++) { | 4976 for (int j = 0; j < run_length; j++) { |
4930 RegExpAtom* old_atom = | 4977 RegExpAtom* old_atom = |
(...skipping 1424 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6355 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; | 6402 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; |
6356 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && | 6403 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && |
6357 heap->isolate()->memory_allocator()->SizeExecutable() > | 6404 heap->isolate()->memory_allocator()->SizeExecutable() > |
6358 RegExpImpl::kRegExpExecutableMemoryLimit) { | 6405 RegExpImpl::kRegExpExecutableMemoryLimit) { |
6359 too_much = true; | 6406 too_much = true; |
6360 } | 6407 } |
6361 return too_much; | 6408 return too_much; |
6362 } | 6409 } |
6363 } // namespace internal | 6410 } // namespace internal |
6364 } // namespace v8 | 6411 } // namespace v8 |
OLD | NEW |