OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/v8.h" | 5 #include "src/v8.h" |
6 | 6 |
7 #include "src/ast.h" | 7 #include "src/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
(...skipping 4810 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4821 RegExpAtom* atom1 = (*a)->AsAtom(); | 4821 RegExpAtom* atom1 = (*a)->AsAtom(); |
4822 RegExpAtom* atom2 = (*b)->AsAtom(); | 4822 RegExpAtom* atom2 = (*b)->AsAtom(); |
4823 uc16 character1 = atom1->data().at(0); | 4823 uc16 character1 = atom1->data().at(0); |
4824 uc16 character2 = atom2->data().at(0); | 4824 uc16 character2 = atom2->data().at(0); |
4825 if (character1 < character2) return -1; | 4825 if (character1 < character2) return -1; |
4826 if (character1 > character2) return 1; | 4826 if (character1 > character2) return 1; |
4827 return 0; | 4827 return 0; |
4828 } | 4828 } |
4829 | 4829 |
4830 | 4830 |
4831 static unibrow::uchar Canonical( | |
4832 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, | |
4833 unibrow::uchar c) { | |
4834 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth]; | |
4835 int length = canonicalize->get(c, '\0', chars); | |
4836 DCHECK_LE(length, 1); | |
4837 unibrow::uchar canonical = c; | |
4838 if (length == 1) canonical = chars[0]; | |
4839 return canonical; | |
4840 } | |
4841 | |
4842 | |
4843 int CompareFirstCharCaseIndependent( | |
4844 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize, | |
4845 RegExpTree* const* a, RegExpTree* const* b) { | |
4846 RegExpAtom* atom1 = (*a)->AsAtom(); | |
4847 RegExpAtom* atom2 = (*b)->AsAtom(); | |
4848 unibrow::uchar character1 = atom1->data().at(0); | |
4849 unibrow::uchar character2 = atom2->data().at(0); | |
4850 if (character1 == character2) return 0; | |
4851 if (character1 >= 'a' || character2 >= 'a') { | |
4852 character1 = Canonical(canonicalize, character1); | |
4853 character2 = Canonical(canonicalize, character2); | |
4854 } | |
4855 return static_cast<int>(character1) - static_cast<int>(character2); | |
4856 } | |
4857 | |
4858 | |
4859 // We can stable sort runs of atoms, since the order does not matter if they | 4831 // We can stable sort runs of atoms, since the order does not matter if they |
4860 // start with different characters. | 4832 // start with different characters. |
4861 // Returns true if any consecutive atoms were found. | 4833 // Returns true if any consecutive atoms were found. |
4862 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { | 4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { |
4863 ZoneList<RegExpTree*>* alternatives = this->alternatives(); | 4835 ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
4864 int length = alternatives->length(); | 4836 int length = alternatives->length(); |
4865 bool found_consecutive_atoms = false; | 4837 bool found_consecutive_atoms = false; |
4866 for (int i = 0; i < length; i++) { | 4838 for (int i = 0; i < length; i++) { |
4867 while (i < length) { | 4839 while (i < length) { |
4868 RegExpTree* alternative = alternatives->at(i); | 4840 RegExpTree* alternative = alternatives->at(i); |
4869 if (alternative->IsAtom()) break; | 4841 if (alternative->IsAtom()) break; |
4870 i++; | 4842 i++; |
4871 } | 4843 } |
4872 // i is length or it is the index of an atom. | 4844 // i is length or it is the index of an atom. |
4873 if (i == length) break; | 4845 if (i == length) break; |
4874 int first_atom = i; | 4846 int first_atom = i; |
4875 i++; | 4847 i++; |
4876 while (i < length) { | 4848 while (i < length) { |
4877 RegExpTree* alternative = alternatives->at(i); | 4849 RegExpTree* alternative = alternatives->at(i); |
4878 if (!alternative->IsAtom()) break; | 4850 if (!alternative->IsAtom()) break; |
4879 i++; | 4851 i++; |
4880 } | 4852 } |
4881 // Sort atoms to get ones with common prefixes together. | 4853 // Sort atoms to get ones with common prefixes together. |
4882 // This step is more tricky if we are in a case-independent regexp, | 4854 // This step is not valid if we are in a case-independent regexp, |
4883 // because it would change /is|I/ to /I|is/, and order matters when | 4855 // because it would change /is|I/ to /I|is/, and order matters when |
4884 // the regexp parts don't match only disjoint starting points. To fix | 4856 // the regexp parts don't match only disjoint starting points. To fix |
4885 // this we have a version of CompareFirstChar that uses case- | 4857 // this would need a version of CompareFirstChar that uses case- |
4886 // independent character classes for comparison. | 4858 // independent character classes for comparison. |
4887 DCHECK_LT(first_atom, alternatives->length()); | 4859 if (!compiler->ignore_case()) { |
4888 DCHECK_LE(i, alternatives->length()); | 4860 DCHECK_LT(first_atom, alternatives->length()); |
4889 DCHECK_LE(first_atom, i); | 4861 DCHECK_LE(i, alternatives->length()); |
4890 if (compiler->ignore_case()) { | 4862 DCHECK_LE(first_atom, i); |
4891 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = | |
4892 compiler->isolate()->regexp_macro_assembler_canonicalize(); | |
4893 auto compare_closure = | |
4894 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) { | |
4895 return CompareFirstCharCaseIndependent(canonicalize, a, b); | |
4896 }; | |
4897 alternatives->StableSort(compare_closure, first_atom, i - first_atom); | |
4898 } else { | |
4899 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); | 4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); |
4900 } | 4864 } |
4901 if (i - first_atom > 1) found_consecutive_atoms = true; | 4865 if (i - first_atom > 1) found_consecutive_atoms = true; |
4902 } | 4866 } |
4903 return found_consecutive_atoms; | 4867 return found_consecutive_atoms; |
4904 } | 4868 } |
4905 | 4869 |
4906 | 4870 |
4907 // Optimizes ab|ac|az to a(?:b|c|d). | 4871 // Optimizes ab|ac|az to a(?:b|c|d). |
4908 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { | 4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { |
4909 Zone* zone = compiler->zone(); | 4873 Zone* zone = compiler->zone(); |
4910 ZoneList<RegExpTree*>* alternatives = this->alternatives(); | 4874 ZoneList<RegExpTree*>* alternatives = this->alternatives(); |
4911 int length = alternatives->length(); | 4875 int length = alternatives->length(); |
4912 | 4876 |
4913 int write_posn = 0; | 4877 int write_posn = 0; |
4914 int i = 0; | 4878 int i = 0; |
4915 while (i < length) { | 4879 while (i < length) { |
4916 RegExpTree* alternative = alternatives->at(i); | 4880 RegExpTree* alternative = alternatives->at(i); |
4917 if (!alternative->IsAtom()) { | 4881 if (!alternative->IsAtom()) { |
4918 alternatives->at(write_posn++) = alternatives->at(i); | 4882 alternatives->at(write_posn++) = alternatives->at(i); |
4919 i++; | 4883 i++; |
4920 continue; | 4884 continue; |
4921 } | 4885 } |
4922 RegExpAtom* atom = alternative->AsAtom(); | 4886 RegExpAtom* atom = alternative->AsAtom(); |
4923 unibrow::uchar common_prefix = atom->data().at(0); | 4887 uc16 common_prefix = atom->data().at(0); |
4924 int first_with_prefix = i; | 4888 int first_with_prefix = i; |
4925 int prefix_length = atom->length(); | 4889 int prefix_length = atom->length(); |
4926 i++; | 4890 i++; |
4927 while (i < length) { | 4891 while (i < length) { |
4928 alternative = alternatives->at(i); | 4892 alternative = alternatives->at(i); |
4929 if (!alternative->IsAtom()) break; | 4893 if (!alternative->IsAtom()) break; |
4930 atom = alternative->AsAtom(); | 4894 atom = alternative->AsAtom(); |
4931 unibrow::uchar new_prefix = atom->data().at(0); | 4895 if (atom->data().at(0) != common_prefix) break; |
4932 if (new_prefix != common_prefix) { | |
4933 if (!compiler->ignore_case()) break; | |
4934 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize = | |
4935 compiler->isolate()->regexp_macro_assembler_canonicalize(); | |
4936 new_prefix = Canonical(canonicalize, new_prefix); | |
4937 common_prefix = Canonical(canonicalize, common_prefix); | |
4938 if (new_prefix != common_prefix) break; | |
4939 } | |
4940 prefix_length = Min(prefix_length, atom->length()); | 4896 prefix_length = Min(prefix_length, atom->length()); |
4941 i++; | 4897 i++; |
4942 } | 4898 } |
4943 if (i > first_with_prefix + 2) { | 4899 if (i > first_with_prefix + 2) { |
4944 // Found worthwhile run of alternatives with common prefix of at least one | 4900 // Found worthwhile run of alternatives with common prefix of at least one |
4945 // character. The sorting function above did not sort on more than one | 4901 // character. The sorting function above did not sort on more than one |
4946 // character for reasons of correctness, but there may still be a longer | 4902 // character for reasons of correctness, but there may still be a longer |
4947 // common prefix if the terms were similar or presorted in the input. | 4903 // common prefix if the terms were similar or presorted in the input. |
4948 // Find out how long the common prefix is. | 4904 // Find out how long the common prefix is. |
4949 int run_length = i - first_with_prefix; | 4905 int run_length = i - first_with_prefix; |
4950 atom = alternatives->at(first_with_prefix)->AsAtom(); | 4906 atom = alternatives->at(first_with_prefix)->AsAtom(); |
4951 for (int j = 1; j < run_length && prefix_length > 1; j++) { | 4907 for (int j = 1; j < run_length && prefix_length > 1; j++) { |
4952 RegExpAtom* old_atom = | 4908 RegExpAtom* old_atom = |
4953 alternatives->at(j + first_with_prefix)->AsAtom(); | 4909 alternatives->at(j + first_with_prefix)->AsAtom(); |
4954 for (int k = 1; k < prefix_length; k++) { | 4910 for (int k = 1; k < prefix_length; k++) { |
4955 if (atom->data().at(k) != old_atom->data().at(k)) { | 4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; |
4956 prefix_length = k; | |
4957 break; | |
4958 } | |
4959 } | 4912 } |
4960 } | 4913 } |
4961 RegExpAtom* prefix = | 4914 RegExpAtom* prefix = |
4962 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); | 4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); |
4963 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); | 4916 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); |
4964 pair->Add(prefix, zone); | 4917 pair->Add(prefix, zone); |
4965 ZoneList<RegExpTree*>* suffixes = | 4918 ZoneList<RegExpTree*>* suffixes = |
4966 new (zone) ZoneList<RegExpTree*>(run_length, zone); | 4919 new (zone) ZoneList<RegExpTree*>(run_length, zone); |
4967 for (int j = 0; j < run_length; j++) { | 4920 for (int j = 0; j < run_length; j++) { |
4968 RegExpAtom* old_atom = | 4921 RegExpAtom* old_atom = |
(...skipping 1424 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6393 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; | 6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; |
6394 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && | 6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && |
6395 heap->isolate()->memory_allocator()->SizeExecutable() > | 6348 heap->isolate()->memory_allocator()->SizeExecutable() > |
6396 RegExpImpl::kRegExpExecutableMemoryLimit) { | 6349 RegExpImpl::kRegExpExecutableMemoryLimit) { |
6397 too_much = true; | 6350 too_much = true; |
6398 } | 6351 } |
6399 return too_much; | 6352 return too_much; |
6400 } | 6353 } |
6401 } // namespace internal | 6354 } // namespace internal |
6402 } // namespace v8 | 6355 } // namespace v8 |
OLD | NEW |