Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(718)

Side by Side Diff: src/jsregexp.cc

Issue 1182783009: Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Fix 80 columns in test Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 4810 matching lines...) Expand 10 before | Expand all | Expand 10 after
4821 RegExpAtom* atom1 = (*a)->AsAtom(); 4821 RegExpAtom* atom1 = (*a)->AsAtom();
4822 RegExpAtom* atom2 = (*b)->AsAtom(); 4822 RegExpAtom* atom2 = (*b)->AsAtom();
4823 uc16 character1 = atom1->data().at(0); 4823 uc16 character1 = atom1->data().at(0);
4824 uc16 character2 = atom2->data().at(0); 4824 uc16 character2 = atom2->data().at(0);
4825 if (character1 < character2) return -1; 4825 if (character1 < character2) return -1;
4826 if (character1 > character2) return 1; 4826 if (character1 > character2) return 1;
4827 return 0; 4827 return 0;
4828 } 4828 }
4829 4829
4830 4830
4831 static unibrow::uchar Canonical(
4832 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4833 unibrow::uchar c) {
4834 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
4835 int length = canonicalize->get(c, '\0', chars);
4836 DCHECK_LE(length, 1);
4837 unibrow::uchar canonical = c;
4838 if (length == 1) canonical = chars[0];
4839 return canonical;
4840 }
4841
4842
4843 int CompareFirstCharCaseIndependent(
4844 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4845 RegExpTree* const* a, RegExpTree* const* b) {
4846 RegExpAtom* atom1 = (*a)->AsAtom();
4847 RegExpAtom* atom2 = (*b)->AsAtom();
4848 unibrow::uchar character1 = atom1->data().at(0);
4849 unibrow::uchar character2 = atom2->data().at(0);
4850 if (character1 == character2) return 0;
4851 if (character1 >= 'a' || character2 >= 'a') {
4852 character1 = Canonical(canonicalize, character1);
4853 character2 = Canonical(canonicalize, character2);
4854 }
4855 return static_cast<int>(character1) - static_cast<int>(character2);
4856 }
4857
4858
4831 // We can stable sort runs of atoms, since the order does not matter if they 4859 // We can stable sort runs of atoms, since the order does not matter if they
4832 // start with different characters. 4860 // start with different characters.
4833 // Returns true if any consecutive atoms were found. 4861 // Returns true if any consecutive atoms were found.
4834 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 4862 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4835 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4863 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4836 int length = alternatives->length(); 4864 int length = alternatives->length();
4837 bool found_consecutive_atoms = false; 4865 bool found_consecutive_atoms = false;
4838 for (int i = 0; i < length; i++) { 4866 for (int i = 0; i < length; i++) {
4839 while (i < length) { 4867 while (i < length) {
4840 RegExpTree* alternative = alternatives->at(i); 4868 RegExpTree* alternative = alternatives->at(i);
4841 if (alternative->IsAtom()) break; 4869 if (alternative->IsAtom()) break;
4842 i++; 4870 i++;
4843 } 4871 }
4844 // i is length or it is the index of an atom. 4872 // i is length or it is the index of an atom.
4845 if (i == length) break; 4873 if (i == length) break;
4846 int first_atom = i; 4874 int first_atom = i;
4847 i++; 4875 i++;
4848 while (i < length) { 4876 while (i < length) {
4849 RegExpTree* alternative = alternatives->at(i); 4877 RegExpTree* alternative = alternatives->at(i);
4850 if (!alternative->IsAtom()) break; 4878 if (!alternative->IsAtom()) break;
4851 i++; 4879 i++;
4852 } 4880 }
4853 // Sort atoms to get ones with common prefixes together. 4881 // Sort atoms to get ones with common prefixes together.
4854 // This step is not valid if we are in a case-independent regexp, 4882 // This step is more tricky if we are in a case-independent regexp,
4855 // because it would change /is|I/ to /I|is/, and order matters when 4883 // because it would change /is|I/ to /I|is/, and order matters when
4856 // the regexp parts don't match only disjoint starting points. To fix 4884 // the regexp parts don't match only disjoint starting points. To fix
4857 // this would need a version of CompareFirstChar that uses case- 4885 // this we have a version of CompareFirstChar that uses case-
4858 // independent character classes for comparison. 4886 // independent character classes for comparison.
4859 if (!compiler->ignore_case()) { 4887 DCHECK_LT(first_atom, alternatives->length());
4860 DCHECK_LT(first_atom, alternatives->length()); 4888 DCHECK_LE(i, alternatives->length());
4861 DCHECK_LE(i, alternatives->length()); 4889 DCHECK_LE(first_atom, i);
4862 DCHECK_LE(first_atom, i); 4890 if (compiler->ignore_case()) {
4891 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4892 compiler->isolate()->regexp_macro_assembler_canonicalize();
4893 auto compare_closure =
4894 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
4895 return CompareFirstCharCaseIndependent(canonicalize, a, b);
4896 };
4897 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
4898 } else {
4863 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 4899 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4864 } 4900 }
4865 if (i - first_atom > 1) found_consecutive_atoms = true; 4901 if (i - first_atom > 1) found_consecutive_atoms = true;
4866 } 4902 }
4867 return found_consecutive_atoms; 4903 return found_consecutive_atoms;
4868 } 4904 }
4869 4905
4870 4906
4871 // Optimizes ab|ac|az to a(?:b|c|d). 4907 // Optimizes ab|ac|az to a(?:b|c|d).
4872 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 4908 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4873 Zone* zone = compiler->zone(); 4909 Zone* zone = compiler->zone();
4874 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4910 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4875 int length = alternatives->length(); 4911 int length = alternatives->length();
4876 4912
4877 int write_posn = 0; 4913 int write_posn = 0;
4878 int i = 0; 4914 int i = 0;
4879 while (i < length) { 4915 while (i < length) {
4880 RegExpTree* alternative = alternatives->at(i); 4916 RegExpTree* alternative = alternatives->at(i);
4881 if (!alternative->IsAtom()) { 4917 if (!alternative->IsAtom()) {
4882 alternatives->at(write_posn++) = alternatives->at(i); 4918 alternatives->at(write_posn++) = alternatives->at(i);
4883 i++; 4919 i++;
4884 continue; 4920 continue;
4885 } 4921 }
4886 RegExpAtom* atom = alternative->AsAtom(); 4922 RegExpAtom* atom = alternative->AsAtom();
4887 uc16 common_prefix = atom->data().at(0); 4923 unibrow::uchar common_prefix = atom->data().at(0);
4888 int first_with_prefix = i; 4924 int first_with_prefix = i;
4889 int prefix_length = atom->length(); 4925 int prefix_length = atom->length();
4890 i++; 4926 i++;
4891 while (i < length) { 4927 while (i < length) {
4892 alternative = alternatives->at(i); 4928 alternative = alternatives->at(i);
4893 if (!alternative->IsAtom()) break; 4929 if (!alternative->IsAtom()) break;
4894 atom = alternative->AsAtom(); 4930 atom = alternative->AsAtom();
4895 if (atom->data().at(0) != common_prefix) break; 4931 unibrow::uchar new_prefix = atom->data().at(0);
4932 if (new_prefix != common_prefix) {
4933 if (!compiler->ignore_case()) break;
4934 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4935 compiler->isolate()->regexp_macro_assembler_canonicalize();
4936 new_prefix = Canonical(canonicalize, new_prefix);
4937 common_prefix = Canonical(canonicalize, common_prefix);
4938 if (new_prefix != common_prefix) break;
4939 }
4896 prefix_length = Min(prefix_length, atom->length()); 4940 prefix_length = Min(prefix_length, atom->length());
4897 i++; 4941 i++;
4898 } 4942 }
4899 if (i > first_with_prefix + 2) { 4943 if (i > first_with_prefix + 2) {
4900 // Found worthwhile run of alternatives with common prefix of at least one 4944 // Found worthwhile run of alternatives with common prefix of at least one
4901 // character. The sorting function above did not sort on more than one 4945 // character. The sorting function above did not sort on more than one
4902 // character for reasons of correctness, but there may still be a longer 4946 // character for reasons of correctness, but there may still be a longer
4903 // common prefix if the terms were similar or presorted in the input. 4947 // common prefix if the terms were similar or presorted in the input.
4904 // Find out how long the common prefix is. 4948 // Find out how long the common prefix is.
4905 int run_length = i - first_with_prefix; 4949 int run_length = i - first_with_prefix;
4906 atom = alternatives->at(first_with_prefix)->AsAtom(); 4950 atom = alternatives->at(first_with_prefix)->AsAtom();
4907 for (int j = 1; j < run_length && prefix_length > 1; j++) { 4951 for (int j = 1; j < run_length && prefix_length > 1; j++) {
4908 RegExpAtom* old_atom = 4952 RegExpAtom* old_atom =
4909 alternatives->at(j + first_with_prefix)->AsAtom(); 4953 alternatives->at(j + first_with_prefix)->AsAtom();
4910 for (int k = 1; k < prefix_length; k++) { 4954 for (int k = 1; k < prefix_length; k++) {
4911 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; 4955 if (atom->data().at(k) != old_atom->data().at(k)) {
4956 prefix_length = k;
4957 break;
4958 }
4912 } 4959 }
4913 } 4960 }
4914 RegExpAtom* prefix = 4961 RegExpAtom* prefix =
4915 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 4962 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4916 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 4963 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4917 pair->Add(prefix, zone); 4964 pair->Add(prefix, zone);
4918 ZoneList<RegExpTree*>* suffixes = 4965 ZoneList<RegExpTree*>* suffixes =
4919 new (zone) ZoneList<RegExpTree*>(run_length, zone); 4966 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4920 for (int j = 0; j < run_length; j++) { 4967 for (int j = 0; j < run_length; j++) {
4921 RegExpAtom* old_atom = 4968 RegExpAtom* old_atom =
(...skipping 1424 matching lines...) Expand 10 before | Expand all | Expand 10 after
6346 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6393 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6347 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6394 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6348 heap->isolate()->memory_allocator()->SizeExecutable() > 6395 heap->isolate()->memory_allocator()->SizeExecutable() >
6349 RegExpImpl::kRegExpExecutableMemoryLimit) { 6396 RegExpImpl::kRegExpExecutableMemoryLimit) {
6350 too_much = true; 6397 too_much = true;
6351 } 6398 }
6352 return too_much; 6399 return too_much;
6353 } 6400 }
6354 } // namespace internal 6401 } // namespace internal
6355 } // namespace v8 6402 } // namespace v8
OLDNEW
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698