Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(180)

Side by Side Diff: src/jsregexp.cc

Issue 1204123003: Reland Extend big-disjunction optimization to case-independent regexps (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Skip slow architecture-independent tests on slower CPUs and simulators Created 5 years, 6 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/v8.h" 5 #include "src/v8.h"
6 6
7 #include "src/ast.h" 7 #include "src/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 4819 matching lines...) Expand 10 before | Expand all | Expand 10 after
4830 RegExpAtom* atom1 = (*a)->AsAtom(); 4830 RegExpAtom* atom1 = (*a)->AsAtom();
4831 RegExpAtom* atom2 = (*b)->AsAtom(); 4831 RegExpAtom* atom2 = (*b)->AsAtom();
4832 uc16 character1 = atom1->data().at(0); 4832 uc16 character1 = atom1->data().at(0);
4833 uc16 character2 = atom2->data().at(0); 4833 uc16 character2 = atom2->data().at(0);
4834 if (character1 < character2) return -1; 4834 if (character1 < character2) return -1;
4835 if (character1 > character2) return 1; 4835 if (character1 > character2) return 1;
4836 return 0; 4836 return 0;
4837 } 4837 }
4838 4838
4839 4839
4840 static unibrow::uchar Canonical(
4841 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4842 unibrow::uchar c) {
4843 unibrow::uchar chars[unibrow::Ecma262Canonicalize::kMaxWidth];
4844 int length = canonicalize->get(c, '\0', chars);
4845 DCHECK_LE(length, 1);
4846 unibrow::uchar canonical = c;
4847 if (length == 1) canonical = chars[0];
4848 return canonical;
4849 }
4850
4851
4852 int CompareFirstCharCaseIndependent(
4853 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize,
4854 RegExpTree* const* a, RegExpTree* const* b) {
4855 RegExpAtom* atom1 = (*a)->AsAtom();
4856 RegExpAtom* atom2 = (*b)->AsAtom();
4857 unibrow::uchar character1 = atom1->data().at(0);
4858 unibrow::uchar character2 = atom2->data().at(0);
4859 if (character1 == character2) return 0;
4860 if (character1 >= 'a' || character2 >= 'a') {
4861 character1 = Canonical(canonicalize, character1);
4862 character2 = Canonical(canonicalize, character2);
4863 }
4864 return static_cast<int>(character1) - static_cast<int>(character2);
4865 }
4866
4867
4840 // We can stable sort runs of atoms, since the order does not matter if they 4868 // We can stable sort runs of atoms, since the order does not matter if they
4841 // start with different characters. 4869 // start with different characters.
4842 // Returns true if any consecutive atoms were found. 4870 // Returns true if any consecutive atoms were found.
4843 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) { 4871 bool RegExpDisjunction::SortConsecutiveAtoms(RegExpCompiler* compiler) {
4844 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4872 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4845 int length = alternatives->length(); 4873 int length = alternatives->length();
4846 bool found_consecutive_atoms = false; 4874 bool found_consecutive_atoms = false;
4847 for (int i = 0; i < length; i++) { 4875 for (int i = 0; i < length; i++) {
4848 while (i < length) { 4876 while (i < length) {
4849 RegExpTree* alternative = alternatives->at(i); 4877 RegExpTree* alternative = alternatives->at(i);
4850 if (alternative->IsAtom()) break; 4878 if (alternative->IsAtom()) break;
4851 i++; 4879 i++;
4852 } 4880 }
4853 // i is length or it is the index of an atom. 4881 // i is length or it is the index of an atom.
4854 if (i == length) break; 4882 if (i == length) break;
4855 int first_atom = i; 4883 int first_atom = i;
4856 i++; 4884 i++;
4857 while (i < length) { 4885 while (i < length) {
4858 RegExpTree* alternative = alternatives->at(i); 4886 RegExpTree* alternative = alternatives->at(i);
4859 if (!alternative->IsAtom()) break; 4887 if (!alternative->IsAtom()) break;
4860 i++; 4888 i++;
4861 } 4889 }
4862 // Sort atoms to get ones with common prefixes together. 4890 // Sort atoms to get ones with common prefixes together.
4863 // This step is not valid if we are in a case-independent regexp, 4891 // This step is more tricky if we are in a case-independent regexp,
4864 // because it would change /is|I/ to /I|is/, and order matters when 4892 // because it would change /is|I/ to /I|is/, and order matters when
4865 // the regexp parts don't match only disjoint starting points. To fix 4893 // the regexp parts don't match only disjoint starting points. To fix
4866 // this would need a version of CompareFirstChar that uses case- 4894 // this we have a version of CompareFirstChar that uses case-
4867 // independent character classes for comparison. 4895 // independent character classes for comparison.
4868 if (!compiler->ignore_case()) { 4896 DCHECK_LT(first_atom, alternatives->length());
4869 DCHECK_LT(first_atom, alternatives->length()); 4897 DCHECK_LE(i, alternatives->length());
4870 DCHECK_LE(i, alternatives->length()); 4898 DCHECK_LE(first_atom, i);
4871 DCHECK_LE(first_atom, i); 4899 if (compiler->ignore_case()) {
4900 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4901 compiler->isolate()->regexp_macro_assembler_canonicalize();
4902 auto compare_closure =
4903 [canonicalize](RegExpTree* const* a, RegExpTree* const* b) {
4904 return CompareFirstCharCaseIndependent(canonicalize, a, b);
4905 };
4906 alternatives->StableSort(compare_closure, first_atom, i - first_atom);
4907 } else {
4872 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom); 4908 alternatives->StableSort(CompareFirstChar, first_atom, i - first_atom);
4873 } 4909 }
4874 if (i - first_atom > 1) found_consecutive_atoms = true; 4910 if (i - first_atom > 1) found_consecutive_atoms = true;
4875 } 4911 }
4876 return found_consecutive_atoms; 4912 return found_consecutive_atoms;
4877 } 4913 }
4878 4914
4879 4915
4880 // Optimizes ab|ac|az to a(?:b|c|d). 4916 // Optimizes ab|ac|az to a(?:b|c|d).
4881 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) { 4917 void RegExpDisjunction::RationalizeConsecutiveAtoms(RegExpCompiler* compiler) {
4882 Zone* zone = compiler->zone(); 4918 Zone* zone = compiler->zone();
4883 ZoneList<RegExpTree*>* alternatives = this->alternatives(); 4919 ZoneList<RegExpTree*>* alternatives = this->alternatives();
4884 int length = alternatives->length(); 4920 int length = alternatives->length();
4885 4921
4886 int write_posn = 0; 4922 int write_posn = 0;
4887 int i = 0; 4923 int i = 0;
4888 while (i < length) { 4924 while (i < length) {
4889 RegExpTree* alternative = alternatives->at(i); 4925 RegExpTree* alternative = alternatives->at(i);
4890 if (!alternative->IsAtom()) { 4926 if (!alternative->IsAtom()) {
4891 alternatives->at(write_posn++) = alternatives->at(i); 4927 alternatives->at(write_posn++) = alternatives->at(i);
4892 i++; 4928 i++;
4893 continue; 4929 continue;
4894 } 4930 }
4895 RegExpAtom* atom = alternative->AsAtom(); 4931 RegExpAtom* atom = alternative->AsAtom();
4896 uc16 common_prefix = atom->data().at(0); 4932 unibrow::uchar common_prefix = atom->data().at(0);
4897 int first_with_prefix = i; 4933 int first_with_prefix = i;
4898 int prefix_length = atom->length(); 4934 int prefix_length = atom->length();
4899 i++; 4935 i++;
4900 while (i < length) { 4936 while (i < length) {
4901 alternative = alternatives->at(i); 4937 alternative = alternatives->at(i);
4902 if (!alternative->IsAtom()) break; 4938 if (!alternative->IsAtom()) break;
4903 atom = alternative->AsAtom(); 4939 atom = alternative->AsAtom();
4904 if (atom->data().at(0) != common_prefix) break; 4940 unibrow::uchar new_prefix = atom->data().at(0);
4941 if (new_prefix != common_prefix) {
4942 if (!compiler->ignore_case()) break;
4943 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
4944 compiler->isolate()->regexp_macro_assembler_canonicalize();
4945 new_prefix = Canonical(canonicalize, new_prefix);
4946 common_prefix = Canonical(canonicalize, common_prefix);
4947 if (new_prefix != common_prefix) break;
4948 }
4905 prefix_length = Min(prefix_length, atom->length()); 4949 prefix_length = Min(prefix_length, atom->length());
4906 i++; 4950 i++;
4907 } 4951 }
4908 if (i > first_with_prefix + 2) { 4952 if (i > first_with_prefix + 2) {
4909 // Found worthwhile run of alternatives with common prefix of at least one 4953 // Found worthwhile run of alternatives with common prefix of at least one
4910 // character. The sorting function above did not sort on more than one 4954 // character. The sorting function above did not sort on more than one
4911 // character for reasons of correctness, but there may still be a longer 4955 // character for reasons of correctness, but there may still be a longer
4912 // common prefix if the terms were similar or presorted in the input. 4956 // common prefix if the terms were similar or presorted in the input.
4913 // Find out how long the common prefix is. 4957 // Find out how long the common prefix is.
4914 int run_length = i - first_with_prefix; 4958 int run_length = i - first_with_prefix;
4915 atom = alternatives->at(first_with_prefix)->AsAtom(); 4959 atom = alternatives->at(first_with_prefix)->AsAtom();
4916 for (int j = 1; j < run_length && prefix_length > 1; j++) { 4960 for (int j = 1; j < run_length && prefix_length > 1; j++) {
4917 RegExpAtom* old_atom = 4961 RegExpAtom* old_atom =
4918 alternatives->at(j + first_with_prefix)->AsAtom(); 4962 alternatives->at(j + first_with_prefix)->AsAtom();
4919 for (int k = 1; k < prefix_length; k++) { 4963 for (int k = 1; k < prefix_length; k++) {
4920 if (atom->data().at(k) != old_atom->data().at(k)) prefix_length = k; 4964 if (atom->data().at(k) != old_atom->data().at(k)) {
4965 prefix_length = k;
4966 break;
4967 }
4921 } 4968 }
4922 } 4969 }
4923 RegExpAtom* prefix = 4970 RegExpAtom* prefix =
4924 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length)); 4971 new (zone) RegExpAtom(atom->data().SubVector(0, prefix_length));
4925 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone); 4972 ZoneList<RegExpTree*>* pair = new (zone) ZoneList<RegExpTree*>(2, zone);
4926 pair->Add(prefix, zone); 4973 pair->Add(prefix, zone);
4927 ZoneList<RegExpTree*>* suffixes = 4974 ZoneList<RegExpTree*>* suffixes =
4928 new (zone) ZoneList<RegExpTree*>(run_length, zone); 4975 new (zone) ZoneList<RegExpTree*>(run_length, zone);
4929 for (int j = 0; j < run_length; j++) { 4976 for (int j = 0; j < run_length; j++) {
4930 RegExpAtom* old_atom = 4977 RegExpAtom* old_atom =
(...skipping 1424 matching lines...) Expand 10 before | Expand all | Expand 10 after
6355 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize; 6402 bool too_much = pattern->length() > RegExpImpl::kRegExpTooLargeToOptimize;
6356 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit && 6403 if (heap->total_regexp_code_generated() > RegExpImpl::kRegExpCompiledLimit &&
6357 heap->isolate()->memory_allocator()->SizeExecutable() > 6404 heap->isolate()->memory_allocator()->SizeExecutable() >
6358 RegExpImpl::kRegExpExecutableMemoryLimit) { 6405 RegExpImpl::kRegExpExecutableMemoryLimit) {
6359 too_much = true; 6406 too_much = true;
6360 } 6407 }
6361 return too_much; 6408 return too_much;
6362 } 6409 }
6363 } // namespace internal 6410 } // namespace internal
6364 } // namespace v8 6411 } // namespace v8
OLDNEW
« no previous file with comments | « src/heap-snapshot-generator.cc ('k') | src/list.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698