Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(531)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 2813893002: [regexp] Consider surrogate pairs when optimizing disjunctions (Closed)
Patch Set: DCHECK(!IsLeadSurrogate) Created 3 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include <memory> 7 #include <memory>
8 8
9 #include "src/base/platform/platform.h" 9 #include "src/base/platform/platform.h"
10 #include "src/compilation-cache.h" 10 #include "src/compilation-cache.h"
(...skipping 3309 matching lines...) Expand 10 before | Expand all | Expand 10 after
3320 } 3320 }
3321 } 3321 }
3322 3322
3323 3323
3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, 3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,
3325 ZoneList<CharacterRange>* ranges, 3325 ZoneList<CharacterRange>* ranges,
3326 bool read_backward, 3326 bool read_backward,
3327 RegExpNode* on_success) { 3327 RegExpNode* on_success) {
3328 DCHECK_NOT_NULL(ranges); 3328 DCHECK_NOT_NULL(ranges);
3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); 3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);
3330 elms->Add( 3330 elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),
3331 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)), 3331 zone);
3332 zone);
3333 return new (zone) TextNode(elms, read_backward, on_success); 3332 return new (zone) TextNode(elms, read_backward, on_success);
3334 } 3333 }
3335 3334
3336 3335
3337 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, 3336 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,
3338 CharacterRange trail, 3337 CharacterRange trail,
3339 bool read_backward, 3338 bool read_backward,
3340 RegExpNode* on_success) { 3339 RegExpNode* on_success) {
3341 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); 3340 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);
3342 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); 3341 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);
3343 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); 3342 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);
3344 elms->Add(TextElement::CharClass( 3343 elms->Add(
3345 new (zone) RegExpCharacterClass(lead_ranges, false)), 3344 TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),
3346 zone); 3345 zone);
3347 elms->Add(TextElement::CharClass( 3346 elms->Add(
3348 new (zone) RegExpCharacterClass(trail_ranges, false)), 3347 TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),
3349 zone); 3348 zone);
3350 return new (zone) TextNode(elms, read_backward, on_success); 3349 return new (zone) TextNode(elms, read_backward, on_success);
3351 } 3350 }
3352 3351
3353 3352
3354 // This generates the code to match a text node. A text node can contain 3353 // This generates the code to match a text node. A text node can contain
3355 // straight character sequences (possibly to be matched in a case-independent 3354 // straight character sequences (possibly to be matched in a case-independent
3356 // way) and character classes. For efficiency we do not do this in a single 3355 // way) and character classes. For efficiency we do not do this in a single
3357 // pass from left to right. Instead we pass over the text node several times, 3356 // pass from left to right. Instead we pass over the text node several times,
3358 // emitting code for some character positions every time. See the comment on 3357 // emitting code for some character positions every time. See the comment on
3359 // TextEmitPass for details. 3358 // TextEmitPass for details.
(...skipping 1484 matching lines...) Expand 10 before | Expand all | Expand 10 after
4844 return false; 4843 return false;
4845 } 4844 }
4846 } 4845 }
4847 return true; 4846 return true;
4848 } 4847 }
4849 4848
4850 4849
4851 bool RegExpCharacterClass::is_standard(Zone* zone) { 4850 bool RegExpCharacterClass::is_standard(Zone* zone) {
4852 // TODO(lrn): Remove need for this function, by not throwing away information 4851 // TODO(lrn): Remove need for this function, by not throwing away information
4853 // along the way. 4852 // along the way.
4854 if (is_negated_) { 4853 if (is_negated()) {
4855 return false; 4854 return false;
4856 } 4855 }
4857 if (set_.is_standard()) { 4856 if (set_.is_standard()) {
4858 return true; 4857 return true;
4859 } 4858 }
4860 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4859 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
4861 set_.set_standard_set_type('s'); 4860 set_.set_standard_set_type('s');
4862 return true; 4861 return true;
4863 } 4862 }
4864 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { 4863 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after
5137 5136
5138 5137
5139 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5138 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5140 RegExpNode* on_success) { 5139 RegExpNode* on_success) {
5141 set_.Canonicalize(); 5140 set_.Canonicalize();
5142 Zone* zone = compiler->zone(); 5141 Zone* zone = compiler->zone();
5143 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5142 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5144 if (compiler->needs_unicode_case_equivalents()) { 5143 if (compiler->needs_unicode_case_equivalents()) {
5145 AddUnicodeCaseEquivalents(ranges, zone); 5144 AddUnicodeCaseEquivalents(ranges, zone);
5146 } 5145 }
5147 if (compiler->unicode() && !compiler->one_byte()) { 5146 if (compiler->unicode() && !compiler->one_byte() &&
5147 !contains_split_surrogate()) {
5148 if (is_negated()) { 5148 if (is_negated()) {
5149 ZoneList<CharacterRange>* negated = 5149 ZoneList<CharacterRange>* negated =
5150 new (zone) ZoneList<CharacterRange>(2, zone); 5150 new (zone) ZoneList<CharacterRange>(2, zone);
5151 CharacterRange::Negate(ranges, negated, zone); 5151 CharacterRange::Negate(ranges, negated, zone);
5152 ranges = negated; 5152 ranges = negated;
5153 } 5153 }
5154 if (ranges->length() == 0) { 5154 if (ranges->length() == 0) {
5155 ranges->Add(CharacterRange::Everything(), zone); 5155 ranges->Add(CharacterRange::Everything(), zone);
5156 RegExpCharacterClass* fail = 5156 RegExpCharacterClass* fail =
5157 new (zone) RegExpCharacterClass(ranges, true); 5157 new (zone) RegExpCharacterClass(ranges, NEGATED);
5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success); 5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success);
5159 } 5159 }
5160 if (standard_type() == '*') { 5160 if (standard_type() == '*') {
5161 return UnanchoredAdvance(compiler, on_success); 5161 return UnanchoredAdvance(compiler, on_success);
5162 } else { 5162 } else {
5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone); 5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5164 UnicodeRangeSplitter splitter(zone, ranges); 5164 UnicodeRangeSplitter splitter(zone, ranges);
5165 AddBmpCharacters(compiler, result, on_success, &splitter); 5165 AddBmpCharacters(compiler, result, on_success, &splitter);
5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after
5361 alternatives->at(write_posn++) = alternatives->at(i); 5361 alternatives->at(write_posn++) = alternatives->at(i);
5362 i++; 5362 i++;
5363 continue; 5363 continue;
5364 } 5364 }
5365 RegExpAtom* atom = alternative->AsAtom(); 5365 RegExpAtom* atom = alternative->AsAtom();
5366 if (atom->length() != 1) { 5366 if (atom->length() != 1) {
5367 alternatives->at(write_posn++) = alternatives->at(i); 5367 alternatives->at(write_posn++) = alternatives->at(i);
5368 i++; 5368 i++;
5369 continue; 5369 continue;
5370 } 5370 }
5371 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5372 bool contains_trail_surrogate =
5373 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5371 int first_in_run = i; 5374 int first_in_run = i;
5372 i++; 5375 i++;
5373 while (i < length) { 5376 while (i < length) {
5374 alternative = alternatives->at(i); 5377 alternative = alternatives->at(i);
5375 if (!alternative->IsAtom()) break; 5378 if (!alternative->IsAtom()) break;
5376 atom = alternative->AsAtom(); 5379 atom = alternative->AsAtom();
5377 if (atom->length() != 1) break; 5380 if (atom->length() != 1) break;
5381 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));
5382 contains_trail_surrogate |=
5383 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));
5378 i++; 5384 i++;
5379 } 5385 }
5380 if (i > first_in_run + 1) { 5386 if (i > first_in_run + 1) {
5381 // Found non-trivial run of single-character alternatives. 5387 // Found non-trivial run of single-character alternatives.
5382 int run_length = i - first_in_run; 5388 int run_length = i - first_in_run;
5383 ZoneList<CharacterRange>* ranges = 5389 ZoneList<CharacterRange>* ranges =
5384 new (zone) ZoneList<CharacterRange>(2, zone); 5390 new (zone) ZoneList<CharacterRange>(2, zone);
5385 for (int j = 0; j < run_length; j++) { 5391 for (int j = 0; j < run_length; j++) {
5386 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); 5392 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();
5387 DCHECK_EQ(old_atom->length(), 1); 5393 DCHECK_EQ(old_atom->length(), 1);
5388 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); 5394 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);
5389 } 5395 }
5396 RegExpCharacterClass::Flags flags;
5397 if (compiler->unicode() && contains_trail_surrogate) {
5398 flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;
5399 }
5390 alternatives->at(write_posn++) = 5400 alternatives->at(write_posn++) =
5391 new (zone) RegExpCharacterClass(ranges, false); 5401 new (zone) RegExpCharacterClass(ranges, flags);
5392 } else { 5402 } else {
5393 // Just copy any trivial alternatives. 5403 // Just copy any trivial alternatives.
5394 for (int j = first_in_run; j < i; j++) { 5404 for (int j = first_in_run; j < i; j++) {
5395 alternatives->at(write_posn++) = alternatives->at(j); 5405 alternatives->at(write_posn++) = alternatives->at(j);
5396 } 5406 }
5397 } 5407 }
5398 } 5408 }
5399 alternatives->Rewind(write_posn); // Trim end of array. 5409 alternatives->Rewind(write_posn); // Trim end of array.
5400 } 5410 }
5401 5411
(...skipping 1520 matching lines...) Expand 10 before | Expand all | Expand 10 after
6922 6932
6923 6933
6924 void RegExpResultsCache::Clear(FixedArray* cache) { 6934 void RegExpResultsCache::Clear(FixedArray* cache) {
6925 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6935 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6926 cache->set(i, Smi::kZero); 6936 cache->set(i, Smi::kZero);
6927 } 6937 }
6928 } 6938 }
6929 6939
6930 } // namespace internal 6940 } // namespace internal
6931 } // namespace v8 6941 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698