src/regexp/jsregexp.cc - Issue 2813893002: [regexp] Consider surrogate pairs when optimizing disjunctions

Side by Side Diff: src/regexp/jsregexp.cc

Issue 2813893002: [regexp] Consider surrogate pairs when optimizing disjunctions (Closed)

Patch Set: DCHECK(!IsLeadSurrogate) Created 3 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include <memory>	7 #include <memory>

8	8

9 #include "src/base/platform/platform.h"	9 #include "src/base/platform/platform.h"

10 #include "src/compilation-cache.h"	10 #include "src/compilation-cache.h"

(...skipping 3309 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
3320 }	3320 }

3321 }	3321 }

3322	3322

3323	3323

3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,	3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone,

3325 ZoneList<CharacterRange>* ranges,	3325 ZoneList<CharacterRange>* ranges,

3326 bool read_backward,	3326 bool read_backward,

3327 RegExpNode* on_success) {	3327 RegExpNode* on_success) {

3328 DCHECK_NOT_NULL(ranges);	3328 DCHECK_NOT_NULL(ranges);

3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);	3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone);

3330 elms->Add(	3330 elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)),

3331 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)),	3331 zone);

3332 zone);

3333 return new (zone) TextNode(elms, read_backward, on_success);	3332 return new (zone) TextNode(elms, read_backward, on_success);

3334 }	3333 }

3335	3334

3336	3335

3337 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,	3336 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead,

3338 CharacterRange trail,	3337 CharacterRange trail,

3339 bool read_backward,	3338 bool read_backward,

3340 RegExpNode* on_success) {	3339 RegExpNode* on_success) {

3341 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);	3340 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead);

3342 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);	3341 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail);

3343 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);	3342 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone);

3344 elms->Add(TextElement::CharClass(	3343 elms->Add(

3345 new (zone) RegExpCharacterClass(lead_ranges, false)),	3344 TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)),

3346 zone);	3345 zone);

3347 elms->Add(TextElement::CharClass(	3346 elms->Add(

3348 new (zone) RegExpCharacterClass(trail_ranges, false)),	3347 TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)),

3349 zone);	3348 zone);

3350 return new (zone) TextNode(elms, read_backward, on_success);	3349 return new (zone) TextNode(elms, read_backward, on_success);

3351 }	3350 }

3352	3351

3353	3352

3354 // This generates the code to match a text node. A text node can contain	3353 // This generates the code to match a text node. A text node can contain

3355 // straight character sequences (possibly to be matched in a case-independent	3354 // straight character sequences (possibly to be matched in a case-independent

3356 // way) and character classes. For efficiency we do not do this in a single	3355 // way) and character classes. For efficiency we do not do this in a single

3357 // pass from left to right. Instead we pass over the text node several times,	3356 // pass from left to right. Instead we pass over the text node several times,

3358 // emitting code for some character positions every time. See the comment on	3357 // emitting code for some character positions every time. See the comment on

3359 // TextEmitPass for details.	3358 // TextEmitPass for details.

(...skipping 1484 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
4844 return false;	4843 return false;

4845 }	4844 }

4846 }	4845 }

4847 return true;	4846 return true;

4848 }	4847 }

4849	4848

4850	4849

4851 bool RegExpCharacterClass::is_standard(Zone* zone) {	4850 bool RegExpCharacterClass::is_standard(Zone* zone) {

4852 // TODO(lrn): Remove need for this function, by not throwing away information	4851 // TODO(lrn): Remove need for this function, by not throwing away information

4853 // along the way.	4852 // along the way.

4854 if (is_negated_) {	4853 if (is_negated()) {

4855 return false;	4854 return false;

4856 }	4855 }

4857 if (set_.is_standard()) {	4856 if (set_.is_standard()) {

4858 return true;	4857 return true;

4859 }	4858 }

4860 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {	4859 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {

4861 set_.set_standard_set_type('s');	4860 set_.set_standard_set_type('s');

4862 return true;	4861 return true;

4863 }	4862 }

4864 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {	4863 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) {

(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
5137	5136

5138	5137

5139 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	5138 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

5140 RegExpNode* on_success) {	5139 RegExpNode* on_success) {

5141 set_.Canonicalize();	5140 set_.Canonicalize();

5142 Zone* zone = compiler->zone();	5141 Zone* zone = compiler->zone();

5143 ZoneList<CharacterRange>* ranges = this->ranges(zone);	5142 ZoneList<CharacterRange>* ranges = this->ranges(zone);

5144 if (compiler->needs_unicode_case_equivalents()) {	5143 if (compiler->needs_unicode_case_equivalents()) {

5145 AddUnicodeCaseEquivalents(ranges, zone);	5144 AddUnicodeCaseEquivalents(ranges, zone);

5146 }	5145 }

5147 if (compiler->unicode() && !compiler->one_byte()) {	5146 if (compiler->unicode() && !compiler->one_byte() &&

	5147 !contains_split_surrogate()) {

5148 if (is_negated()) {	5148 if (is_negated()) {

5149 ZoneList<CharacterRange>* negated =	5149 ZoneList<CharacterRange>* negated =

5150 new (zone) ZoneList<CharacterRange>(2, zone);	5150 new (zone) ZoneList<CharacterRange>(2, zone);

5151 CharacterRange::Negate(ranges, negated, zone);	5151 CharacterRange::Negate(ranges, negated, zone);

5152 ranges = negated;	5152 ranges = negated;

5153 }	5153 }

5154 if (ranges->length() == 0) {	5154 if (ranges->length() == 0) {

5155 ranges->Add(CharacterRange::Everything(), zone);	5155 ranges->Add(CharacterRange::Everything(), zone);

5156 RegExpCharacterClass* fail =	5156 RegExpCharacterClass* fail =

5157 new (zone) RegExpCharacterClass(ranges, true);	5157 new (zone) RegExpCharacterClass(ranges, NEGATED);

5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success);	5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success);

5159 }	5159 }

5160 if (standard_type() == '*') {	5160 if (standard_type() == '*') {

5161 return UnanchoredAdvance(compiler, on_success);	5161 return UnanchoredAdvance(compiler, on_success);

5162 } else {	5162 } else {

5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone);	5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone);

5164 UnicodeRangeSplitter splitter(zone, ranges);	5164 UnicodeRangeSplitter splitter(zone, ranges);

5165 AddBmpCharacters(compiler, result, on_success, &splitter);	5165 AddBmpCharacters(compiler, result, on_success, &splitter);

5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);	5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);

5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);	5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);

(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
5361 alternatives->at(write_posn++) = alternatives->at(i);	5361 alternatives->at(write_posn++) = alternatives->at(i);

5362 i++;	5362 i++;

5363 continue;	5363 continue;

5364 }	5364 }

5365 RegExpAtom* atom = alternative->AsAtom();	5365 RegExpAtom* atom = alternative->AsAtom();

5366 if (atom->length() != 1) {	5366 if (atom->length() != 1) {

5367 alternatives->at(write_posn++) = alternatives->at(i);	5367 alternatives->at(write_posn++) = alternatives->at(i);

5368 i++;	5368 i++;

5369 continue;	5369 continue;

5370 }	5370 }

	5371 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));

	5372 bool contains_trail_surrogate =

	5373 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));

5371 int first_in_run = i;	5374 int first_in_run = i;

5372 i++;	5375 i++;

5373 while (i < length) {	5376 while (i < length) {

5374 alternative = alternatives->at(i);	5377 alternative = alternatives->at(i);

5375 if (!alternative->IsAtom()) break;	5378 if (!alternative->IsAtom()) break;

5376 atom = alternative->AsAtom();	5379 atom = alternative->AsAtom();

5377 if (atom->length() != 1) break;	5380 if (atom->length() != 1) break;

	5381 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0)));

	5382 contains_trail_surrogate |=

	5383 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0));

5378 i++;	5384 i++;

5379 }	5385 }

5380 if (i > first_in_run + 1) {	5386 if (i > first_in_run + 1) {

5381 // Found non-trivial run of single-character alternatives.	5387 // Found non-trivial run of single-character alternatives.

5382 int run_length = i - first_in_run;	5388 int run_length = i - first_in_run;

5383 ZoneList<CharacterRange>* ranges =	5389 ZoneList<CharacterRange>* ranges =

5384 new (zone) ZoneList<CharacterRange>(2, zone);	5390 new (zone) ZoneList<CharacterRange>(2, zone);

5385 for (int j = 0; j < run_length; j++) {	5391 for (int j = 0; j < run_length; j++) {

5386 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();	5392 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom();

5387 DCHECK_EQ(old_atom->length(), 1);	5393 DCHECK_EQ(old_atom->length(), 1);

5388 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);	5394 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone);

5389 }	5395 }

	5396 RegExpCharacterClass::Flags flags;

	5397 if (compiler->unicode() && contains_trail_surrogate) {

	5398 flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE;

	5399 }

5390 alternatives->at(write_posn++) =	5400 alternatives->at(write_posn++) =

5391 new (zone) RegExpCharacterClass(ranges, false);	5401 new (zone) RegExpCharacterClass(ranges, flags);

5392 } else {	5402 } else {

5393 // Just copy any trivial alternatives.	5403 // Just copy any trivial alternatives.

5394 for (int j = first_in_run; j < i; j++) {	5404 for (int j = first_in_run; j < i; j++) {

5395 alternatives->at(write_posn++) = alternatives->at(j);	5405 alternatives->at(write_posn++) = alternatives->at(j);

5396 }	5406 }

5397 }	5407 }

5398 }	5408 }

5399 alternatives->Rewind(write_posn); // Trim end of array.	5409 alternatives->Rewind(write_posn); // Trim end of array.

5400 }	5410 }

5401	5411

(...skipping 1520 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading...
6922	6932

6923	6933

6924 void RegExpResultsCache::Clear(FixedArray* cache) {	6934 void RegExpResultsCache::Clear(FixedArray* cache) {

6925 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6935 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6926 cache->set(i, Smi::kZero);	6936 cache->set(i, Smi::kZero);

6927 }	6937 }

6928 }	6938 }

6929	6939

6930 } // namespace internal	6940 } // namespace internal

6931 } // namespace v8	6941 } // namespace v8

OLD	NEW

« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »