Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(14)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1608693003: [regexp] step back if starting unicode regexp within surrogate pair. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: fix Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
(...skipping 3939 matching lines...) Expand 10 before | Expand all | Expand 10 after
3950 3950
3951 state->preload_is_current_ = 3951 state->preload_is_current_ =
3952 (current_trace->characters_preloaded() == state->preload_characters_); 3952 (current_trace->characters_preloaded() == state->preload_characters_);
3953 state->preload_has_checked_bounds_ = state->preload_is_current_; 3953 state->preload_has_checked_bounds_ = state->preload_is_current_;
3954 } 3954 }
3955 3955
3956 3956
3957 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) { 3957 void ChoiceNode::Emit(RegExpCompiler* compiler, Trace* trace) {
3958 int choice_count = alternatives_->length(); 3958 int choice_count = alternatives_->length();
3959 3959
3960 if (choice_count == 1 && alternatives_->at(0).guards() == NULL) {
3961 alternatives_->at(0).node()->Emit(compiler, trace);
3962 return;
3963 }
3964
3960 AssertGuardsMentionRegisters(trace); 3965 AssertGuardsMentionRegisters(trace);
3961 3966
3962 LimitResult limit_result = LimitVersions(compiler, trace); 3967 LimitResult limit_result = LimitVersions(compiler, trace);
3963 if (limit_result == DONE) return; 3968 if (limit_result == DONE) return;
3964 DCHECK(limit_result == CONTINUE); 3969 DCHECK(limit_result == CONTINUE);
3965 3970
3966 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for 3971 // For loop nodes we already flushed (see LoopChoiceNode::Emit), but for
3967 // other choice nodes we only flush if we are out of code size budget. 3972 // other choice nodes we only flush if we are out of code size budget.
3968 if (trace->flush_budget() == 0 && trace->actions() != NULL) { 3973 if (trace->flush_budget() == 0 && trace->actions() != NULL) {
3969 trace->Flush(compiler, this); 3974 trace->Flush(compiler, this);
(...skipping 1063 matching lines...) Expand 10 before | Expand all | Expand 10 after
5033 } 5038 }
5034 5039
5035 5040
5036 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5041 void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5037 RegExpNode* on_success, 5042 RegExpNode* on_success,
5038 UnicodeRangeSplitter* splitter) { 5043 UnicodeRangeSplitter* splitter) {
5039 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates(); 5044 ZoneList<CharacterRange>* lead_surrogates = splitter->lead_surrogates();
5040 if (lead_surrogates == nullptr) return; 5045 if (lead_surrogates == nullptr) return;
5041 Zone* zone = compiler->zone(); 5046 Zone* zone = compiler->zone();
5042 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]). 5047 // E.g. \ud801 becomes \ud801(?![\udc00-\udfff]).
5043 ZoneList<CharacterRange>* trail_surrogates = 5048 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5044 new (zone) ZoneList<CharacterRange>(1, zone); 5049 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5045 trail_surrogates->Add(
5046 CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd), zone);
5047 5050
5048 RegExpNode* match = 5051 RegExpNode* match;
5049 compiler->read_backward() 5052 if (compiler->read_backward()) {
5050 // Reading backward. Assert that reading forward, there is no trail 5053 // Reading backward. Assert that reading forward, there is no trail
5051 // surrogate, and then backward match the lead surrogate. 5054 // surrogate, and then backward match the lead surrogate.
5052 ? NegativeLookaroundAgainstReadDirectionAndMatch( 5055 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5053 compiler, trail_surrogates, lead_surrogates, on_success, true) 5056 compiler, trail_surrogates, lead_surrogates, on_success, true);
5054 // Reading forward. Forwrad match the lead surrogate and assert that 5057 } else {
5055 // no 5058 // Reading forward. Forward match the lead surrogate and assert that
5056 // trail surrogate follows. 5059 // no trail surrogate follows.
5057 : MatchAndNegativeLookaroundInReadDirection( 5060 match = MatchAndNegativeLookaroundInReadDirection(
5058 compiler, lead_surrogates, trail_surrogates, on_success, false); 5061 compiler, lead_surrogates, trail_surrogates, on_success, false);
5062 }
5059 result->AddAlternative(GuardedAlternative(match)); 5063 result->AddAlternative(GuardedAlternative(match));
5060 } 5064 }
5061 5065
5062 5066
5063 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result, 5067 void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,
5064 RegExpNode* on_success, 5068 RegExpNode* on_success,
5065 UnicodeRangeSplitter* splitter) { 5069 UnicodeRangeSplitter* splitter) {
5066 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates(); 5070 ZoneList<CharacterRange>* trail_surrogates = splitter->trail_surrogates();
5067 if (trail_surrogates == nullptr) return; 5071 if (trail_surrogates == nullptr) return;
5068 Zone* zone = compiler->zone(); 5072 Zone* zone = compiler->zone();
5069 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01 5073 // E.g. \udc01 becomes (?<![\ud800-\udbff])\udc01
5070 ZoneList<CharacterRange>* lead_surrogates = 5074 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5071 new (zone) ZoneList<CharacterRange>(1, zone); 5075 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5072 lead_surrogates->Add(
5073 CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd), zone);
5074 5076
5075 RegExpNode* match = 5077 RegExpNode* match;
5076 compiler->read_backward() 5078 if (compiler->read_backward()) {
5077 // Reading backward. Backward match the trail surrogate and assert 5079 // Reading backward. Backward match the trail surrogate and assert that no
5078 // that no lead surrogate precedes it. 5080 // lead surrogate precedes it.
5079 ? MatchAndNegativeLookaroundInReadDirection( 5081 match = MatchAndNegativeLookaroundInReadDirection(
5080 compiler, trail_surrogates, lead_surrogates, on_success, true) 5082 compiler, trail_surrogates, lead_surrogates, on_success, true);
5081 // Reading forward. Assert that reading backward, there is no lead 5083 } else {
5082 // surrogate, and then forward match the trail surrogate. 5084 // Reading forward. Assert that reading backward, there is no lead
5083 : NegativeLookaroundAgainstReadDirectionAndMatch( 5085 // surrogate, and then forward match the trail surrogate.
5084 compiler, lead_surrogates, trail_surrogates, on_success, false); 5086 match = NegativeLookaroundAgainstReadDirectionAndMatch(
5087 compiler, lead_surrogates, trail_surrogates, on_success, false);
5088 }
5085 result->AddAlternative(GuardedAlternative(match)); 5089 result->AddAlternative(GuardedAlternative(match));
5086 } 5090 }
5087 5091
5088 5092
5093 void AddUnanchoredAdvance(RegExpCompiler* compiler, ChoiceNode* result,
5094 RegExpNode* on_success) {
5095 // This implements ES2015 21.2.5.2.3, AdvanceStringIndex.
5096 DCHECK(!compiler->read_backward());
5097 Zone* zone = compiler->zone();
5098 // Advancing can either consume a BMP character or a trail surrogate.
5099 ZoneList<CharacterRange>* bmp_and_trail =
5100 new (zone) ZoneList<CharacterRange>(2, zone);
5101 bmp_and_trail->Add(CharacterRange::Range(0, kLeadSurrogateStart - 1), zone);
5102 bmp_and_trail->Add(
5103 CharacterRange::Range(kLeadSurrogateEnd + 1, kNonBmpStart - 1), zone);
5104 result->AddAlternative(GuardedAlternative(TextNode::CreateForCharacterRanges(
5105 zone, bmp_and_trail, false, on_success)));
5106
5107 // Or it could consume a lead optionally followed by a trail surrogate.
5108 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
5109 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
5110 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
5111 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
5112 ChoiceNode* optional_trail = new (zone) ChoiceNode(2, zone);
5113 optional_trail->AddAlternative(
5114 GuardedAlternative(TextNode::CreateForCharacterRanges(
5115 zone, trail_surrogates, false, on_success)));
5116 optional_trail->AddAlternative(GuardedAlternative(on_success));
5117 RegExpNode* optional_pair = TextNode::CreateForCharacterRanges(
5118 zone, lead_surrogates, false, optional_trail);
5119 result->AddAlternative(GuardedAlternative(optional_pair));
5120 }
5121
5122
5089 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5123 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5090 RegExpNode* on_success) { 5124 RegExpNode* on_success) {
5091 set_.Canonicalize(); 5125 set_.Canonicalize();
5092 Zone* zone = compiler->zone(); 5126 Zone* zone = compiler->zone();
5093 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5127 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5094 if (compiler->unicode() && !compiler->one_byte()) { 5128 if (compiler->unicode() && !compiler->one_byte()) {
5095 if (is_negated()) { 5129 if (is_negated()) {
5096 ZoneList<CharacterRange>* negated = 5130 ZoneList<CharacterRange>* negated =
5097 new (zone) ZoneList<CharacterRange>(2, zone); 5131 new (zone) ZoneList<CharacterRange>(2, zone);
5098 CharacterRange::Negate(ranges, negated, zone); 5132 CharacterRange::Negate(ranges, negated, zone);
5099 ranges = negated; 5133 ranges = negated;
5100 } 5134 }
5101 if (ranges->length() == 0) { 5135 if (ranges->length() == 0) {
5102 // No matches possible. 5136 // No matches possible.
5103 return new (zone) EndNode(EndNode::BACKTRACK, zone); 5137 return new (zone) EndNode(EndNode::BACKTRACK, zone);
5104 } 5138 }
5105 UnicodeRangeSplitter splitter(zone, ranges); 5139 ChoiceNode* result = new (zone) ChoiceNode(2, zone);
5106 ChoiceNode* result = new (compiler->zone()) ChoiceNode(2, compiler->zone()); 5140 if (standard_type() == '*') {
5107 AddBmpCharacters(compiler, result, on_success, &splitter); 5141 AddUnanchoredAdvance(compiler, result, on_success);
5108 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); 5142 } else {
5109 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); 5143 UnicodeRangeSplitter splitter(zone, ranges);
5110 AddLoneTrailSurrogates(compiler, result, on_success, &splitter); 5144 AddBmpCharacters(compiler, result, on_success, &splitter);
5145 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter);
5146 AddLoneLeadSurrogates(compiler, result, on_success, &splitter);
5147 AddLoneTrailSurrogates(compiler, result, on_success, &splitter);
5148 }
5111 return result; 5149 return result;
5112 } else { 5150 } else {
5113 return new (zone) TextNode(this, compiler->read_backward(), on_success); 5151 return new (zone) TextNode(this, compiler->read_backward(), on_success);
5114 } 5152 }
5115 } 5153 }
5116 5154
5117 5155
5118 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) { 5156 int CompareFirstChar(RegExpTree* const* a, RegExpTree* const* b) {
5119 RegExpAtom* atom1 = (*a)->AsAtom(); 5157 RegExpAtom* atom1 = (*a)->AsAtom();
5120 RegExpAtom* atom2 = (*b)->AsAtom(); 5158 RegExpAtom* atom2 = (*b)->AsAtom();
(...skipping 1385 matching lines...) Expand 10 before | Expand all | Expand 10 after
6506 } 6544 }
6507 } 6545 }
6508 6546
6509 6547
6510 void DispatchTableConstructor::VisitAction(ActionNode* that) { 6548 void DispatchTableConstructor::VisitAction(ActionNode* that) {
6511 RegExpNode* target = that->on_success(); 6549 RegExpNode* target = that->on_success();
6512 target->Accept(this); 6550 target->Accept(this);
6513 } 6551 }
6514 6552
6515 6553
6554 RegExpNode* OptionallyStepBackToLeadSurrogate(RegExpCompiler* compiler,
6555 RegExpNode* on_success) {
6556 // If the regexp matching starts within a surrogate pair, step back
6557 // to the lead surrogate and start matching from there.
6558 DCHECK(!compiler->read_backward());
6559 Zone* zone = compiler->zone();
6560 ZoneList<CharacterRange>* lead_surrogates = CharacterRange::List(
6561 zone, CharacterRange::Range(kLeadSurrogateStart, kLeadSurrogateEnd));
6562 ZoneList<CharacterRange>* trail_surrogates = CharacterRange::List(
6563 zone, CharacterRange::Range(kTrailSurrogateStart, kTrailSurrogateEnd));
6564
6565 ChoiceNode* optional_step_back = new (zone) ChoiceNode(2, zone);
6566
6567 int stack_register = compiler->UnicodeLookaroundStackRegister();
6568 int position_register = compiler->UnicodeLookaroundPositionRegister();
6569 RegExpNode* step_back = TextNode::CreateForCharacterRanges(
6570 zone, lead_surrogates, true, on_success);
6571 RegExpLookaround::Builder builder(true, step_back, stack_register,
6572 position_register);
6573 RegExpNode* match_trail = TextNode::CreateForCharacterRanges(
6574 zone, trail_surrogates, false, builder.on_match_success());
6575
6576 optional_step_back->AddAlternative(
6577 GuardedAlternative(builder.ForMatch(match_trail)));
6578 optional_step_back->AddAlternative(GuardedAlternative(on_success));
6579
6580 return optional_step_back;
6581 }
6582
6583
6516 RegExpEngine::CompilationResult RegExpEngine::Compile( 6584 RegExpEngine::CompilationResult RegExpEngine::Compile(
6517 Isolate* isolate, Zone* zone, RegExpCompileData* data, 6585 Isolate* isolate, Zone* zone, RegExpCompileData* data,
6518 JSRegExp::Flags flags, Handle<String> pattern, 6586 JSRegExp::Flags flags, Handle<String> pattern,
6519 Handle<String> sample_subject, bool is_one_byte) { 6587 Handle<String> sample_subject, bool is_one_byte) {
6520 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) { 6588 if ((data->capture_count + 1) * 2 - 1 > RegExpMacroAssembler::kMaxRegister) {
6521 return IrregexpRegExpTooBig(isolate); 6589 return IrregexpRegExpTooBig(isolate);
6522 } 6590 }
6523 bool ignore_case = flags & JSRegExp::kIgnoreCase; 6591 bool ignore_case = flags & JSRegExp::kIgnoreCase;
6524 bool is_sticky = flags & JSRegExp::kSticky; 6592 bool is_sticky = flags & JSRegExp::kSticky;
6525 bool is_global = flags & JSRegExp::kGlobal; 6593 bool is_global = flags & JSRegExp::kGlobal;
(...skipping 42 matching lines...) Expand 10 before | Expand all | Expand 10 after
6568 node = loop_node; 6636 node = loop_node;
6569 } 6637 }
6570 } 6638 }
6571 if (is_one_byte) { 6639 if (is_one_byte) {
6572 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6640 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6573 // Do it again to propagate the new nodes to places where they were not 6641 // Do it again to propagate the new nodes to places where they were not
6574 // put because they had not been calculated yet. 6642 // put because they had not been calculated yet.
6575 if (node != NULL) { 6643 if (node != NULL) {
6576 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case); 6644 node = node->FilterOneByte(RegExpCompiler::kMaxRecursion, ignore_case);
6577 } 6645 }
6646 } else if (compiler.unicode() && (is_global || is_sticky)) {
6647 node = OptionallyStepBackToLeadSurrogate(&compiler, node);
6578 } 6648 }
6579 6649
6580 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone); 6650 if (node == NULL) node = new(zone) EndNode(EndNode::BACKTRACK, zone);
6581 data->node = node; 6651 data->node = node;
6582 Analysis analysis(isolate, ignore_case, is_one_byte); 6652 Analysis analysis(isolate, ignore_case, is_one_byte);
6583 analysis.EnsureAnalyzed(node); 6653 analysis.EnsureAnalyzed(node);
6584 if (analysis.has_failed()) { 6654 if (analysis.has_failed()) {
6585 const char* error_message = analysis.error_message(); 6655 const char* error_message = analysis.error_message();
6586 return CompilationResult(isolate, error_message); 6656 return CompilationResult(isolate, error_message);
6587 } 6657 }
(...skipping 171 matching lines...) Expand 10 before | Expand all | Expand 10 after
6759 6829
6760 6830
6761 void RegExpResultsCache::Clear(FixedArray* cache) { 6831 void RegExpResultsCache::Clear(FixedArray* cache) {
6762 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6832 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6763 cache->set(i, Smi::FromInt(0)); 6833 cache->set(i, Smi::FromInt(0));
6764 } 6834 }
6765 } 6835 }
6766 6836
6767 } // namespace internal 6837 } // namespace internal
6768 } // namespace v8 6838 } // namespace v8
OLDNEW
« no previous file with comments | « no previous file | src/regexp/regexp-ast.h » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698