OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/jsregexp.h" | 5 #include "src/regexp/jsregexp.h" |
6 | 6 |
7 #include <memory> | 7 #include <memory> |
8 | 8 |
9 #include "src/base/platform/platform.h" | 9 #include "src/base/platform/platform.h" |
10 #include "src/compilation-cache.h" | 10 #include "src/compilation-cache.h" |
(...skipping 3309 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
3320 } | 3320 } |
3321 } | 3321 } |
3322 | 3322 |
3323 | 3323 |
3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, | 3324 TextNode* TextNode::CreateForCharacterRanges(Zone* zone, |
3325 ZoneList<CharacterRange>* ranges, | 3325 ZoneList<CharacterRange>* ranges, |
3326 bool read_backward, | 3326 bool read_backward, |
3327 RegExpNode* on_success) { | 3327 RegExpNode* on_success) { |
3328 DCHECK_NOT_NULL(ranges); | 3328 DCHECK_NOT_NULL(ranges); |
3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); | 3329 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(1, zone); |
3330 elms->Add( | 3330 elms->Add(TextElement::CharClass(new (zone) RegExpCharacterClass(ranges)), |
3331 TextElement::CharClass(new (zone) RegExpCharacterClass(ranges, false)), | 3331 zone); |
3332 zone); | |
3333 return new (zone) TextNode(elms, read_backward, on_success); | 3332 return new (zone) TextNode(elms, read_backward, on_success); |
3334 } | 3333 } |
3335 | 3334 |
3336 | 3335 |
3337 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, | 3336 TextNode* TextNode::CreateForSurrogatePair(Zone* zone, CharacterRange lead, |
3338 CharacterRange trail, | 3337 CharacterRange trail, |
3339 bool read_backward, | 3338 bool read_backward, |
3340 RegExpNode* on_success) { | 3339 RegExpNode* on_success) { |
3341 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); | 3340 ZoneList<CharacterRange>* lead_ranges = CharacterRange::List(zone, lead); |
3342 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); | 3341 ZoneList<CharacterRange>* trail_ranges = CharacterRange::List(zone, trail); |
3343 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); | 3342 ZoneList<TextElement>* elms = new (zone) ZoneList<TextElement>(2, zone); |
3344 elms->Add(TextElement::CharClass( | 3343 elms->Add( |
3345 new (zone) RegExpCharacterClass(lead_ranges, false)), | 3344 TextElement::CharClass(new (zone) RegExpCharacterClass(lead_ranges)), |
3346 zone); | 3345 zone); |
3347 elms->Add(TextElement::CharClass( | 3346 elms->Add( |
3348 new (zone) RegExpCharacterClass(trail_ranges, false)), | 3347 TextElement::CharClass(new (zone) RegExpCharacterClass(trail_ranges)), |
3349 zone); | 3348 zone); |
3350 return new (zone) TextNode(elms, read_backward, on_success); | 3349 return new (zone) TextNode(elms, read_backward, on_success); |
3351 } | 3350 } |
3352 | 3351 |
3353 | 3352 |
3354 // This generates the code to match a text node. A text node can contain | 3353 // This generates the code to match a text node. A text node can contain |
3355 // straight character sequences (possibly to be matched in a case-independent | 3354 // straight character sequences (possibly to be matched in a case-independent |
3356 // way) and character classes. For efficiency we do not do this in a single | 3355 // way) and character classes. For efficiency we do not do this in a single |
3357 // pass from left to right. Instead we pass over the text node several times, | 3356 // pass from left to right. Instead we pass over the text node several times, |
3358 // emitting code for some character positions every time. See the comment on | 3357 // emitting code for some character positions every time. See the comment on |
3359 // TextEmitPass for details. | 3358 // TextEmitPass for details. |
(...skipping 1484 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4844 return false; | 4843 return false; |
4845 } | 4844 } |
4846 } | 4845 } |
4847 return true; | 4846 return true; |
4848 } | 4847 } |
4849 | 4848 |
4850 | 4849 |
4851 bool RegExpCharacterClass::is_standard(Zone* zone) { | 4850 bool RegExpCharacterClass::is_standard(Zone* zone) { |
4852 // TODO(lrn): Remove need for this function, by not throwing away information | 4851 // TODO(lrn): Remove need for this function, by not throwing away information |
4853 // along the way. | 4852 // along the way. |
4854 if (is_negated_) { | 4853 if (is_negated()) { |
4855 return false; | 4854 return false; |
4856 } | 4855 } |
4857 if (set_.is_standard()) { | 4856 if (set_.is_standard()) { |
4858 return true; | 4857 return true; |
4859 } | 4858 } |
4860 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { | 4859 if (CompareRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { |
4861 set_.set_standard_set_type('s'); | 4860 set_.set_standard_set_type('s'); |
4862 return true; | 4861 return true; |
4863 } | 4862 } |
4864 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { | 4863 if (CompareInverseRanges(set_.ranges(zone), kSpaceRanges, kSpaceRangeCount)) { |
(...skipping 272 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5137 | 5136 |
5138 | 5137 |
5139 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, | 5138 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
5140 RegExpNode* on_success) { | 5139 RegExpNode* on_success) { |
5141 set_.Canonicalize(); | 5140 set_.Canonicalize(); |
5142 Zone* zone = compiler->zone(); | 5141 Zone* zone = compiler->zone(); |
5143 ZoneList<CharacterRange>* ranges = this->ranges(zone); | 5142 ZoneList<CharacterRange>* ranges = this->ranges(zone); |
5144 if (compiler->needs_unicode_case_equivalents()) { | 5143 if (compiler->needs_unicode_case_equivalents()) { |
5145 AddUnicodeCaseEquivalents(ranges, zone); | 5144 AddUnicodeCaseEquivalents(ranges, zone); |
5146 } | 5145 } |
5147 if (compiler->unicode() && !compiler->one_byte()) { | 5146 if (compiler->unicode() && !compiler->one_byte() && |
| 5147 !contains_split_surrogate()) { |
5148 if (is_negated()) { | 5148 if (is_negated()) { |
5149 ZoneList<CharacterRange>* negated = | 5149 ZoneList<CharacterRange>* negated = |
5150 new (zone) ZoneList<CharacterRange>(2, zone); | 5150 new (zone) ZoneList<CharacterRange>(2, zone); |
5151 CharacterRange::Negate(ranges, negated, zone); | 5151 CharacterRange::Negate(ranges, negated, zone); |
5152 ranges = negated; | 5152 ranges = negated; |
5153 } | 5153 } |
5154 if (ranges->length() == 0) { | 5154 if (ranges->length() == 0) { |
5155 ranges->Add(CharacterRange::Everything(), zone); | 5155 ranges->Add(CharacterRange::Everything(), zone); |
5156 RegExpCharacterClass* fail = | 5156 RegExpCharacterClass* fail = |
5157 new (zone) RegExpCharacterClass(ranges, true); | 5157 new (zone) RegExpCharacterClass(ranges, NEGATED); |
5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success); | 5158 return new (zone) TextNode(fail, compiler->read_backward(), on_success); |
5159 } | 5159 } |
5160 if (standard_type() == '*') { | 5160 if (standard_type() == '*') { |
5161 return UnanchoredAdvance(compiler, on_success); | 5161 return UnanchoredAdvance(compiler, on_success); |
5162 } else { | 5162 } else { |
5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone); | 5163 ChoiceNode* result = new (zone) ChoiceNode(2, zone); |
5164 UnicodeRangeSplitter splitter(zone, ranges); | 5164 UnicodeRangeSplitter splitter(zone, ranges); |
5165 AddBmpCharacters(compiler, result, on_success, &splitter); | 5165 AddBmpCharacters(compiler, result, on_success, &splitter); |
5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); | 5166 AddNonBmpSurrogatePairs(compiler, result, on_success, &splitter); |
5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); | 5167 AddLoneLeadSurrogates(compiler, result, on_success, &splitter); |
(...skipping 193 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5361 alternatives->at(write_posn++) = alternatives->at(i); | 5361 alternatives->at(write_posn++) = alternatives->at(i); |
5362 i++; | 5362 i++; |
5363 continue; | 5363 continue; |
5364 } | 5364 } |
5365 RegExpAtom* atom = alternative->AsAtom(); | 5365 RegExpAtom* atom = alternative->AsAtom(); |
5366 if (atom->length() != 1) { | 5366 if (atom->length() != 1) { |
5367 alternatives->at(write_posn++) = alternatives->at(i); | 5367 alternatives->at(write_posn++) = alternatives->at(i); |
5368 i++; | 5368 i++; |
5369 continue; | 5369 continue; |
5370 } | 5370 } |
| 5371 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); |
| 5372 bool contains_trail_surrogate = |
| 5373 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); |
5371 int first_in_run = i; | 5374 int first_in_run = i; |
5372 i++; | 5375 i++; |
5373 while (i < length) { | 5376 while (i < length) { |
5374 alternative = alternatives->at(i); | 5377 alternative = alternatives->at(i); |
5375 if (!alternative->IsAtom()) break; | 5378 if (!alternative->IsAtom()) break; |
5376 atom = alternative->AsAtom(); | 5379 atom = alternative->AsAtom(); |
5377 if (atom->length() != 1) break; | 5380 if (atom->length() != 1) break; |
| 5381 DCHECK(!unibrow::Utf16::IsLeadSurrogate(atom->data().at(0))); |
| 5382 contains_trail_surrogate |= |
| 5383 unibrow::Utf16::IsTrailSurrogate(atom->data().at(0)); |
5378 i++; | 5384 i++; |
5379 } | 5385 } |
5380 if (i > first_in_run + 1) { | 5386 if (i > first_in_run + 1) { |
5381 // Found non-trivial run of single-character alternatives. | 5387 // Found non-trivial run of single-character alternatives. |
5382 int run_length = i - first_in_run; | 5388 int run_length = i - first_in_run; |
5383 ZoneList<CharacterRange>* ranges = | 5389 ZoneList<CharacterRange>* ranges = |
5384 new (zone) ZoneList<CharacterRange>(2, zone); | 5390 new (zone) ZoneList<CharacterRange>(2, zone); |
5385 for (int j = 0; j < run_length; j++) { | 5391 for (int j = 0; j < run_length; j++) { |
5386 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); | 5392 RegExpAtom* old_atom = alternatives->at(j + first_in_run)->AsAtom(); |
5387 DCHECK_EQ(old_atom->length(), 1); | 5393 DCHECK_EQ(old_atom->length(), 1); |
5388 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); | 5394 ranges->Add(CharacterRange::Singleton(old_atom->data().at(0)), zone); |
5389 } | 5395 } |
| 5396 RegExpCharacterClass::Flags flags; |
| 5397 if (compiler->unicode() && contains_trail_surrogate) { |
| 5398 flags = RegExpCharacterClass::CONTAINS_SPLIT_SURROGATE; |
| 5399 } |
5390 alternatives->at(write_posn++) = | 5400 alternatives->at(write_posn++) = |
5391 new (zone) RegExpCharacterClass(ranges, false); | 5401 new (zone) RegExpCharacterClass(ranges, flags); |
5392 } else { | 5402 } else { |
5393 // Just copy any trivial alternatives. | 5403 // Just copy any trivial alternatives. |
5394 for (int j = first_in_run; j < i; j++) { | 5404 for (int j = first_in_run; j < i; j++) { |
5395 alternatives->at(write_posn++) = alternatives->at(j); | 5405 alternatives->at(write_posn++) = alternatives->at(j); |
5396 } | 5406 } |
5397 } | 5407 } |
5398 } | 5408 } |
5399 alternatives->Rewind(write_posn); // Trim end of array. | 5409 alternatives->Rewind(write_posn); // Trim end of array. |
5400 } | 5410 } |
5401 | 5411 |
(...skipping 1520 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6922 | 6932 |
6923 | 6933 |
6924 void RegExpResultsCache::Clear(FixedArray* cache) { | 6934 void RegExpResultsCache::Clear(FixedArray* cache) { |
6925 for (int i = 0; i < kRegExpResultsCacheSize; i++) { | 6935 for (int i = 0; i < kRegExpResultsCacheSize; i++) { |
6926 cache->set(i, Smi::kZero); | 6936 cache->set(i, Smi::kZero); |
6927 } | 6937 } |
6928 } | 6938 } |
6929 | 6939 |
6930 } // namespace internal | 6940 } // namespace internal |
6931 } // namespace v8 | 6941 } // namespace v8 |
OLD | NEW |