OLD | NEW |
1 // Copyright 2012 the V8 project authors. All rights reserved. | 1 // Copyright 2012 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/jsregexp.h" | 5 #include "src/regexp/jsregexp.h" |
6 | 6 |
7 #include "src/ast/ast.h" | 7 #include "src/ast/ast.h" |
8 #include "src/base/platform/platform.h" | 8 #include "src/base/platform/platform.h" |
9 #include "src/compilation-cache.h" | 9 #include "src/compilation-cache.h" |
10 #include "src/compiler.h" | 10 #include "src/compiler.h" |
11 #include "src/execution.h" | 11 #include "src/execution.h" |
12 #include "src/factory.h" | 12 #include "src/factory.h" |
13 #include "src/isolate-inl.h" | 13 #include "src/isolate-inl.h" |
14 #include "src/messages.h" | 14 #include "src/messages.h" |
15 #include "src/ostreams.h" | 15 #include "src/ostreams.h" |
16 #include "src/regexp/interpreter-irregexp.h" | 16 #include "src/regexp/interpreter-irregexp.h" |
17 #include "src/regexp/jsregexp-inl.h" | 17 #include "src/regexp/jsregexp-inl.h" |
18 #include "src/regexp/regexp-macro-assembler.h" | 18 #include "src/regexp/regexp-macro-assembler.h" |
19 #include "src/regexp/regexp-macro-assembler-irregexp.h" | 19 #include "src/regexp/regexp-macro-assembler-irregexp.h" |
20 #include "src/regexp/regexp-macro-assembler-tracer.h" | 20 #include "src/regexp/regexp-macro-assembler-tracer.h" |
21 #include "src/regexp/regexp-parser.h" | 21 #include "src/regexp/regexp-parser.h" |
22 #include "src/regexp/regexp-stack.h" | 22 #include "src/regexp/regexp-stack.h" |
23 #include "src/runtime/runtime.h" | 23 #include "src/runtime/runtime.h" |
24 #include "src/splay-tree-inl.h" | 24 #include "src/splay-tree-inl.h" |
25 #include "src/string-search.h" | 25 #include "src/string-search.h" |
26 #include "src/unicode-decoder.h" | 26 #include "src/unicode-decoder.h" |
27 | 27 |
| 28 #ifdef V8_I18N_SUPPORT |
| 29 #include "unicode/uset.h" |
| 30 #include "unicode/utypes.h" |
| 31 #endif // V8_I18N_SUPPORT |
| 32 |
28 #ifndef V8_INTERPRETED_REGEXP | 33 #ifndef V8_INTERPRETED_REGEXP |
29 #if V8_TARGET_ARCH_IA32 | 34 #if V8_TARGET_ARCH_IA32 |
30 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" | 35 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" |
31 #elif V8_TARGET_ARCH_X64 | 36 #elif V8_TARGET_ARCH_X64 |
32 #include "src/regexp/x64/regexp-macro-assembler-x64.h" | 37 #include "src/regexp/x64/regexp-macro-assembler-x64.h" |
33 #elif V8_TARGET_ARCH_ARM64 | 38 #elif V8_TARGET_ARCH_ARM64 |
34 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" | 39 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" |
35 #elif V8_TARGET_ARCH_ARM | 40 #elif V8_TARGET_ARCH_ARM |
36 #include "src/regexp/arm/regexp-macro-assembler-arm.h" | 41 #include "src/regexp/arm/regexp-macro-assembler-arm.h" |
37 #elif V8_TARGET_ARCH_PPC | 42 #elif V8_TARGET_ARCH_PPC |
(...skipping 4345 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4383 } | 4388 } |
4384 | 4389 |
4385 LimitResult limit_result = LimitVersions(compiler, trace); | 4390 LimitResult limit_result = LimitVersions(compiler, trace); |
4386 if (limit_result == DONE) return; | 4391 if (limit_result == DONE) return; |
4387 DCHECK(limit_result == CONTINUE); | 4392 DCHECK(limit_result == CONTINUE); |
4388 | 4393 |
4389 RecursionCheck rc(compiler); | 4394 RecursionCheck rc(compiler); |
4390 | 4395 |
4391 DCHECK_EQ(start_reg_ + 1, end_reg_); | 4396 DCHECK_EQ(start_reg_ + 1, end_reg_); |
4392 if (compiler->ignore_case()) { | 4397 if (compiler->ignore_case()) { |
4393 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), | 4398 assembler->CheckNotBackReferenceIgnoreCase( |
4394 trace->backtrack()); | 4399 start_reg_, read_backward(), compiler->unicode(), trace->backtrack()); |
4395 } else { | 4400 } else { |
4396 assembler->CheckNotBackReference(start_reg_, read_backward(), | 4401 assembler->CheckNotBackReference(start_reg_, read_backward(), |
4397 trace->backtrack()); | 4402 trace->backtrack()); |
4398 } | 4403 } |
4399 // We are going to advance backward, so we may end up at the start. | 4404 // We are going to advance backward, so we may end up at the start. |
4400 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); | 4405 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); |
4401 on_success()->Emit(compiler, trace); | 4406 on_success()->Emit(compiler, trace); |
4402 } | 4407 } |
4403 | 4408 |
4404 | 4409 |
(...skipping 449 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
4854 return true; | 4859 return true; |
4855 } | 4860 } |
4856 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { | 4861 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { |
4857 set_.set_standard_set_type('W'); | 4862 set_.set_standard_set_type('W'); |
4858 return true; | 4863 return true; |
4859 } | 4864 } |
4860 return false; | 4865 return false; |
4861 } | 4866 } |
4862 | 4867 |
4863 | 4868 |
4864 bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) { | |
4865 ZoneList<CharacterRange>* ranges = this->ranges(zone); | |
4866 CharacterRange::Canonicalize(ranges); | |
4867 for (int i = ranges->length() - 1; i >= 0; i--) { | |
4868 uc32 from = ranges->at(i).from(); | |
4869 uc32 to = ranges->at(i).to(); | |
4870 // Check for non-BMP characters. | |
4871 if (to >= kNonBmpStart) return true; | |
4872 // Check for lone surrogates. | |
4873 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; | |
4874 } | |
4875 return false; | |
4876 } | |
4877 | |
4878 | |
4879 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, | 4869 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, |
4880 ZoneList<CharacterRange>* base) | 4870 ZoneList<CharacterRange>* base) |
4881 : zone_(zone), | 4871 : zone_(zone), |
4882 table_(zone), | 4872 table_(zone), |
4883 bmp_(nullptr), | 4873 bmp_(nullptr), |
4884 lead_surrogates_(nullptr), | 4874 lead_surrogates_(nullptr), |
4885 trail_surrogates_(nullptr), | 4875 trail_surrogates_(nullptr), |
4886 non_bmp_(nullptr) { | 4876 non_bmp_(nullptr) { |
4887 // The unicode range splitter categorizes given character ranges into: | 4877 // The unicode range splitter categorizes given character ranges into: |
4888 // - Code points from the BMP representable by one code unit. | 4878 // - Code points from the BMP representable by one code unit. |
(...skipping 190 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
5079 ? MatchAndNegativeLookaroundInReadDirection( | 5069 ? MatchAndNegativeLookaroundInReadDirection( |
5080 compiler, trail_surrogates, lead_surrogates, on_success, true) | 5070 compiler, trail_surrogates, lead_surrogates, on_success, true) |
5081 // Reading forward. Assert that reading backward, there is no lead | 5071 // Reading forward. Assert that reading backward, there is no lead |
5082 // surrogate, and then forward match the trail surrogate. | 5072 // surrogate, and then forward match the trail surrogate. |
5083 : NegativeLookaroundAgainstReadDirectionAndMatch( | 5073 : NegativeLookaroundAgainstReadDirectionAndMatch( |
5084 compiler, lead_surrogates, trail_surrogates, on_success, false); | 5074 compiler, lead_surrogates, trail_surrogates, on_success, false); |
5085 result->AddAlternative(GuardedAlternative(match)); | 5075 result->AddAlternative(GuardedAlternative(match)); |
5086 } | 5076 } |
5087 | 5077 |
5088 | 5078 |
| 5079 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler, |
| 5080 ZoneList<CharacterRange>* ranges) { |
| 5081 #ifdef V8_I18N_SUPPORT |
| 5082 // Use ICU to compute the case fold closure over the ranges. |
| 5083 DCHECK(compiler->unicode()); |
| 5084 DCHECK(compiler->ignore_case()); |
| 5085 USet* set = uset_openEmpty(); |
| 5086 for (int i = 0; i < ranges->length(); i++) { |
| 5087 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to()); |
| 5088 } |
| 5089 ranges->Clear(); |
| 5090 uset_closeOver(set, USET_CASE_INSENSITIVE); |
| 5091 // Full case mapping map single characters to multiple characters. |
| 5092 // Those are represented as strings in the set. Remove them so that |
| 5093 // we end up with only simple and common case mappings. |
| 5094 uset_removeAllStrings(set); |
| 5095 int item_count = uset_getItemCount(set); |
| 5096 int item_result = 0; |
| 5097 UErrorCode ec = U_ZERO_ERROR; |
| 5098 Zone* zone = compiler->zone(); |
| 5099 for (int i = 0; i < item_count; i++) { |
| 5100 uc32 start = 0; |
| 5101 uc32 end = 0; |
| 5102 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec); |
| 5103 ranges->Add(CharacterRange::Range(start, end), zone); |
| 5104 } |
| 5105 // No errors and everything we collected have been ranges. |
| 5106 DCHECK_EQ(U_ZERO_ERROR, ec); |
| 5107 DCHECK_EQ(0, item_result); |
| 5108 uset_close(set); |
| 5109 CharacterRange::Canonicalize(ranges); |
| 5110 #endif // V8_I18N_SUPPORT |
| 5111 } |
| 5112 |
| 5113 |
5089 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, | 5114 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, |
5090 RegExpNode* on_success) { | 5115 RegExpNode* on_success) { |
5091 set_.Canonicalize(); | 5116 set_.Canonicalize(); |
5092 Zone* zone = compiler->zone(); | 5117 Zone* zone = compiler->zone(); |
5093 ZoneList<CharacterRange>* ranges = this->ranges(zone); | 5118 ZoneList<CharacterRange>* ranges = this->ranges(zone); |
| 5119 if (compiler->unicode() && compiler->ignore_case()) { |
| 5120 AddUnicodeCaseEquivalents(compiler, ranges); |
| 5121 } |
5094 if (compiler->unicode() && !compiler->one_byte()) { | 5122 if (compiler->unicode() && !compiler->one_byte()) { |
5095 if (is_negated()) { | 5123 if (is_negated()) { |
5096 ZoneList<CharacterRange>* negated = | 5124 ZoneList<CharacterRange>* negated = |
5097 new (zone) ZoneList<CharacterRange>(2, zone); | 5125 new (zone) ZoneList<CharacterRange>(2, zone); |
5098 CharacterRange::Negate(ranges, negated, zone); | 5126 CharacterRange::Negate(ranges, negated, zone); |
5099 ranges = negated; | 5127 ranges = negated; |
5100 } | 5128 } |
5101 if (ranges->length() == 0) { | 5129 if (ranges->length() == 0) { |
5102 // No matches possible. | 5130 // No matches possible. |
5103 return new (zone) EndNode(EndNode::BACKTRACK, zone); | 5131 return new (zone) EndNode(EndNode::BACKTRACK, zone); |
(...skipping 1655 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
6759 | 6787 |
6760 | 6788 |
6761 void RegExpResultsCache::Clear(FixedArray* cache) { | 6789 void RegExpResultsCache::Clear(FixedArray* cache) { |
6762 for (int i = 0; i < kRegExpResultsCacheSize; i++) { | 6790 for (int i = 0; i < kRegExpResultsCacheSize; i++) { |
6763 cache->set(i, Smi::FromInt(0)); | 6791 cache->set(i, Smi::FromInt(0)); |
6764 } | 6792 } |
6765 } | 6793 } |
6766 | 6794 |
6767 } // namespace internal | 6795 } // namespace internal |
6768 } // namespace v8 | 6796 } // namespace v8 |
OLD | NEW |