Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(73)

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: fix mips Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2012 the V8 project authors. All rights reserved. 1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/jsregexp.h" 5 #include "src/regexp/jsregexp.h"
6 6
7 #include "src/ast/ast.h" 7 #include "src/ast/ast.h"
8 #include "src/base/platform/platform.h" 8 #include "src/base/platform/platform.h"
9 #include "src/compilation-cache.h" 9 #include "src/compilation-cache.h"
10 #include "src/compiler.h" 10 #include "src/compiler.h"
11 #include "src/execution.h" 11 #include "src/execution.h"
12 #include "src/factory.h" 12 #include "src/factory.h"
13 #include "src/isolate-inl.h" 13 #include "src/isolate-inl.h"
14 #include "src/messages.h" 14 #include "src/messages.h"
15 #include "src/ostreams.h" 15 #include "src/ostreams.h"
16 #include "src/regexp/interpreter-irregexp.h" 16 #include "src/regexp/interpreter-irregexp.h"
17 #include "src/regexp/jsregexp-inl.h" 17 #include "src/regexp/jsregexp-inl.h"
18 #include "src/regexp/regexp-macro-assembler.h" 18 #include "src/regexp/regexp-macro-assembler.h"
19 #include "src/regexp/regexp-macro-assembler-irregexp.h" 19 #include "src/regexp/regexp-macro-assembler-irregexp.h"
20 #include "src/regexp/regexp-macro-assembler-tracer.h" 20 #include "src/regexp/regexp-macro-assembler-tracer.h"
21 #include "src/regexp/regexp-parser.h" 21 #include "src/regexp/regexp-parser.h"
22 #include "src/regexp/regexp-stack.h" 22 #include "src/regexp/regexp-stack.h"
23 #include "src/runtime/runtime.h" 23 #include "src/runtime/runtime.h"
24 #include "src/splay-tree-inl.h" 24 #include "src/splay-tree-inl.h"
25 #include "src/string-search.h" 25 #include "src/string-search.h"
26 #include "src/unicode-decoder.h" 26 #include "src/unicode-decoder.h"
27 27
28 #ifdef V8_I18N_SUPPORT
29 #include "unicode/uset.h"
30 #include "unicode/utypes.h"
31 #endif // V8_I18N_SUPPORT
32
28 #ifndef V8_INTERPRETED_REGEXP 33 #ifndef V8_INTERPRETED_REGEXP
29 #if V8_TARGET_ARCH_IA32 34 #if V8_TARGET_ARCH_IA32
30 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h" 35 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
31 #elif V8_TARGET_ARCH_X64 36 #elif V8_TARGET_ARCH_X64
32 #include "src/regexp/x64/regexp-macro-assembler-x64.h" 37 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
33 #elif V8_TARGET_ARCH_ARM64 38 #elif V8_TARGET_ARCH_ARM64
34 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h" 39 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
35 #elif V8_TARGET_ARCH_ARM 40 #elif V8_TARGET_ARCH_ARM
36 #include "src/regexp/arm/regexp-macro-assembler-arm.h" 41 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
37 #elif V8_TARGET_ARCH_PPC 42 #elif V8_TARGET_ARCH_PPC
(...skipping 4345 matching lines...) Expand 10 before | Expand all | Expand 10 after
4383 } 4388 }
4384 4389
4385 LimitResult limit_result = LimitVersions(compiler, trace); 4390 LimitResult limit_result = LimitVersions(compiler, trace);
4386 if (limit_result == DONE) return; 4391 if (limit_result == DONE) return;
4387 DCHECK(limit_result == CONTINUE); 4392 DCHECK(limit_result == CONTINUE);
4388 4393
4389 RecursionCheck rc(compiler); 4394 RecursionCheck rc(compiler);
4390 4395
4391 DCHECK_EQ(start_reg_ + 1, end_reg_); 4396 DCHECK_EQ(start_reg_ + 1, end_reg_);
4392 if (compiler->ignore_case()) { 4397 if (compiler->ignore_case()) {
4393 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(), 4398 assembler->CheckNotBackReferenceIgnoreCase(
4394 trace->backtrack()); 4399 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());
4395 } else { 4400 } else {
4396 assembler->CheckNotBackReference(start_reg_, read_backward(), 4401 assembler->CheckNotBackReference(start_reg_, read_backward(),
4397 trace->backtrack()); 4402 trace->backtrack());
4398 } 4403 }
4399 // We are going to advance backward, so we may end up at the start. 4404 // We are going to advance backward, so we may end up at the start.
4400 if (read_backward()) trace->set_at_start(Trace::UNKNOWN); 4405 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);
4401 on_success()->Emit(compiler, trace); 4406 on_success()->Emit(compiler, trace);
4402 } 4407 }
4403 4408
4404 4409
(...skipping 449 matching lines...) Expand 10 before | Expand all | Expand 10 after
4854 return true; 4859 return true;
4855 } 4860 }
4856 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) { 4861 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {
4857 set_.set_standard_set_type('W'); 4862 set_.set_standard_set_type('W');
4858 return true; 4863 return true;
4859 } 4864 }
4860 return false; 4865 return false;
4861 } 4866 }
4862 4867
4863 4868
4864 bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {
4865 ZoneList<CharacterRange>* ranges = this->ranges(zone);
4866 CharacterRange::Canonicalize(ranges);
4867 for (int i = ranges->length() - 1; i >= 0; i--) {
4868 uc32 from = ranges->at(i).from();
4869 uc32 to = ranges->at(i).to();
4870 // Check for non-BMP characters.
4871 if (to >= kNonBmpStart) return true;
4872 // Check for lone surrogates.
4873 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
4874 }
4875 return false;
4876 }
4877
4878
4879 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone, 4869 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,
4880 ZoneList<CharacterRange>* base) 4870 ZoneList<CharacterRange>* base)
4881 : zone_(zone), 4871 : zone_(zone),
4882 table_(zone), 4872 table_(zone),
4883 bmp_(nullptr), 4873 bmp_(nullptr),
4884 lead_surrogates_(nullptr), 4874 lead_surrogates_(nullptr),
4885 trail_surrogates_(nullptr), 4875 trail_surrogates_(nullptr),
4886 non_bmp_(nullptr) { 4876 non_bmp_(nullptr) {
4887 // The unicode range splitter categorizes given character ranges into: 4877 // The unicode range splitter categorizes given character ranges into:
4888 // - Code points from the BMP representable by one code unit. 4878 // - Code points from the BMP representable by one code unit.
(...skipping 190 matching lines...) Expand 10 before | Expand all | Expand 10 after
5079 ? MatchAndNegativeLookaroundInReadDirection( 5069 ? MatchAndNegativeLookaroundInReadDirection(
5080 compiler, trail_surrogates, lead_surrogates, on_success, true) 5070 compiler, trail_surrogates, lead_surrogates, on_success, true)
5081 // Reading forward. Assert that reading backward, there is no lead 5071 // Reading forward. Assert that reading backward, there is no lead
5082 // surrogate, and then forward match the trail surrogate. 5072 // surrogate, and then forward match the trail surrogate.
5083 : NegativeLookaroundAgainstReadDirectionAndMatch( 5073 : NegativeLookaroundAgainstReadDirectionAndMatch(
5084 compiler, lead_surrogates, trail_surrogates, on_success, false); 5074 compiler, lead_surrogates, trail_surrogates, on_success, false);
5085 result->AddAlternative(GuardedAlternative(match)); 5075 result->AddAlternative(GuardedAlternative(match));
5086 } 5076 }
5087 5077
5088 5078
5079 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,
5080 ZoneList<CharacterRange>* ranges) {
5081 #ifdef V8_I18N_SUPPORT
5082 // Use ICU to compute the case fold closure over the ranges.
5083 DCHECK(compiler->unicode());
5084 DCHECK(compiler->ignore_case());
5085 USet* set = uset_openEmpty();
5086 for (int i = 0; i < ranges->length(); i++) {
5087 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());
5088 }
5089 ranges->Clear();
5090 uset_closeOver(set, USET_CASE_INSENSITIVE);
5091 // Full case mappings map single characters to multiple characters.
5092 // Those are represented as strings in the set. Remove them so that
5093 // we end up with only simple and common case mappings.
5094 uset_removeAllStrings(set);
5095 int item_count = uset_getItemCount(set);
5096 int item_result = 0;
5097 UErrorCode ec = U_ZERO_ERROR;
5098 Zone* zone = compiler->zone();
5099 for (int i = 0; i < item_count; i++) {
5100 uc32 start = 0;
5101 uc32 end = 0;
5102 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);
5103 ranges->Add(CharacterRange::Range(start, end), zone);
5104 }
5105 // No errors occurred, and every item we collected has been a range.
5106 DCHECK_EQ(U_ZERO_ERROR, ec);
5107 DCHECK_EQ(0, item_result);
5108 uset_close(set);
5109 CharacterRange::Canonicalize(ranges);
5110 #endif // V8_I18N_SUPPORT
5111 }
5112
5113
5089 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler, 5114 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,
5090 RegExpNode* on_success) { 5115 RegExpNode* on_success) {
5091 set_.Canonicalize(); 5116 set_.Canonicalize();
5092 Zone* zone = compiler->zone(); 5117 Zone* zone = compiler->zone();
5093 ZoneList<CharacterRange>* ranges = this->ranges(zone); 5118 ZoneList<CharacterRange>* ranges = this->ranges(zone);
5119 if (compiler->unicode() && compiler->ignore_case()) {
5120 AddUnicodeCaseEquivalents(compiler, ranges);
5121 }
5094 if (compiler->unicode() && !compiler->one_byte()) { 5122 if (compiler->unicode() && !compiler->one_byte()) {
5095 if (is_negated()) { 5123 if (is_negated()) {
5096 ZoneList<CharacterRange>* negated = 5124 ZoneList<CharacterRange>* negated =
5097 new (zone) ZoneList<CharacterRange>(2, zone); 5125 new (zone) ZoneList<CharacterRange>(2, zone);
5098 CharacterRange::Negate(ranges, negated, zone); 5126 CharacterRange::Negate(ranges, negated, zone);
5099 ranges = negated; 5127 ranges = negated;
5100 } 5128 }
5101 if (ranges->length() == 0) { 5129 if (ranges->length() == 0) {
5102 // No matches possible. 5130 // No matches possible.
5103 return new (zone) EndNode(EndNode::BACKTRACK, zone); 5131 return new (zone) EndNode(EndNode::BACKTRACK, zone);
(...skipping 1655 matching lines...) Expand 10 before | Expand all | Expand 10 after
6759 6787
6760 6788
6761 void RegExpResultsCache::Clear(FixedArray* cache) { 6789 void RegExpResultsCache::Clear(FixedArray* cache) {
6762 for (int i = 0; i < kRegExpResultsCacheSize; i++) { 6790 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
6763 cache->set(i, Smi::FromInt(0)); 6791 cache->set(i, Smi::FromInt(0));
6764 } 6792 }
6765 } 6793 }
6766 6794
6767 } // namespace internal 6795 } // namespace internal
6768 } // namespace v8 6796 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698