src/regexp/jsregexp.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Side by Side Diff: src/regexp/jsregexp.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fix mips Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2012 the V8 project authors. All rights reserved.	1 // Copyright 2012 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/jsregexp.h"	5 #include "src/regexp/jsregexp.h"

6	6

7 #include "src/ast/ast.h"	7 #include "src/ast/ast.h"

8 #include "src/base/platform/platform.h"	8 #include "src/base/platform/platform.h"

9 #include "src/compilation-cache.h"	9 #include "src/compilation-cache.h"

10 #include "src/compiler.h"	10 #include "src/compiler.h"

11 #include "src/execution.h"	11 #include "src/execution.h"

12 #include "src/factory.h"	12 #include "src/factory.h"

13 #include "src/isolate-inl.h"	13 #include "src/isolate-inl.h"

14 #include "src/messages.h"	14 #include "src/messages.h"

15 #include "src/ostreams.h"	15 #include "src/ostreams.h"

16 #include "src/regexp/interpreter-irregexp.h"	16 #include "src/regexp/interpreter-irregexp.h"

17 #include "src/regexp/jsregexp-inl.h"	17 #include "src/regexp/jsregexp-inl.h"

18 #include "src/regexp/regexp-macro-assembler.h"	18 #include "src/regexp/regexp-macro-assembler.h"

19 #include "src/regexp/regexp-macro-assembler-irregexp.h"	19 #include "src/regexp/regexp-macro-assembler-irregexp.h"

20 #include "src/regexp/regexp-macro-assembler-tracer.h"	20 #include "src/regexp/regexp-macro-assembler-tracer.h"

21 #include "src/regexp/regexp-parser.h"	21 #include "src/regexp/regexp-parser.h"

22 #include "src/regexp/regexp-stack.h"	22 #include "src/regexp/regexp-stack.h"

23 #include "src/runtime/runtime.h"	23 #include "src/runtime/runtime.h"

24 #include "src/splay-tree-inl.h"	24 #include "src/splay-tree-inl.h"

25 #include "src/string-search.h"	25 #include "src/string-search.h"

26 #include "src/unicode-decoder.h"	26 #include "src/unicode-decoder.h"

27	27

	28 #ifdef V8_I18N_SUPPORT

	29 #include "unicode/uset.h"

	30 #include "unicode/utypes.h"

	31 #endif // V8_I18N_SUPPORT

	32

28 #ifndef V8_INTERPRETED_REGEXP	33 #ifndef V8_INTERPRETED_REGEXP

29 #if V8_TARGET_ARCH_IA32	34 #if V8_TARGET_ARCH_IA32

30 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"	35 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"

31 #elif V8_TARGET_ARCH_X64	36 #elif V8_TARGET_ARCH_X64

32 #include "src/regexp/x64/regexp-macro-assembler-x64.h"	37 #include "src/regexp/x64/regexp-macro-assembler-x64.h"

33 #elif V8_TARGET_ARCH_ARM64	38 #elif V8_TARGET_ARCH_ARM64

34 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"	39 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"

35 #elif V8_TARGET_ARCH_ARM	40 #elif V8_TARGET_ARCH_ARM

36 #include "src/regexp/arm/regexp-macro-assembler-arm.h"	41 #include "src/regexp/arm/regexp-macro-assembler-arm.h"

37 #elif V8_TARGET_ARCH_PPC	42 #elif V8_TARGET_ARCH_PPC

(...skipping 4345 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4383 }	4388 }

4384	4389

4385 LimitResult limit_result = LimitVersions(compiler, trace);	4390 LimitResult limit_result = LimitVersions(compiler, trace);

4386 if (limit_result == DONE) return;	4391 if (limit_result == DONE) return;

4387 DCHECK(limit_result == CONTINUE);	4392 DCHECK(limit_result == CONTINUE);

4388	4393

4389 RecursionCheck rc(compiler);	4394 RecursionCheck rc(compiler);

4390	4395

4391 DCHECK_EQ(start_reg_ + 1, end_reg_);	4396 DCHECK_EQ(start_reg_ + 1, end_reg_);

4392 if (compiler->ignore_case()) {	4397 if (compiler->ignore_case()) {

4393 assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),	4398 assembler->CheckNotBackReferenceIgnoreCase(

4394 trace->backtrack());	4399 start_reg_, read_backward(), compiler->unicode(), trace->backtrack());

4395 } else {	4400 } else {

4396 assembler->CheckNotBackReference(start_reg_, read_backward(),	4401 assembler->CheckNotBackReference(start_reg_, read_backward(),

4397 trace->backtrack());	4402 trace->backtrack());

4398 }	4403 }

4399 // We are going to advance backward, so we may end up at the start.	4404 // We are going to advance backward, so we may end up at the start.

4400 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);	4405 if (read_backward()) trace->set_at_start(Trace::UNKNOWN);

4401 on_success()->Emit(compiler, trace);	4406 on_success()->Emit(compiler, trace);

4402 }	4407 }

4403	4408

4404	4409

(...skipping 449 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
4854 return true;	4859 return true;

4855 }	4860 }

4856 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {	4861 if (CompareInverseRanges(set_.ranges(zone), kWordRanges, kWordRangeCount)) {

4857 set_.set_standard_set_type('W');	4862 set_.set_standard_set_type('W');

4858 return true;	4863 return true;

4859 }	4864 }

4860 return false;	4865 return false;

4861 }	4866 }

4862	4867

4863	4868

4864 bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {

4865 ZoneList<CharacterRange>* ranges = this->ranges(zone);

4866 CharacterRange::Canonicalize(ranges);

4867 for (int i = ranges->length() - 1; i >= 0; i--) {

4868 uc32 from = ranges->at(i).from();

4869 uc32 to = ranges->at(i).to();

4870 // Check for non-BMP characters.

4871 if (to >= kNonBmpStart) return true;

4872 // Check for lone surrogates.

4873 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

4874 }

4875 return false;

4876 }

4877

4878

4879 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,	4869 UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,

4880 ZoneList<CharacterRange>* base)	4870 ZoneList<CharacterRange>* base)

4881 : zone_(zone),	4871 : zone_(zone),

4882 table_(zone),	4872 table_(zone),

4883 bmp_(nullptr),	4873 bmp_(nullptr),

4884 lead_surrogates_(nullptr),	4874 lead_surrogates_(nullptr),

4885 trail_surrogates_(nullptr),	4875 trail_surrogates_(nullptr),

4886 non_bmp_(nullptr) {	4876 non_bmp_(nullptr) {

4887 // The unicode range splitter categorizes given character ranges into:	4877 // The unicode range splitter categorizes given character ranges into:

4888 // - Code points from the BMP representable by one code unit.	4878 // - Code points from the BMP representable by one code unit.

(...skipping 190 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
5079 ? MatchAndNegativeLookaroundInReadDirection(	5069 ? MatchAndNegativeLookaroundInReadDirection(

5080 compiler, trail_surrogates, lead_surrogates, on_success, true)	5070 compiler, trail_surrogates, lead_surrogates, on_success, true)

5081 // Reading forward. Assert that reading backward, there is no lead	5071 // Reading forward. Assert that reading backward, there is no lead

5082 // surrogate, and then forward match the trail surrogate.	5072 // surrogate, and then forward match the trail surrogate.

5083 : NegativeLookaroundAgainstReadDirectionAndMatch(	5073 : NegativeLookaroundAgainstReadDirectionAndMatch(

5084 compiler, lead_surrogates, trail_surrogates, on_success, false);	5074 compiler, lead_surrogates, trail_surrogates, on_success, false);

5085 result->AddAlternative(GuardedAlternative(match));	5075 result->AddAlternative(GuardedAlternative(match));

5086 }	5076 }

5087	5077

5088	5078

	5079 void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,

	5080 ZoneList<CharacterRange>* ranges) {

	5081 #ifdef V8_I18N_SUPPORT

	5082 // Use ICU to compute the case fold closure over the ranges.

	5083 DCHECK(compiler->unicode());

	5084 DCHECK(compiler->ignore_case());

	5085 USet* set = uset_openEmpty();

	5086 for (int i = 0; i < ranges->length(); i++) {

	5087 uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());

	5088 }

	5089 ranges->Clear();

	5090 uset_closeOver(set, USET_CASE_INSENSITIVE);

	5091 // Full case mappings map single characters to multiple characters.

	5092 // Those are represented as strings in the set. Remove them so that

	5093 // we end up with only simple and common case mappings.

	5094 uset_removeAllStrings(set);

	5095 int item_count = uset_getItemCount(set);

	5096 int item_result = 0;

	5097 UErrorCode ec = U_ZERO_ERROR;

	5098 Zone* zone = compiler->zone();

	5099 for (int i = 0; i < item_count; i++) {

	5100 uc32 start = 0;

	5101 uc32 end = 0;

	5102 item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);

	5103 ranges->Add(CharacterRange::Range(start, end), zone);

	5104 }

	5105 // No errors, and every item we collected was a range.

	5106 DCHECK_EQ(U_ZERO_ERROR, ec);

	5107 DCHECK_EQ(0, item_result);

	5108 uset_close(set);

	5109 CharacterRange::Canonicalize(ranges);

	5110 #endif // V8_I18N_SUPPORT

	5111 }

	5112

	5113

5089 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,	5114 RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

5090 RegExpNode* on_success) {	5115 RegExpNode* on_success) {

5091 set_.Canonicalize();	5116 set_.Canonicalize();

5092 Zone* zone = compiler->zone();	5117 Zone* zone = compiler->zone();

5093 ZoneList<CharacterRange>* ranges = this->ranges(zone);	5118 ZoneList<CharacterRange>* ranges = this->ranges(zone);

	5119 if (compiler->unicode() && compiler->ignore_case()) {

	5120 AddUnicodeCaseEquivalents(compiler, ranges);

	5121 }

5094 if (compiler->unicode() && !compiler->one_byte()) {	5122 if (compiler->unicode() && !compiler->one_byte()) {

5095 if (is_negated()) {	5123 if (is_negated()) {

5096 ZoneList<CharacterRange>* negated =	5124 ZoneList<CharacterRange>* negated =

5097 new (zone) ZoneList<CharacterRange>(2, zone);	5125 new (zone) ZoneList<CharacterRange>(2, zone);

5098 CharacterRange::Negate(ranges, negated, zone);	5126 CharacterRange::Negate(ranges, negated, zone);

5099 ranges = negated;	5127 ranges = negated;

5100 }	5128 }

5101 if (ranges->length() == 0) {	5129 if (ranges->length() == 0) {

5102 // No matches possible.	5130 // No matches possible.

5103 return new (zone) EndNode(EndNode::BACKTRACK, zone);	5131 return new (zone) EndNode(EndNode::BACKTRACK, zone);

(...skipping 1655 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
6759	6787

6760	6788

6761 void RegExpResultsCache::Clear(FixedArray* cache) {	6789 void RegExpResultsCache::Clear(FixedArray* cache) {

6762 for (int i = 0; i < kRegExpResultsCacheSize; i++) {	6790 for (int i = 0; i < kRegExpResultsCacheSize; i++) {

6763 cache->set(i, Smi::FromInt(0));	6791 cache->set(i, Smi::FromInt(0));

6764 }	6792 }

6765 }	6793 }

6766	6794

6767 } // namespace internal	6795 } // namespace internal

6768 } // namespace v8	6796 } // namespace v8

OLD	NEW

« no previous file with comments | « src/regexp/interpreter-irregexp.cc ('k') | src/regexp/mips/regexp-macro-assembler-mips.h » ('j') | src/regexp/regexp-macro-assembler.cc » ('J')