src/regexp/jsregexp.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Unified Diff: src/regexp/jsregexp.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fix mips Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/jsregexp.cc

diff --git a/src/regexp/jsregexp.cc b/src/regexp/jsregexp.cc

index 3559bcd111a4c19eb5755dd264d6a4a8cf39a88a..2229dd8e175cc9483741ad75bff773a0c0a7797f 100644

--- a/src/regexp/jsregexp.cc

+++ b/src/regexp/jsregexp.cc

@@ -25,6 +25,11 @@

#include "src/string-search.h"

#include "src/unicode-decoder.h"

+#ifdef V8_I18N_SUPPORT

+#include "unicode/uset.h"

+#include "unicode/utypes.h"

+#endif // V8_I18N_SUPPORT

#ifndef V8_INTERPRETED_REGEXP

#if V8_TARGET_ARCH_IA32

#include "src/regexp/ia32/regexp-macro-assembler-ia32.h"

@@ -4390,8 +4395,8 @@ void BackReferenceNode::Emit(RegExpCompiler* compiler, Trace* trace) {

DCHECK_EQ(start_reg_ + 1, end_reg_);

if (compiler->ignore_case()) {

- assembler->CheckNotBackReferenceIgnoreCase(start_reg_, read_backward(),

- trace->backtrack());

+ assembler->CheckNotBackReferenceIgnoreCase(

+ start_reg_, read_backward(), compiler->unicode(), trace->backtrack());

} else {

assembler->CheckNotBackReference(start_reg_, read_backward(),

trace->backtrack());

@@ -4861,21 +4866,6 @@ bool RegExpCharacterClass::is_standard(Zone* zone) {

}

-bool RegExpCharacterClass::NeedsDesugaringForUnicode(Zone* zone) {

- ZoneList<CharacterRange>* ranges = this->ranges(zone);

- CharacterRange::Canonicalize(ranges);

- for (int i = ranges->length() - 1; i >= 0; i--) {

- uc32 from = ranges->at(i).from();

- uc32 to = ranges->at(i).to();

- // Check for non-BMP characters.

- if (to >= kNonBmpStart) return true;

- // Check for lone surrogates.

- if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

- }

- return false;

UnicodeRangeSplitter::UnicodeRangeSplitter(Zone* zone,

ZoneList<CharacterRange>* base)

: zone_(zone),

@@ -5086,11 +5076,49 @@ void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode* result,

}

+void AddUnicodeCaseEquivalents(RegExpCompiler* compiler,

+ ZoneList<CharacterRange>* ranges) {

+#ifdef V8_I18N_SUPPORT

+ // Use ICU to compute the case fold closure over the ranges.

+ DCHECK(compiler->unicode());

+ DCHECK(compiler->ignore_case());

+ USet* set = uset_openEmpty();

+ for (int i = 0; i < ranges->length(); i++) {

+ uset_addRange(set, ranges->at(i).from(), ranges->at(i).to());

+ }

+ ranges->Clear();

+ uset_closeOver(set, USET_CASE_INSENSITIVE);

+ // Full case mappings map single characters to multiple characters.

+ // Those are represented as strings in the set. Remove them so that

+ // we end up with only simple and common case mappings.

+ uset_removeAllStrings(set);

+ int item_count = uset_getItemCount(set);

+ int item_result = 0;

+ UErrorCode ec = U_ZERO_ERROR;

+ Zone* zone = compiler->zone();

+ for (int i = 0; i < item_count; i++) {

+ uc32 start = 0;

+ uc32 end = 0;

+ item_result += uset_getItem(set, i, &start, &end, nullptr, 0, &ec);

+ ranges->Add(CharacterRange::Range(start, end), zone);

+ }

+ // No errors occurred, and every item we collected was a range.

+ DCHECK_EQ(U_ZERO_ERROR, ec);

+ DCHECK_EQ(0, item_result);

+ uset_close(set);

+ CharacterRange::Canonicalize(ranges);

+#endif // V8_I18N_SUPPORT

RegExpNode* RegExpCharacterClass::ToNode(RegExpCompiler* compiler,

RegExpNode* on_success) {

set_.Canonicalize();

Zone* zone = compiler->zone();

ZoneList<CharacterRange>* ranges = this->ranges(zone);

+ if (compiler->unicode() && compiler->ignore_case()) {

+ AddUnicodeCaseEquivalents(compiler, ranges);

+ }

if (compiler->unicode() && !compiler->one_byte()) {

if (is_negated()) {

ZoneList<CharacterRange>* negated =

« no previous file with comments | « src/regexp/interpreter-irregexp.cc ('k') | src/regexp/mips/regexp-macro-assembler-mips.h » ('j') | src/regexp/regexp-macro-assembler.cc » ('J')