src/regexp/regexp-parser.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Unified Diff: src/regexp/regexp-parser.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fixes Created 4 years, 11 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View side-by-side diff with in-line comments

Download patch

Index: src/regexp/regexp-parser.cc

diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc

index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..be2cf7c42a880671c3f288d2186d4972a52ac9ff 100644

--- a/src/regexp/regexp-parser.cc

+++ b/src/regexp/regexp-parser.cc

@@ -11,6 +11,10 @@

#include "src/regexp/jsregexp.h"

#include "src/utils.h"

+#ifdef V8_I18N_SUPPORT

+#include "unicode/uset.h"

+#endif // V8_I18N_SUPPORT

namespace v8 {

namespace internal {

@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

if (pending_surrogate_ != kNoPendingSurrogate) {

uc16 lead_surrogate = pending_surrogate_;

- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

- ZoneList<uc16> surrogate_pair(2, zone());

- surrogate_pair.Add(lead_surrogate, zone());

- surrogate_pair.Add(trail_surrogate, zone());

- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

pending_surrogate_ = kNoPendingSurrogate;

- AddAtom(atom);

+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

+ uc32 combined =

+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);

+ if (NeedsDesugaringForIgnoreCase(combined)) {

+ AddCharacterClass(combined);

+ } else {

+ ZoneList<uc16> surrogate_pair(2, zone());

+ surrogate_pair.Add(lead_surrogate, zone());

+ surrogate_pair.Add(trail_surrogate, zone());

+ RegExpAtom* atom =

+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

+ AddAtom(atom);

+ }

} else {

pending_surrogate_ = trail_surrogate;

FlushPendingSurrogate();

@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

void RegExpBuilder::FlushPendingSurrogate() {

if (pending_surrogate_ != kNoPendingSurrogate) {

- // Use character class to desugar lone surrogate matching.

- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(

- CharacterRange::List(zone(),

- CharacterRange::Singleton(pending_surrogate_)),

- false);

- pending_surrogate_ = kNoPendingSurrogate;

DCHECK(unicode());

- AddCharacterClass(cc);

+ uc32 c = pending_surrogate_;

+ pending_surrogate_ = kNoPendingSurrogate;

+ AddCharacterClass(c);

}

@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() {

void RegExpBuilder::AddCharacter(uc16 c) {

FlushPendingSurrogate();

pending_empty_ = false;

- if (characters_ == NULL) {

- characters_ = new (zone()) ZoneList<uc16>(4, zone());

+ if (NeedsDesugaringForIgnoreCase(c)) {

+ AddCharacterClass(c);

+ } else {

+ if (characters_ == NULL) {

+ characters_ = new (zone()) ZoneList<uc16>(4, zone());

+ }

+ characters_->Add(c, zone());

+ LAST(ADD_CHAR);

}

- characters_->Add(c, zone());

- LAST(ADD_CHAR);

}

@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {

+ if (NeedsDesugaringForUnicode(cc)) {

// In unicode mode, character class needs to be desugared, so it

// must be a standalone term instead of being part of a RegExpText.

AddTerm(cc);

@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

}

+void RegExpBuilder::AddCharacterClass(uc32 c) {

+ AddCharacterClass(new (zone()) RegExpCharacterClass(

+ CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));

void RegExpBuilder::AddAtom(RegExpTree* term) {

if (term->IsEmpty()) {

AddEmpty();

@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() {

}

+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {

+ if (!unicode()) return false;

+ switch (cc->standard_type()) {

+ case 's': // white space

+ case 'w': // ASCII word character

+ case 'd': // ASCII digit

+ return false; // These characters do not need desugaring.

+ default:

+ break;

+ }

+ ZoneList<CharacterRange>* ranges = cc->ranges(zone());

+ CharacterRange::Canonicalize(ranges);

+ for (int i = ranges->length() - 1; i >= 0; i--) {

+ uc32 from = ranges->at(i).from();

+ uc32 to = ranges->at(i).to();

+ // Check for non-BMP characters.

+ if (to >= kNonBmpStart) return true;

+ // Check for lone surrogates.

+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

+ }

+ return false;

+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {

+#ifdef V8_I18N_SUPPORT

+ if (unicode() && ignore_case()) {

+ USet* set = uset_open(c, c);

+ uset_closeOver(set, USET_CASE_INSENSITIVE);

+ uset_removeAllStrings(set);

+ bool result = uset_size(set) > 1;

+ uset_close(set);

+ return result;

+ }

+ // In the case where ICU is not included, we act as if the unicode flag is

+ // not set, and do not desugar.

+#endif // V8_I18N_SUPPORT

+ return false;

RegExpTree* RegExpBuilder::ToRegExp() {

FlushTerms();

int num_alternatives = alternatives_.length();

« src/regexp/jsregexp.cc ('K') | « src/regexp/regexp-parser.h ('k') | src/regexp/x64/regexp-macro-assembler-x64.h » ('j') | no next file with comments »