Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(502)

Unified Diff: src/regexp/regexp-parser.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: fixes Created 4 years, 11 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View side-by-side diff with in-line comments
Download patch
Index: src/regexp/regexp-parser.cc
diff --git a/src/regexp/regexp-parser.cc b/src/regexp/regexp-parser.cc
index 07d5779675786b0dfbec11fb7a8cf8fa19f3aecb..be2cf7c42a880671c3f288d2186d4972a52ac9ff 100644
--- a/src/regexp/regexp-parser.cc
+++ b/src/regexp/regexp-parser.cc
@@ -11,6 +11,10 @@
#include "src/regexp/jsregexp.h"
#include "src/utils.h"
+#ifdef V8_I18N_SUPPORT
+#include "unicode/uset.h"
+#endif // V8_I18N_SUPPORT
+
namespace v8 {
namespace internal {
@@ -1064,13 +1068,20 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
if (pending_surrogate_ != kNoPendingSurrogate) {
uc16 lead_surrogate = pending_surrogate_;
- DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
- ZoneList<uc16> surrogate_pair(2, zone());
- surrogate_pair.Add(lead_surrogate, zone());
- surrogate_pair.Add(trail_surrogate, zone());
- RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
pending_surrogate_ = kNoPendingSurrogate;
- AddAtom(atom);
+ DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
+ uc32 combined =
+ unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
+ if (NeedsDesugaringForIgnoreCase(combined)) {
+ AddCharacterClass(combined);
+ } else {
+ ZoneList<uc16> surrogate_pair(2, zone());
+ surrogate_pair.Add(lead_surrogate, zone());
+ surrogate_pair.Add(trail_surrogate, zone());
+ RegExpAtom* atom =
+ new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
+ AddAtom(atom);
+ }
} else {
pending_surrogate_ = trail_surrogate;
FlushPendingSurrogate();
@@ -1080,14 +1091,10 @@ void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
void RegExpBuilder::FlushPendingSurrogate() {
if (pending_surrogate_ != kNoPendingSurrogate) {
- // Use character class to desugar lone surrogate matching.
- RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(
- CharacterRange::List(zone(),
- CharacterRange::Singleton(pending_surrogate_)),
- false);
- pending_surrogate_ = kNoPendingSurrogate;
DCHECK(unicode());
- AddCharacterClass(cc);
+ uc32 c = pending_surrogate_;
+ pending_surrogate_ = kNoPendingSurrogate;
+ AddCharacterClass(c);
}
}
@@ -1123,11 +1130,15 @@ void RegExpBuilder::FlushText() {
void RegExpBuilder::AddCharacter(uc16 c) {
FlushPendingSurrogate();
pending_empty_ = false;
- if (characters_ == NULL) {
- characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ if (NeedsDesugaringForIgnoreCase(c)) {
+ AddCharacterClass(c);
+ } else {
+ if (characters_ == NULL) {
+ characters_ = new (zone()) ZoneList<uc16>(4, zone());
+ }
+ characters_->Add(c, zone());
+ LAST(ADD_CHAR);
}
- characters_->Add(c, zone());
- LAST(ADD_CHAR);
}
@@ -1150,7 +1161,7 @@ void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
- if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {
+ if (NeedsDesugaringForUnicode(cc)) {
// In unicode mode, character class needs to be desugared, so it
// must be a standalone term instead of being part of a RegExpText.
AddTerm(cc);
@@ -1160,6 +1171,12 @@ void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
}
+void RegExpBuilder::AddCharacterClass(uc32 c) {
+ AddCharacterClass(new (zone()) RegExpCharacterClass(
+ CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
+}
+
+
void RegExpBuilder::AddAtom(RegExpTree* term) {
if (term->IsEmpty()) {
AddEmpty();
@@ -1210,6 +1227,47 @@ void RegExpBuilder::FlushTerms() {
}
+bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
+ if (!unicode()) return false;
+ switch (cc->standard_type()) {
+ case 's': // white space
+ case 'w': // ASCII word character
+ case 'd': // ASCII digit
+ return false; // These characters do not need desugaring.
+ default:
+ break;
+ }
+ ZoneList<CharacterRange>* ranges = cc->ranges(zone());
+ CharacterRange::Canonicalize(ranges);
+ for (int i = ranges->length() - 1; i >= 0; i--) {
+ uc32 from = ranges->at(i).from();
+ uc32 to = ranges->at(i).to();
+ // Check for non-BMP characters.
+ if (to >= kNonBmpStart) return true;
+ // Check for lone surrogates.
+ if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
+ }
+ return false;
+}
+
+
+bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
+#ifdef V8_I18N_SUPPORT
+ if (unicode() && ignore_case()) {
+ USet* set = uset_open(c, c);
+ uset_closeOver(set, USET_CASE_INSENSITIVE);
+ uset_removeAllStrings(set);
+ bool result = uset_size(set) > 1;
+ uset_close(set);
+ return result;
+ }
+ // Reached when not both unicode and ignore-case. Builds without ICU also end
+ // up here: they act as if the unicode flag is not set, and do not desugar.
+#endif // V8_I18N_SUPPORT
+ return false;
+}
+
+
RegExpTree* RegExpBuilder::ToRegExp() {
FlushTerms();
int num_alternatives = alternatives_.length();

Powered by Google App Engine
This is Rietveld 408576698