Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(352)

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass
Patch Set: fixes Created 4 years, 10 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2016 the V8 project authors. All rights reserved. 1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/regexp/regexp-parser.h" 5 #include "src/regexp/regexp-parser.h"
6 6
7 #include "src/char-predicates-inl.h" 7 #include "src/char-predicates-inl.h"
8 #include "src/factory.h" 8 #include "src/factory.h"
9 #include "src/isolate.h" 9 #include "src/isolate.h"
10 #include "src/objects-inl.h" 10 #include "src/objects-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/utils.h" 12 #include "src/utils.h"
13 13
14 #ifdef V8_I18N_SUPPORT
15 #include "unicode/uset.h"
16 #endif // V8_I18N_SUPPORT
17
14 namespace v8 { 18 namespace v8 {
15 namespace internal { 19 namespace internal {
16 20
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, 21 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,
18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) 22 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)
19 : isolate_(isolate), 23 : isolate_(isolate),
20 zone_(zone), 24 zone_(zone),
21 error_(error), 25 error_(error),
22 captures_(NULL), 26 captures_(NULL),
23 in_(in), 27 in_(in),
(...skipping 1033 matching lines...) Expand 10 before | Expand all | Expand 10 after
1057 FlushPendingSurrogate(); 1061 FlushPendingSurrogate();
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. 1062 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.
1059 pending_surrogate_ = lead_surrogate; 1063 pending_surrogate_ = lead_surrogate;
1060 } 1064 }
1061 1065
1062 1066
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { 1067 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); 1068 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));
1065 if (pending_surrogate_ != kNoPendingSurrogate) { 1069 if (pending_surrogate_ != kNoPendingSurrogate) {
1066 uc16 lead_surrogate = pending_surrogate_; 1070 uc16 lead_surrogate = pending_surrogate_;
1071 pending_surrogate_ = kNoPendingSurrogate;
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); 1072 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));
1068 ZoneList<uc16> surrogate_pair(2, zone()); 1073 uc32 combined =
1069 surrogate_pair.Add(lead_surrogate, zone()); 1074 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);
1070 surrogate_pair.Add(trail_surrogate, zone()); 1075 if (NeedsDesugaringForIgnoreCase(combined)) {
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); 1076 AddCharacterClass(combined);
1072 pending_surrogate_ = kNoPendingSurrogate; 1077 } else {
1073 AddAtom(atom); 1078 ZoneList<uc16> surrogate_pair(2, zone());
1079 surrogate_pair.Add(lead_surrogate, zone());
1080 surrogate_pair.Add(trail_surrogate, zone());
1081 RegExpAtom* atom =
1082 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());
1083 AddAtom(atom);
1084 }
1074 } else { 1085 } else {
1075 pending_surrogate_ = trail_surrogate; 1086 pending_surrogate_ = trail_surrogate;
1076 FlushPendingSurrogate(); 1087 FlushPendingSurrogate();
1077 } 1088 }
1078 } 1089 }
1079 1090
1080 1091
1081 void RegExpBuilder::FlushPendingSurrogate() { 1092 void RegExpBuilder::FlushPendingSurrogate() {
1082 if (pending_surrogate_ != kNoPendingSurrogate) { 1093 if (pending_surrogate_ != kNoPendingSurrogate) {
1083 // Use character class to desugar lone surrogate matching. 1094 DCHECK(unicode());
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( 1095 uc32 c = pending_surrogate_;
1085 CharacterRange::List(zone(),
1086 CharacterRange::Singleton(pending_surrogate_)),
1087 false);
1088 pending_surrogate_ = kNoPendingSurrogate; 1096 pending_surrogate_ = kNoPendingSurrogate;
1089 DCHECK(unicode()); 1097 AddCharacterClass(c);
1090 AddCharacterClass(cc);
1091 } 1098 }
1092 } 1099 }
1093 1100
1094 1101
1095 void RegExpBuilder::FlushCharacters() { 1102 void RegExpBuilder::FlushCharacters() {
1096 FlushPendingSurrogate(); 1103 FlushPendingSurrogate();
1097 pending_empty_ = false; 1104 pending_empty_ = false;
1098 if (characters_ != NULL) { 1105 if (characters_ != NULL) {
1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); 1106 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());
1100 characters_ = NULL; 1107 characters_ = NULL;
(...skipping 15 matching lines...) Expand all
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); 1123 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());
1117 terms_.Add(text, zone()); 1124 terms_.Add(text, zone());
1118 } 1125 }
1119 text_.Clear(); 1126 text_.Clear();
1120 } 1127 }
1121 1128
1122 1129
1123 void RegExpBuilder::AddCharacter(uc16 c) { 1130 void RegExpBuilder::AddCharacter(uc16 c) {
1124 FlushPendingSurrogate(); 1131 FlushPendingSurrogate();
1125 pending_empty_ = false; 1132 pending_empty_ = false;
1126 if (characters_ == NULL) { 1133 if (NeedsDesugaringForIgnoreCase(c)) {
1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); 1134 AddCharacterClass(c);
1135 } else {
1136 if (characters_ == NULL) {
1137 characters_ = new (zone()) ZoneList<uc16>(4, zone());
1138 }
1139 characters_->Add(c, zone());
1140 LAST(ADD_CHAR);
1128 } 1141 }
1129 characters_->Add(c, zone());
1130 LAST(ADD_CHAR);
1131 } 1142 }
1132 1143
1133 1144
1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { 1145 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {
1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { 1146 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {
1136 DCHECK(unicode()); 1147 DCHECK(unicode());
1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); 1148 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));
1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); 1149 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));
1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { 1150 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {
1140 AddLeadSurrogate(c); 1151 AddLeadSurrogate(c);
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { 1152 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {
1142 AddTrailSurrogate(c); 1153 AddTrailSurrogate(c);
1143 } else { 1154 } else {
1144 AddCharacter(static_cast<uc16>(c)); 1155 AddCharacter(static_cast<uc16>(c));
1145 } 1156 }
1146 } 1157 }
1147 1158
1148 1159
1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } 1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }
1150 1161
1151 1162
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { 1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { 1164 if (NeedsDesugaringForUnicode(cc)) {
1154 // In unicode mode, character class needs to be desugared, so it 1165 // In unicode mode, character class needs to be desugared, so it
1155 // must be a standalone term instead of being part of a RegExpText. 1166 // must be a standalone term instead of being part of a RegExpText.
1156 AddTerm(cc); 1167 AddTerm(cc);
1157 } else { 1168 } else {
1158 AddAtom(cc); 1169 AddAtom(cc);
1159 } 1170 }
1160 } 1171 }
1161 1172
1162 1173
1174 void RegExpBuilder::AddCharacterClass(uc32 c) {
1175 AddCharacterClass(new (zone()) RegExpCharacterClass(
1176 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));
1177 }
1178
1179
1163 void RegExpBuilder::AddAtom(RegExpTree* term) { 1180 void RegExpBuilder::AddAtom(RegExpTree* term) {
1164 if (term->IsEmpty()) { 1181 if (term->IsEmpty()) {
1165 AddEmpty(); 1182 AddEmpty();
1166 return; 1183 return;
1167 } 1184 }
1168 if (term->IsTextElement()) { 1185 if (term->IsTextElement()) {
1169 FlushCharacters(); 1186 FlushCharacters();
1170 text_.Add(term, zone()); 1187 text_.Add(term, zone());
1171 } else { 1188 } else {
1172 FlushText(); 1189 FlushText();
(...skipping 30 matching lines...) Expand all
1203 alternative = terms_.last(); 1220 alternative = terms_.last();
1204 } else { 1221 } else {
1205 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); 1222 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));
1206 } 1223 }
1207 alternatives_.Add(alternative, zone()); 1224 alternatives_.Add(alternative, zone());
1208 terms_.Clear(); 1225 terms_.Clear();
1209 LAST(ADD_NONE); 1226 LAST(ADD_NONE);
1210 } 1227 }
1211 1228
1212 1229
1230 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {
1231 if (!unicode()) return false;
1232 switch (cc->standard_type()) {
1233 case 's': // white space
1234 case 'w': // ASCII word character
1235 case 'd': // ASCII digit
1236 return false; // These characters do not need desugaring.
1237 default:
1238 break;
1239 }
1240 ZoneList<CharacterRange>* ranges = cc->ranges(zone());
1241 CharacterRange::Canonicalize(ranges);
1242 for (int i = ranges->length() - 1; i >= 0; i--) {
1243 uc32 from = ranges->at(i).from();
1244 uc32 to = ranges->at(i).to();
1245 // Check for non-BMP characters.
1246 if (to >= kNonBmpStart) return true;
1247 // Check for lone surrogates.
1248 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;
1249 }
1250 return false;
1251 }
1252
1253
1254 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {
1255 #ifdef V8_I18N_SUPPORT
1256 if (unicode() && ignore_case()) {
1257 USet* set = uset_open(c, c);
1258 uset_closeOver(set, USET_CASE_INSENSITIVE);
1259 uset_removeAllStrings(set);
1260 bool result = uset_size(set) > 1;
1261 uset_close(set);
1262 return result;
1263 }
1264 // In the case where ICU is not included, we act as if the unicode flag is
1265 // not set, and do not desugar.
1266 #endif // V8_I18N_SUPPORT
1267 return false;
1268 }
1269
1270
1213 RegExpTree* RegExpBuilder::ToRegExp() { 1271 RegExpTree* RegExpBuilder::ToRegExp() {
1214 FlushTerms(); 1272 FlushTerms();
1215 int num_alternatives = alternatives_.length(); 1273 int num_alternatives = alternatives_.length();
1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); 1274 if (num_alternatives == 0) return new (zone()) RegExpEmpty();
1217 if (num_alternatives == 1) return alternatives_.last(); 1275 if (num_alternatives == 1) return alternatives_.last();
1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); 1276 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));
1219 } 1277 }
1220 1278
1221 1279
1222 void RegExpBuilder::AddQuantifierToAtom( 1280 void RegExpBuilder::AddQuantifierToAtom(
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after
1261 UNREACHABLE(); 1319 UNREACHABLE();
1262 return; 1320 return;
1263 } 1321 }
1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), 1322 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),
1265 zone()); 1323 zone());
1266 LAST(ADD_TERM); 1324 LAST(ADD_TERM);
1267 } 1325 }
1268 1326
1269 } // namespace internal 1327 } // namespace internal
1270 } // namespace v8 1328 } // namespace v8
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698