src/regexp/regexp-parser.cc - Issue 1599303002: [regexp] implement case-insensitive unicode regexps.

Side by Side Diff: src/regexp/regexp-parser.cc

Issue 1599303002: [regexp] implement case-insensitive unicode regexps. (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@unicodeclass

Patch Set: fixes Created 4 years, 10 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2016 the V8 project authors. All rights reserved.	1 // Copyright 2016 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/regexp/regexp-parser.h"	5 #include "src/regexp/regexp-parser.h"

6	6

7 #include "src/char-predicates-inl.h"	7 #include "src/char-predicates-inl.h"

8 #include "src/factory.h"	8 #include "src/factory.h"

9 #include "src/isolate.h"	9 #include "src/isolate.h"

10 #include "src/objects-inl.h"	10 #include "src/objects-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/utils.h"	12 #include "src/utils.h"

13	13

	14 #ifdef V8_I18N_SUPPORT

	15 #include "unicode/uset.h"

	16 #endif // V8_I18N_SUPPORT

	17

14 namespace v8 {	18 namespace v8 {

15 namespace internal {	19 namespace internal {

16	20

17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,	21 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error,

18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)	22 JSRegExp::Flags flags, Isolate* isolate, Zone* zone)

19 : isolate_(isolate),	23 : isolate_(isolate),

20 zone_(zone),	24 zone_(zone),

21 error_(error),	25 error_(error),

22 captures_(NULL),	26 captures_(NULL),

23 in_(in),	27 in_(in),

(...skipping 1033 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1057 FlushPendingSurrogate();	1061 FlushPendingSurrogate();

1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.	1062 // Hold onto the lead surrogate, waiting for a trail surrogate to follow.

1059 pending_surrogate_ = lead_surrogate;	1063 pending_surrogate_ = lead_surrogate;

1060 }	1064 }

1061	1065

1062	1066

1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {	1067 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) {

1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));	1068 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate));

1065 if (pending_surrogate_ != kNoPendingSurrogate) {	1069 if (pending_surrogate_ != kNoPendingSurrogate) {

1066 uc16 lead_surrogate = pending_surrogate_;	1070 uc16 lead_surrogate = pending_surrogate_;

	1071 pending_surrogate_ = kNoPendingSurrogate;

1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));	1072 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate));

1068 ZoneList<uc16> surrogate_pair(2, zone());	1073 uc32 combined =

1069 surrogate_pair.Add(lead_surrogate, zone());	1074 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate);

1070 surrogate_pair.Add(trail_surrogate, zone());	1075 if (NeedsDesugaringForIgnoreCase(combined)) {

1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector());	1076 AddCharacterClass(combined);

1072 pending_surrogate_ = kNoPendingSurrogate;	1077 } else {

1073 AddAtom(atom);	1078 ZoneList<uc16> surrogate_pair(2, zone());

	1079 surrogate_pair.Add(lead_surrogate, zone());

	1080 surrogate_pair.Add(trail_surrogate, zone());

	1081 RegExpAtom* atom =

	1082 new (zone()) RegExpAtom(surrogate_pair.ToConstVector());

	1083 AddAtom(atom);

	1084 }

1074 } else {	1085 } else {

1075 pending_surrogate_ = trail_surrogate;	1086 pending_surrogate_ = trail_surrogate;

1076 FlushPendingSurrogate();	1087 FlushPendingSurrogate();

1077 }	1088 }

1078 }	1089 }

1079	1090

1080	1091

1081 void RegExpBuilder::FlushPendingSurrogate() {	1092 void RegExpBuilder::FlushPendingSurrogate() {

1082 if (pending_surrogate_ != kNoPendingSurrogate) {	1093 if (pending_surrogate_ != kNoPendingSurrogate) {

1083 // Use character class to desugar lone surrogate matching.	1094 DCHECK(unicode());

1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass(	1095 uc32 c = pending_surrogate_;

1085 CharacterRange::List(zone(),

1086 CharacterRange::Singleton(pending_surrogate_)),

1087 false);

1088 pending_surrogate_ = kNoPendingSurrogate;	1096 pending_surrogate_ = kNoPendingSurrogate;

1089 DCHECK(unicode());	1097 AddCharacterClass(c);

1090 AddCharacterClass(cc);

1091 }	1098 }

1092 }	1099 }

1093	1100

1094	1101

1095 void RegExpBuilder::FlushCharacters() {	1102 void RegExpBuilder::FlushCharacters() {

1096 FlushPendingSurrogate();	1103 FlushPendingSurrogate();

1097 pending_empty_ = false;	1104 pending_empty_ = false;

1098 if (characters_ != NULL) {	1105 if (characters_ != NULL) {

1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());	1106 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector());

1100 characters_ = NULL;	1107 characters_ = NULL;

(...skipping 15 matching lines...) Expand all Loading...
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());	1123 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone());

1117 terms_.Add(text, zone());	1124 terms_.Add(text, zone());

1118 }	1125 }

1119 text_.Clear();	1126 text_.Clear();

1120 }	1127 }

1121	1128

1122	1129

1123 void RegExpBuilder::AddCharacter(uc16 c) {	1130 void RegExpBuilder::AddCharacter(uc16 c) {

1124 FlushPendingSurrogate();	1131 FlushPendingSurrogate();

1125 pending_empty_ = false;	1132 pending_empty_ = false;

1126 if (characters_ == NULL) {	1133 if (NeedsDesugaringForIgnoreCase(c)) {

1127 characters_ = new (zone()) ZoneList<uc16>(4, zone());	1134 AddCharacterClass(c);

	1135 } else {

	1136 if (characters_ == NULL) {

	1137 characters_ = new (zone()) ZoneList<uc16>(4, zone());

	1138 }

	1139 characters_->Add(c, zone());

	1140 LAST(ADD_CHAR);

1128 }	1141 }

1129 characters_->Add(c, zone());

1130 LAST(ADD_CHAR);

1131 }	1142 }

1132	1143

1133	1144

1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {	1145 void RegExpBuilder::AddUnicodeCharacter(uc32 c) {

1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {	1146 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) {

1136 DCHECK(unicode());	1147 DCHECK(unicode());

1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));	1148 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c));

1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));	1149 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c));

1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {	1150 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) {

1140 AddLeadSurrogate(c);	1151 AddLeadSurrogate(c);

1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {	1152 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) {

1142 AddTrailSurrogate(c);	1153 AddTrailSurrogate(c);

1143 } else {	1154 } else {

1144 AddCharacter(static_cast<uc16>(c));	1155 AddCharacter(static_cast<uc16>(c));

1145 }	1156 }

1146 }	1157 }

1147	1158

1148	1159

1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }	1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; }

1150	1161

1151	1162

1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {	1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) {

1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) {	1164 if (NeedsDesugaringForUnicode(cc)) {

1154 // In unicode mode, character class needs to be desugared, so it	1165 // In unicode mode, character class needs to be desugared, so it

1155 // must be a standalone term instead of being part of a RegExpText.	1166 // must be a standalone term instead of being part of a RegExpText.

1156 AddTerm(cc);	1167 AddTerm(cc);

1157 } else {	1168 } else {

1158 AddAtom(cc);	1169 AddAtom(cc);

1159 }	1170 }

1160 }	1171 }

1161	1172

1162	1173

	1174 void RegExpBuilder::AddCharacterClass(uc32 c) {

	1175 AddCharacterClass(new (zone()) RegExpCharacterClass(

	1176 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false));

	1177 }

	1178

	1179

1163 void RegExpBuilder::AddAtom(RegExpTree* term) {	1180 void RegExpBuilder::AddAtom(RegExpTree* term) {

1164 if (term->IsEmpty()) {	1181 if (term->IsEmpty()) {

1165 AddEmpty();	1182 AddEmpty();

1166 return;	1183 return;

1167 }	1184 }

1168 if (term->IsTextElement()) {	1185 if (term->IsTextElement()) {

1169 FlushCharacters();	1186 FlushCharacters();

1170 text_.Add(term, zone());	1187 text_.Add(term, zone());

1171 } else {	1188 } else {

1172 FlushText();	1189 FlushText();

(...skipping 30 matching lines...) Expand all Loading...
1203 alternative = terms_.last();	1220 alternative = terms_.last();

1204 } else {	1221 } else {

1205 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));	1222 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone()));

1206 }	1223 }

1207 alternatives_.Add(alternative, zone());	1224 alternatives_.Add(alternative, zone());

1208 terms_.Clear();	1225 terms_.Clear();

1209 LAST(ADD_NONE);	1226 LAST(ADD_NONE);

1210 }	1227 }

1211	1228

1212	1229

	1230 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) {

	1231 if (!unicode()) return false;

	1232 switch (cc->standard_type()) {

	1233 case 's': // white space

	1234 case 'w': // ASCII word character

	1235 case 'd': // ASCII digit

	1236 return false; // These characters do not need desugaring.

	1237 default:

	1238 break;

	1239 }

	1240 ZoneList<CharacterRange>* ranges = cc->ranges(zone());

	1241 CharacterRange::Canonicalize(ranges);

	1242 for (int i = ranges->length() - 1; i >= 0; i--) {

	1243 uc32 from = ranges->at(i).from();

	1244 uc32 to = ranges->at(i).to();

	1245 // Check for non-BMP characters.

	1246 if (to >= kNonBmpStart) return true;

	1247 // Check for lone surrogates.

	1248 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true;

	1249 }

	1250 return false;

	1251 }

	1252

	1253

	1254 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) {

	1255 #ifdef V8_I18N_SUPPORT

	1256 if (unicode() && ignore_case()) {

	1257 USet* set = uset_open(c, c);

	1258 uset_closeOver(set, USET_CASE_INSENSITIVE);

	1259 uset_removeAllStrings(set);

	1260 bool result = uset_size(set) > 1;

	1261 uset_close(set);

	1262 return result;

	1263 }

	1264 // In the case where ICU is not included, we act as if the unicode flag is

	1265 // not set, and do not desugar.

	1266 #endif // V8_I18N_SUPPORT

	1267 return false;

	1268 }

	1269

	1270

1213 RegExpTree* RegExpBuilder::ToRegExp() {	1271 RegExpTree* RegExpBuilder::ToRegExp() {

1214 FlushTerms();	1272 FlushTerms();

1215 int num_alternatives = alternatives_.length();	1273 int num_alternatives = alternatives_.length();

1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty();	1274 if (num_alternatives == 0) return new (zone()) RegExpEmpty();

1217 if (num_alternatives == 1) return alternatives_.last();	1275 if (num_alternatives == 1) return alternatives_.last();

1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));	1276 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone()));

1219 }	1277 }

1220	1278

1221	1279

1222 void RegExpBuilder::AddQuantifierToAtom(	1280 void RegExpBuilder::AddQuantifierToAtom(

(...skipping 38 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1261 UNREACHABLE();	1319 UNREACHABLE();

1262 return;	1320 return;

1263 }	1321 }

1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),	1322 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom),

1265 zone());	1323 zone());

1266 LAST(ADD_TERM);	1324 LAST(ADD_TERM);

1267 }	1325 }

1268	1326

1269 } // namespace internal	1327 } // namespace internal

1270 } // namespace v8	1328 } // namespace v8

OLD	NEW

« src/regexp/jsregexp.cc ('K') | « src/regexp/regexp-parser.h ('k') | src/regexp/x64/regexp-macro-assembler-x64.h » ('j') | no next file with comments »