OLD | NEW |
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
12 #include "src/utils.h" | 12 #include "src/utils.h" |
13 | 13 |
| 14 #ifdef V8_I18N_SUPPORT |
| 15 #include "unicode/uset.h" |
| 16 #endif // V8_I18N_SUPPORT |
| 17 |
14 namespace v8 { | 18 namespace v8 { |
15 namespace internal { | 19 namespace internal { |
16 | 20 |
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 21 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) | 22 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
19 : isolate_(isolate), | 23 : isolate_(isolate), |
20 zone_(zone), | 24 zone_(zone), |
21 error_(error), | 25 error_(error), |
22 captures_(NULL), | 26 captures_(NULL), |
23 in_(in), | 27 in_(in), |
(...skipping 1033 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1057 FlushPendingSurrogate(); | 1061 FlushPendingSurrogate(); |
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. | 1062 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
1059 pending_surrogate_ = lead_surrogate; | 1063 pending_surrogate_ = lead_surrogate; |
1060 } | 1064 } |
1061 | 1065 |
1062 | 1066 |
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { | 1067 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); | 1068 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
1065 if (pending_surrogate_ != kNoPendingSurrogate) { | 1069 if (pending_surrogate_ != kNoPendingSurrogate) { |
1066 uc16 lead_surrogate = pending_surrogate_; | 1070 uc16 lead_surrogate = pending_surrogate_; |
| 1071 pending_surrogate_ = kNoPendingSurrogate; |
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | 1072 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
1068 ZoneList<uc16> surrogate_pair(2, zone()); | 1073 uc32 combined = |
1069 surrogate_pair.Add(lead_surrogate, zone()); | 1074 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
1070 surrogate_pair.Add(trail_surrogate, zone()); | 1075 if (NeedsDesugaringForIgnoreCase(combined)) { |
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 1076 AddCharacterClass(combined); |
1072 pending_surrogate_ = kNoPendingSurrogate; | 1077 } else { |
1073 AddAtom(atom); | 1078 ZoneList<uc16> surrogate_pair(2, zone()); |
| 1079 surrogate_pair.Add(lead_surrogate, zone()); |
| 1080 surrogate_pair.Add(trail_surrogate, zone()); |
| 1081 RegExpAtom* atom = |
| 1082 new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); |
| 1083 AddAtom(atom); |
| 1084 } |
1074 } else { | 1085 } else { |
1075 pending_surrogate_ = trail_surrogate; | 1086 pending_surrogate_ = trail_surrogate; |
1076 FlushPendingSurrogate(); | 1087 FlushPendingSurrogate(); |
1077 } | 1088 } |
1078 } | 1089 } |
1079 | 1090 |
1080 | 1091 |
1081 void RegExpBuilder::FlushPendingSurrogate() { | 1092 void RegExpBuilder::FlushPendingSurrogate() { |
1082 if (pending_surrogate_ != kNoPendingSurrogate) { | 1093 if (pending_surrogate_ != kNoPendingSurrogate) { |
1083 // Use character class to desugar lone surrogate matching. | 1094 DCHECK(unicode()); |
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( | 1095 uc32 c = pending_surrogate_; |
1085 CharacterRange::List(zone(), | |
1086 CharacterRange::Singleton(pending_surrogate_)), | |
1087 false); | |
1088 pending_surrogate_ = kNoPendingSurrogate; | 1096 pending_surrogate_ = kNoPendingSurrogate; |
1089 DCHECK(unicode()); | 1097 AddCharacterClass(c); |
1090 AddCharacterClass(cc); | |
1091 } | 1098 } |
1092 } | 1099 } |
1093 | 1100 |
1094 | 1101 |
1095 void RegExpBuilder::FlushCharacters() { | 1102 void RegExpBuilder::FlushCharacters() { |
1096 FlushPendingSurrogate(); | 1103 FlushPendingSurrogate(); |
1097 pending_empty_ = false; | 1104 pending_empty_ = false; |
1098 if (characters_ != NULL) { | 1105 if (characters_ != NULL) { |
1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1106 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
1100 characters_ = NULL; | 1107 characters_ = NULL; |
(...skipping 15 matching lines...) Expand all Loading... |
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1123 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
1117 terms_.Add(text, zone()); | 1124 terms_.Add(text, zone()); |
1118 } | 1125 } |
1119 text_.Clear(); | 1126 text_.Clear(); |
1120 } | 1127 } |
1121 | 1128 |
1122 | 1129 |
1123 void RegExpBuilder::AddCharacter(uc16 c) { | 1130 void RegExpBuilder::AddCharacter(uc16 c) { |
1124 FlushPendingSurrogate(); | 1131 FlushPendingSurrogate(); |
1125 pending_empty_ = false; | 1132 pending_empty_ = false; |
1126 if (characters_ == NULL) { | 1133 if (NeedsDesugaringForIgnoreCase(c)) { |
1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1134 AddCharacterClass(c); |
| 1135 } else { |
| 1136 if (characters_ == NULL) { |
| 1137 characters_ = new (zone()) ZoneList<uc16>(4, zone()); |
| 1138 } |
| 1139 characters_->Add(c, zone()); |
| 1140 LAST(ADD_CHAR); |
1128 } | 1141 } |
1129 characters_->Add(c, zone()); | |
1130 LAST(ADD_CHAR); | |
1131 } | 1142 } |
1132 | 1143 |
1133 | 1144 |
1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1145 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1146 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
1136 DCHECK(unicode()); | 1147 DCHECK(unicode()); |
1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); | 1148 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1149 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1150 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1140 AddLeadSurrogate(c); | 1151 AddLeadSurrogate(c); |
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1152 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
1142 AddTrailSurrogate(c); | 1153 AddTrailSurrogate(c); |
1143 } else { | 1154 } else { |
1144 AddCharacter(static_cast<uc16>(c)); | 1155 AddCharacter(static_cast<uc16>(c)); |
1145 } | 1156 } |
1146 } | 1157 } |
1147 | 1158 |
1148 | 1159 |
1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1150 | 1161 |
1151 | 1162 |
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { | 1164 if (NeedsDesugaringForUnicode(cc)) { |
1154 // In unicode mode, character class needs to be desugared, so it | 1165 // In unicode mode, character class needs to be desugared, so it |
1155 // must be a standalone term instead of being part of a RegExpText. | 1166 // must be a standalone term instead of being part of a RegExpText. |
1156 AddTerm(cc); | 1167 AddTerm(cc); |
1157 } else { | 1168 } else { |
1158 AddAtom(cc); | 1169 AddAtom(cc); |
1159 } | 1170 } |
1160 } | 1171 } |
1161 | 1172 |
1162 | 1173 |
| 1174 void RegExpBuilder::AddCharacterClass(uc32 c) { |
| 1175 AddCharacterClass(new (zone()) RegExpCharacterClass( |
| 1176 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); |
| 1177 } |
| 1178 |
| 1179 |
1163 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1180 void RegExpBuilder::AddAtom(RegExpTree* term) { |
1164 if (term->IsEmpty()) { | 1181 if (term->IsEmpty()) { |
1165 AddEmpty(); | 1182 AddEmpty(); |
1166 return; | 1183 return; |
1167 } | 1184 } |
1168 if (term->IsTextElement()) { | 1185 if (term->IsTextElement()) { |
1169 FlushCharacters(); | 1186 FlushCharacters(); |
1170 text_.Add(term, zone()); | 1187 text_.Add(term, zone()); |
1171 } else { | 1188 } else { |
1172 FlushText(); | 1189 FlushText(); |
(...skipping 30 matching lines...) Expand all Loading... |
1203 alternative = terms_.last(); | 1220 alternative = terms_.last(); |
1204 } else { | 1221 } else { |
1205 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); | 1222 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); |
1206 } | 1223 } |
1207 alternatives_.Add(alternative, zone()); | 1224 alternatives_.Add(alternative, zone()); |
1208 terms_.Clear(); | 1225 terms_.Clear(); |
1209 LAST(ADD_NONE); | 1226 LAST(ADD_NONE); |
1210 } | 1227 } |
1211 | 1228 |
1212 | 1229 |
| 1230 bool RegExpBuilder::NeedsDesugaringForUnicode(RegExpCharacterClass* cc) { |
| 1231 if (!unicode()) return false; |
| 1232 switch (cc->standard_type()) { |
| 1233 case 's': // white space |
| 1234 case 'w': // ASCII word character |
| 1235 case 'd': // ASCII digit |
| 1236 return false; // These characters do not need desugaring. |
| 1237 default: |
| 1238 break; |
| 1239 } |
| 1240 ZoneList<CharacterRange>* ranges = cc->ranges(zone()); |
| 1241 CharacterRange::Canonicalize(ranges); |
| 1242 for (int i = ranges->length() - 1; i >= 0; i--) { |
| 1243 uc32 from = ranges->at(i).from(); |
| 1244 uc32 to = ranges->at(i).to(); |
| 1245 // Check for non-BMP characters. |
| 1246 if (to >= kNonBmpStart) return true; |
| 1247 // Check for lone surrogates. |
| 1248 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; |
| 1249 } |
| 1250 return false; |
| 1251 } |
| 1252 |
| 1253 |
| 1254 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { |
| 1255 #ifdef V8_I18N_SUPPORT |
| 1256 if (unicode() && ignore_case()) { |
| 1257 USet* set = uset_open(c, c); |
| 1258 uset_closeOver(set, USET_CASE_INSENSITIVE); |
| 1259 uset_removeAllStrings(set); |
| 1260 bool result = uset_size(set) > 1; |
| 1261 uset_close(set); |
| 1262 return result; |
| 1263 } |
| 1264 // In the case where ICU is not included, we act as if the unicode flag is |
| 1265 // not set, and do not desugar. |
| 1266 #endif // V8_I18N_SUPPORT |
| 1267 return false; |
| 1268 } |
| 1269 |
| 1270 |
1213 RegExpTree* RegExpBuilder::ToRegExp() { | 1271 RegExpTree* RegExpBuilder::ToRegExp() { |
1214 FlushTerms(); | 1272 FlushTerms(); |
1215 int num_alternatives = alternatives_.length(); | 1273 int num_alternatives = alternatives_.length(); |
1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1274 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1217 if (num_alternatives == 1) return alternatives_.last(); | 1275 if (num_alternatives == 1) return alternatives_.last(); |
1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1276 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1219 } | 1277 } |
1220 | 1278 |
1221 | 1279 |
1222 void RegExpBuilder::AddQuantifierToAtom( | 1280 void RegExpBuilder::AddQuantifierToAtom( |
(...skipping 38 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... |
1261 UNREACHABLE(); | 1319 UNREACHABLE(); |
1262 return; | 1320 return; |
1263 } | 1321 } |
1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1322 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1265 zone()); | 1323 zone()); |
1266 LAST(ADD_TERM); | 1324 LAST(ADD_TERM); |
1267 } | 1325 } |
1268 | 1326 |
1269 } // namespace internal | 1327 } // namespace internal |
1270 } // namespace v8 | 1328 } // namespace v8 |
OLD | NEW |