OLD | NEW |
---|---|
1 // Copyright 2016 the V8 project authors. All rights reserved. | 1 // Copyright 2016 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 #include "src/regexp/regexp-parser.h" | 5 #include "src/regexp/regexp-parser.h" |
6 | 6 |
7 #include "src/char-predicates-inl.h" | 7 #include "src/char-predicates-inl.h" |
8 #include "src/factory.h" | 8 #include "src/factory.h" |
9 #include "src/isolate.h" | 9 #include "src/isolate.h" |
10 #include "src/objects-inl.h" | 10 #include "src/objects-inl.h" |
11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
12 #include "src/utils.h" | 12 #include "src/utils.h" |
13 | 13 |
14 #ifdef V8_I18N_SUPPORT | |
15 #include "unicode/uset.h" | |
16 #endif // V8_I18N_SUPPORT | |
17 | |
14 namespace v8 { | 18 namespace v8 { |
15 namespace internal { | 19 namespace internal { |
16 | 20 |
17 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, | 21 RegExpParser::RegExpParser(FlatStringReader* in, Handle<String>* error, |
18 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) | 22 JSRegExp::Flags flags, Isolate* isolate, Zone* zone) |
19 : isolate_(isolate), | 23 : isolate_(isolate), |
20 zone_(zone), | 24 zone_(zone), |
21 error_(error), | 25 error_(error), |
22 captures_(NULL), | 26 captures_(NULL), |
23 in_(in), | 27 in_(in), |
(...skipping 1033 matching lines...) | |
1057 FlushPendingSurrogate(); | 1061 FlushPendingSurrogate(); |
1058 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. | 1062 // Hold onto the lead surrogate, waiting for a trail surrogate to follow. |
1059 pending_surrogate_ = lead_surrogate; | 1063 pending_surrogate_ = lead_surrogate; |
1060 } | 1064 } |
1061 | 1065 |
1062 | 1066 |
1063 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { | 1067 void RegExpBuilder::AddTrailSurrogate(uc16 trail_surrogate) { |
1064 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); | 1068 DCHECK(unibrow::Utf16::IsTrailSurrogate(trail_surrogate)); |
1065 if (pending_surrogate_ != kNoPendingSurrogate) { | 1069 if (pending_surrogate_ != kNoPendingSurrogate) { |
1066 uc16 lead_surrogate = pending_surrogate_; | 1070 uc16 lead_surrogate = pending_surrogate_; |
1071 pending_surrogate_ = kNoPendingSurrogate; | |
1067 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); | 1072 DCHECK(unibrow::Utf16::IsLeadSurrogate(lead_surrogate)); |
1068 ZoneList<uc16> surrogate_pair(2, zone()); | 1073 uc32 combined = |
1069 surrogate_pair.Add(lead_surrogate, zone()); | 1074 unibrow::Utf16::CombineSurrogatePair(lead_surrogate, trail_surrogate); |
1070 surrogate_pair.Add(trail_surrogate, zone()); | 1075 if (NeedsDesugaringForIgnoreCase(combined)) { |
1071 RegExpAtom* atom = new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | 1076 AddCharacterClass(combined); |
1072 pending_surrogate_ = kNoPendingSurrogate; | 1077 } else { |
1073 AddAtom(atom); | 1078 ZoneList<uc16> surrogate_pair(2, zone()); |
1079 surrogate_pair.Add(lead_surrogate, zone()); | |
1080 surrogate_pair.Add(trail_surrogate, zone()); | |
1081 RegExpAtom* atom = | |
1082 new (zone()) RegExpAtom(surrogate_pair.ToConstVector()); | |
1083 AddAtom(atom); | |
1084 } | |
1074 } else { | 1085 } else { |
1075 pending_surrogate_ = trail_surrogate; | 1086 pending_surrogate_ = trail_surrogate; |
1076 FlushPendingSurrogate(); | 1087 FlushPendingSurrogate(); |
1077 } | 1088 } |
1078 } | 1089 } |
1079 | 1090 |
1080 | 1091 |
1081 void RegExpBuilder::FlushPendingSurrogate() { | 1092 void RegExpBuilder::FlushPendingSurrogate() { |
1082 if (pending_surrogate_ != kNoPendingSurrogate) { | 1093 if (pending_surrogate_ != kNoPendingSurrogate) { |
1083 // Use character class to desugar lone surrogate matching. | 1094 DCHECK(unicode()); |
1084 RegExpCharacterClass* cc = new (zone()) RegExpCharacterClass( | 1095 uc32 c = pending_surrogate_; |
1085 CharacterRange::List(zone(), | |
1086 CharacterRange::Singleton(pending_surrogate_)), | |
1087 false); | |
1088 pending_surrogate_ = kNoPendingSurrogate; | 1096 pending_surrogate_ = kNoPendingSurrogate; |
1089 DCHECK(unicode()); | 1097 AddCharacterClass(c); |
1090 AddCharacterClass(cc); | |
1091 } | 1098 } |
1092 } | 1099 } |
1093 | 1100 |
1094 | 1101 |
1095 void RegExpBuilder::FlushCharacters() { | 1102 void RegExpBuilder::FlushCharacters() { |
1096 FlushPendingSurrogate(); | 1103 FlushPendingSurrogate(); |
1097 pending_empty_ = false; | 1104 pending_empty_ = false; |
1098 if (characters_ != NULL) { | 1105 if (characters_ != NULL) { |
1099 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); | 1106 RegExpTree* atom = new (zone()) RegExpAtom(characters_->ToConstVector()); |
1100 characters_ = NULL; | 1107 characters_ = NULL; |
(...skipping 15 matching lines...) | |
1116 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); | 1123 for (int i = 0; i < num_text; i++) text_.Get(i)->AppendToText(text, zone()); |
1117 terms_.Add(text, zone()); | 1124 terms_.Add(text, zone()); |
1118 } | 1125 } |
1119 text_.Clear(); | 1126 text_.Clear(); |
1120 } | 1127 } |
1121 | 1128 |
1122 | 1129 |
1123 void RegExpBuilder::AddCharacter(uc16 c) { | 1130 void RegExpBuilder::AddCharacter(uc16 c) { |
1124 FlushPendingSurrogate(); | 1131 FlushPendingSurrogate(); |
1125 pending_empty_ = false; | 1132 pending_empty_ = false; |
1126 if (characters_ == NULL) { | 1133 if (NeedsDesugaringForIgnoreCase(c)) { |
1127 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | 1134 AddCharacterClass(c); |
1135 } else { | |
1136 if (characters_ == NULL) { | |
1137 characters_ = new (zone()) ZoneList<uc16>(4, zone()); | |
1138 } | |
1139 characters_->Add(c, zone()); | |
1140 LAST(ADD_CHAR); | |
1128 } | 1141 } |
1129 characters_->Add(c, zone()); | |
1130 LAST(ADD_CHAR); | |
1131 } | 1142 } |
1132 | 1143 |
1133 | 1144 |
1134 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { | 1145 void RegExpBuilder::AddUnicodeCharacter(uc32 c) { |
1135 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { | 1146 if (c > unibrow::Utf16::kMaxNonSurrogateCharCode) { |
1136 DCHECK(unicode()); | 1147 DCHECK(unicode()); |
1137 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); | 1148 AddLeadSurrogate(unibrow::Utf16::LeadSurrogate(c)); |
1138 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); | 1149 AddTrailSurrogate(unibrow::Utf16::TrailSurrogate(c)); |
1139 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { | 1150 } else if (unicode() && unibrow::Utf16::IsLeadSurrogate(c)) { |
1140 AddLeadSurrogate(c); | 1151 AddLeadSurrogate(c); |
1141 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { | 1152 } else if (unicode() && unibrow::Utf16::IsTrailSurrogate(c)) { |
1142 AddTrailSurrogate(c); | 1153 AddTrailSurrogate(c); |
1143 } else { | 1154 } else { |
1144 AddCharacter(static_cast<uc16>(c)); | 1155 AddCharacter(static_cast<uc16>(c)); |
1145 } | 1156 } |
1146 } | 1157 } |
1147 | 1158 |
1148 | 1159 |
1149 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } | 1160 void RegExpBuilder::AddEmpty() { pending_empty_ = true; } |
1150 | 1161 |
1151 | 1162 |
1152 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { | 1163 void RegExpBuilder::AddCharacterClass(RegExpCharacterClass* cc) { |
1153 if (unicode() && cc->NeedsDesugaringForUnicode(zone())) { | 1164 if (NeedsDesugaringForUnicode(cc->ranges(zone()))) { |
erikcorry
2016/01/25 10:26:37
It's a bit unfortunate that for all the standard […]
Yang
2016/01/25 11:46:37
Fixed.
| |
1154 // In unicode mode, character class needs to be desugared, so it | 1165 // In unicode mode, character class needs to be desugared, so it |
1155 // must be a standalone term instead of being part of a RegExpText. | 1166 // must be a standalone term instead of being part of a RegExpText. |
1156 AddTerm(cc); | 1167 AddTerm(cc); |
1157 } else { | 1168 } else { |
1158 AddAtom(cc); | 1169 AddAtom(cc); |
1159 } | 1170 } |
1160 } | 1171 } |
1161 | 1172 |
1162 | 1173 |
1174 void RegExpBuilder::AddCharacterClass(uc32 c) { | |
1175 AddCharacterClass(new (zone()) RegExpCharacterClass( | |
1176 CharacterRange::List(zone(), CharacterRange::Singleton(c)), false)); | |
1177 } | |
1178 | |
1179 | |
1163 void RegExpBuilder::AddAtom(RegExpTree* term) { | 1180 void RegExpBuilder::AddAtom(RegExpTree* term) { |
1164 if (term->IsEmpty()) { | 1181 if (term->IsEmpty()) { |
1165 AddEmpty(); | 1182 AddEmpty(); |
1166 return; | 1183 return; |
1167 } | 1184 } |
1168 if (term->IsTextElement()) { | 1185 if (term->IsTextElement()) { |
1169 FlushCharacters(); | 1186 FlushCharacters(); |
1170 text_.Add(term, zone()); | 1187 text_.Add(term, zone()); |
1171 } else { | 1188 } else { |
1172 FlushText(); | 1189 FlushText(); |
(...skipping 30 matching lines...) | |
1203 alternative = terms_.last(); | 1220 alternative = terms_.last(); |
1204 } else { | 1221 } else { |
1205 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); | 1222 alternative = new (zone()) RegExpAlternative(terms_.GetList(zone())); |
1206 } | 1223 } |
1207 alternatives_.Add(alternative, zone()); | 1224 alternatives_.Add(alternative, zone()); |
1208 terms_.Clear(); | 1225 terms_.Clear(); |
1209 LAST(ADD_NONE); | 1226 LAST(ADD_NONE); |
1210 } | 1227 } |
1211 | 1228 |
1212 | 1229 |
1230 bool RegExpBuilder::NeedsDesugaringForUnicode( | |
1231 ZoneList<CharacterRange>* ranges) { | |
1232 if (!unicode()) return false; | |
1233 CharacterRange::Canonicalize(ranges); | |
1234 static const uc32 kLeadSurrogateStart = 0xd800; | |
1235 static const uc32 kTrailSurrogateEnd = 0xdfff; | |
1236 static const uc32 kNonBmpStart = 0x10000; | |
erikcorry
2016/01/25 10:26:37
Don't these constants already exist?
Yang
2016/01/25 11:46:37
Done.
| |
1237 for (int i = ranges->length() - 1; i >= 0; i--) { | |
1238 uc32 from = ranges->at(i).from(); | |
1239 uc32 to = ranges->at(i).to(); | |
1240 // Check for non-BMP characters. | |
1241 if (to >= kNonBmpStart) return true; | |
1242 // Check for lone surrogates. | |
1243 if (from <= kTrailSurrogateEnd && to >= kLeadSurrogateStart) return true; | |
1244 } | |
1245 return false; | |
1246 } | |
1247 | |
1248 | |
1249 bool RegExpBuilder::NeedsDesugaringForIgnoreCase(uc32 c) { | |
1250 #ifdef V8_I18N_SUPPORT | |
erikcorry
2016/01/25 10:26:37
Perhaps a comment explaining what happens in the n[…]
Yang
2016/01/25 11:46:37
Done.
| |
1251 // Ignore-case for ASCII characters is handled at a lower layer. | |
1252 if (c < 128) return false; | |
erikcorry
2016/01/25 10:26:37
There's a named constant for this.
Yang
2016/01/25 11:46:37
Done.
| |
1253 if (unicode() && ignore_case()) { | |
1254 USet* set = uset_open(c, c); | |
1255 uset_closeOver(set, USET_CASE_INSENSITIVE); | |
1256 uset_removeAllStrings(set); | |
1257 bool result = uset_size(set) > 1; | |
1258 uset_close(set); | |
1259 return result; | |
1260 } | |
1261 #endif // V8_I18N_SUPPORT | |
1262 return false; | |
1263 } | |
1264 | |
1265 | |
1213 RegExpTree* RegExpBuilder::ToRegExp() { | 1266 RegExpTree* RegExpBuilder::ToRegExp() { |
1214 FlushTerms(); | 1267 FlushTerms(); |
1215 int num_alternatives = alternatives_.length(); | 1268 int num_alternatives = alternatives_.length(); |
1216 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); | 1269 if (num_alternatives == 0) return new (zone()) RegExpEmpty(); |
1217 if (num_alternatives == 1) return alternatives_.last(); | 1270 if (num_alternatives == 1) return alternatives_.last(); |
1218 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); | 1271 return new (zone()) RegExpDisjunction(alternatives_.GetList(zone())); |
1219 } | 1272 } |
1220 | 1273 |
1221 | 1274 |
1222 void RegExpBuilder::AddQuantifierToAtom( | 1275 void RegExpBuilder::AddQuantifierToAtom( |
(...skipping 38 matching lines...) | |
1261 UNREACHABLE(); | 1314 UNREACHABLE(); |
1262 return; | 1315 return; |
1263 } | 1316 } |
1264 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), | 1317 terms_.Add(new (zone()) RegExpQuantifier(min, max, quantifier_type, atom), |
1265 zone()); | 1318 zone()); |
1266 LAST(ADD_TERM); | 1319 LAST(ADD_TERM); |
1267 } | 1320 } |
1268 | 1321 |
1269 } // namespace internal | 1322 } // namespace internal |
1270 } // namespace v8 | 1323 } // namespace v8 |
OLD | NEW |