Chromium Code Reviews

| OLD | NEW |
|---|---|
| 1 // Copyright 2014 the V8 project authors. All rights reserved. | 1 // Copyright 2014 the V8 project authors. All rights reserved. |
| 2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
| 3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
| 4 | 4 |
| 5 #include "src/runtime/runtime-utils.h" | 5 #include "src/runtime/runtime-utils.h" |
| 6 | 6 |
| 7 #include "src/arguments.h" | 7 #include "src/arguments.h" |
| 8 #include "src/conversions-inl.h" | 8 #include "src/conversions-inl.h" |
| 9 #include "src/isolate-inl.h" | 9 #include "src/isolate-inl.h" |
| 10 #include "src/regexp/jsregexp-inl.h" | 10 #include "src/regexp/jsregexp-inl.h" |
| 11 #include "src/regexp/jsregexp.h" | 11 #include "src/regexp/jsregexp.h" |
| 12 #include "src/string-builder.h" | 12 #include "src/string-builder.h" |
| 13 #include "src/string-search.h" | 13 #include "src/string-search.h" |
| 14 | 14 |
| 15 #ifdef V8_I18N_SUPPORT | |
| 16 #include "unicode/uchar.h" | |
| 17 #include "unicode/ustring.h" | |
| 18 #endif | |
| 19 | |
| 15 namespace v8 { | 20 namespace v8 { |
| 16 namespace internal { | 21 namespace internal { |
| 17 | 22 |
| 18 | 23 |
| 19 // Perform string match of pattern on subject, starting at start index. | 24 // Perform string match of pattern on subject, starting at start index. |
| 20 // Caller must ensure that 0 <= start_index <= sub->length(), | 25 // Caller must ensure that 0 <= start_index <= sub->length(), |
| 21 // and should check that pat->length() + start_index <= sub->length(). | 26 // and should check that pat->length() + start_index <= sub->length(). |
| 22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat, | 27 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat, |
| 23 int start_index) { | 28 int start_index) { |
| 24 DCHECK(0 <= start_index); | 29 DCHECK(0 <= start_index); |
| (...skipping 1037 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1062 isolate, result, isolate->factory()->NewRawOneByteString(length)); | 1067 isolate, result, isolate->factory()->NewRawOneByteString(length)); |
| 1063 } else { | 1068 } else { |
| 1064 if (length < 0) length = -length; | 1069 if (length < 0) length = -length; |
| 1065 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( | 1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( |
| 1066 isolate, result, isolate->factory()->NewRawTwoByteString(length)); | 1071 isolate, result, isolate->factory()->NewRawTwoByteString(length)); |
| 1067 } | 1072 } |
| 1068 return ConvertCaseHelper(isolate, *s, *result, length, mapping); | 1073 return ConvertCaseHelper(isolate, *s, *result, length, mapping); |
| 1069 } | 1074 } |
| 1070 | 1075 |
| 1071 | 1076 |
| 1077 #ifdef V8_I18N_SUPPORT | |
| 1078 namespace { | |
|
Yang
2016/01/07 09:47:38
Can we move all of that into its own file?
| |
| 1079 | |
| 1080 typedef int32_t (*case_conversion_fn)(UChar* dest, int32_t destCapacity, | |
| 1081 const UChar* src, int32_t srcLength, | |
| 1082 const char* locale, | |
| 1083 UErrorCode* pErrorCode); | |
| 1084 | |
| 1085 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s, | |
| 1086 Isolate* isolate, | |
| 1087 case_conversion_fn fn) { | |
| 1088 int32_t length = s->length(); | |
| 1089 Handle<SeqTwoByteString> result = | |
| 1090 isolate->factory()->NewRawTwoByteString(length).ToHandleChecked(); | |
| 1091 | |
| 1092 // If we already have a UTF-16 string, use that, otherwise build it | |
| 1093 base::SmartArrayPointer<uc16> sap; | |
| 1094 const UChar* src; | |
| 1095 if (StringShape(*s).IsSequentialTwoByte()) { | |
| 1096 src = | |
| 1097 reinterpret_cast<const UChar*>(SeqTwoByteString::cast(*s)->GetChars()); | |
|
Yang
2016/01/07 09:47:38
Please use String::FlatContent to access the flat…
| |
| 1098 } else { | |
| 1099 sap = s->ToWideCString(ROBUST_STRING_TRAVERSAL); | |
| 1100 src = reinterpret_cast<const UChar*>(sap.get()); | |
| 1101 } | |
| 1102 | |
| 1103 UErrorCode error = U_ZERO_ERROR; | |
| 1104 int32_t real_length = fn(reinterpret_cast<UChar*>(result->GetChars()), length, | |
| 1105 src, length, "", &error); | |
| 1106 // If the lengths are equal, ICU will be unable to put the terminating \0 | |
| 1107 // but that's to be expected, as V8 strings are not null-terminated. | |
| 1108 if (error == U_STRING_NOT_TERMINATED_WARNING) { | |
| 1109 DCHECK_EQ(real_length, length); | |
| 1110 return Handle<String>(*result); | |
| 1111 } | |
| 1112 // These are the two expected error types from an oversized converted string | |
| 1113 // "buffer overflow" when needs more space; "success" when too long | |
| 1114 DCHECK(error == U_BUFFER_OVERFLOW_ERROR || error == U_ZERO_ERROR); | |
| 1115 | |
| 1116 error = U_ZERO_ERROR; | |
| 1117 result = | |
| 1118 isolate->factory()->NewRawTwoByteString(real_length).ToHandleChecked(); | |
| 1119 int32_t real_length_again = fn(reinterpret_cast<UChar*>(result->GetChars()), | |
|
Yang
2016/01/07 09:47:38
Instead of running the whole thing again, you coul…
| |
| 1120 real_length, src, length, "", &error); | |
|
Yang
2016/01/07 09:47:38
src can have moved already after the allocation of…
| |
| 1121 USE(real_length_again); // Shouldn't that be part of DCHECK_EQ? | |
| 1122 DCHECK_EQ(real_length, real_length_again); | |
| 1123 DCHECK_EQ(error, U_STRING_NOT_TERMINATED_WARNING); | |
| 1124 if (error != U_STRING_NOT_TERMINATED_WARNING) return s; | |
| 1125 return Handle<String>(*result); | |
| 1126 } | |
| 1127 | |
| 1128 | |
| 1129 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; } | |
| 1130 | |
| 1131 | |
| 1132 inline uint16_t ToASCIILower(uint16_t ch) { | |
| 1133 return ch | ((ch >= 'A' && ch <= 'Z') << 5); | |
| 1134 } | |
| 1135 | |
| 1136 | |
| 1137 inline uint16_t ToASCIIUpper(uint16_t ch) { | |
| 1138 return ch & ~((ch >= 'a' && ch <= 'z') << 5); | |
| 1139 } | |
| 1140 | |
| 1141 | |
| 1142 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s, | |
| 1143 Isolate* isolate) { | |
| 1144 // Note: This is a hot function in the Dromaeo benchmark, specifically the | |
| 1145 // no-op code path up through the first 'return' statement. | |
| 1146 | |
| 1147 int length = s->length(); | |
| 1148 // First scan the string for uppercase and non-ASCII characters: | |
| 1149 if (s->HasOnlyOneByteChars()) { | |
| 1150 unsigned first_index_to_lower = length; | |
| 1151 for (int index = 0; index < length; ++index) { | |
| 1152 // Blink specializes this path for one-byte strings, so it | |
| 1153 // does not need to do a generic get, but can do the equivalent | |
| 1154 // of SeqOneByteStringGet. | |
| 1155 uint16_t ch = s->Get(index); | |
|
Yang
2016/01/07 09:47:38
This can get pretty expensive if the string is a d…
| |
| 1156 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { | |
| 1157 first_index_to_lower = index; | |
| 1158 break; | |
| 1159 } | |
| 1160 } | |
| 1161 | |
| 1162 // Nothing to do if the string is all ASCII with no uppercase. | |
| 1163 if (first_index_to_lower == length) return s; | |
| 1164 | |
| 1165 // We depend here on the invariant that the length of a Latin1 | |
| 1166 // string is invariant under ToLowerCase, and the result always | |
| 1167 // fits in the Latin1 range (untrue for ToUpperCase, and might | |
| 1168 // be untrue in some locales, but this is the root locale) | |
| 1169 Handle<SeqOneByteString> result = | |
| 1170 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); | |
| 1171 // In Blink, this is a simple memcpy, but in V8, the path applies in | |
| 1172 // more cases. The optimization here is conditional on whether the | |
| 1173 // source is actually a simple 8-bit string (always true in Blink). | |
| 1174 // The broader condition lets us eliminate a bunch of duplicate code | |
| 1175 // which Blink had in a separate section below. | |
| 1176 if (StringShape(*s).IsSequentialOneByte()) { | |
|
Yang
2016/01/07 09:47:38
Let's not use StringShape. Just check for s->IsSeq…
| |
| 1177 // In this path, we can use the one-byte-specific Get, and | |
| 1178 // memcpy until the first_index_to_lower. | |
| 1179 SeqOneByteString* source = SeqOneByteString::cast(*s); | |
| 1180 memcpy(result->GetChars(), source->GetChars(), first_index_to_lower); | |
|
Yang
2016/01/07 09:47:38
We should just use CopyChars for uniformity here.
| |
| 1181 for (int index = first_index_to_lower; index < length; ++index) { | |
| 1182 uint16_t ch = source->SeqOneByteStringGet(index); | |
| 1183 result->SeqOneByteStringSet(index, | |
| 1184 V8_UNLIKELY(ch & ~0x7F) | |
| 1185 ? static_cast<uint16_t>(u_tolower(ch)) | |
| 1186 : ToASCIILower(ch)); | |
| 1187 } | |
| 1188 } else { | |
| 1189 // In this path, we start from the beginning of the string, | |
| 1190 // since there is nothing to memcpy from, and we have to | |
| 1191 // use the generic Get. Another option here would be to create | |
| 1192 // a two-byte string as output, and do a memcpy from that, | |
| 1193 // as Blink does, but there's also the ConsString case. | |
|
Yang
2016/01/07 09:47:38
As explained above, we should flatten upfront, so…
| |
| 1194 for (int index = 0; index < length; ++index) { | |
| 1195 uint16_t ch = s->Get(index); | |
| 1196 result->SeqOneByteStringSet(index, | |
| 1197 V8_UNLIKELY(ch & ~0x7F) | |
| 1198 ? static_cast<uint16_t>(u_tolower(ch)) | |
| 1199 : ToASCIILower(ch)); | |
| 1200 } | |
| 1201 } | |
| 1202 | |
| 1203 return Handle<String>(*result); | |
| 1204 } | |
| 1205 | |
| 1206 // Blink had an additional case here for ASCII 2-byte strings, but | |
| 1207 // that is subsumed by the above code (assuming there isn't a false | |
| 1208 // negative for HasOnlyOneByteChars). | |
| 1209 | |
| 1210 // Do a slower implementation for cases that include non-ASCII characters. | |
| 1211 return ConvertCaseICU(s, isolate, u_strToLower); | |
| 1212 } | |
| 1213 | |
| 1214 | |
| 1215 const uint16_t sharp_s = L'\u00DF'; | |
|
Yang
2016/01/07 09:47:38
0xDF should do the trick too, right?
| |
| 1216 | |
| 1217 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s, | |
| 1218 Isolate* isolate) { | |
| 1219 // This function could be optimized for no-op cases the way lower() is, | |
| 1220 // but in empirical testing, few actual calls to upper() are no-ops, so | |
| 1221 // it wouldn't be worth the extra time for pre-scanning. | |
| 1222 | |
| 1223 int32_t length = s->length(); | |
| 1224 | |
| 1225 if (s->HasOnlyOneByteChars()) { | |
| 1226 Handle<SeqOneByteString> result = | |
| 1227 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); | |
| 1228 | |
| 1229 // Do a faster loop for the case where all the characters are ASCII. | |
| 1230 uint16_t ored = 0; | |
| 1231 for (int index = 0; index < length; ++index) { | |
| 1232 uint16_t ch = s->Get(index); | |
|
Yang
2016/01/07 09:47:38
Again, please flatten upfront and use String::Flat…
| |
| 1233 ored |= ch; | |
| 1234 result->SeqOneByteStringSet(index, ToASCIIUpper(ch)); | |
| 1235 } | |
| 1236 if (!(ored & ~0x7F)) return Handle<String>(*result); | |
| 1237 | |
| 1238 // Do a slower implementation for cases that include non-ASCII Latin-1 | |
| 1239 // characters. | |
| 1240 int sharp_s_count = 0; | |
| 1241 | |
| 1242 // There are two special cases. | |
| 1243 // 1. latin-1 characters when converted to upper case are 16 bit | |
| 1244 // characters. | |
| 1245 // 2. Lower case sharp-S converts to "SS" (two characters) | |
| 1246 for (int32_t index = 0; index < length; ++index) { | |
| 1247 uint16_t ch = s->Get(index); | |
| 1248 if (V8_UNLIKELY(ch == sharp_s)) ++sharp_s_count; | |
| 1249 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch))); | |
| 1250 if (V8_UNLIKELY(upper > 0xff)) { | |
| 1251 // Since this upper-cased character does not fit in an 8-bit string, we | |
| 1252 // need to take the 16-bit path. | |
| 1253 goto upconvert; | |
|
Yang
2016/01/07 09:47:38
Can't we simply "return ConvertCaseICU(s, isolate,…
| |
| 1254 } | |
| 1255 result->SeqOneByteStringSet(index, upper); | |
| 1256 } | |
| 1257 | |
| 1258 if (sharp_s_count == 0) return Handle<String>(*result); | |
| 1259 | |
| 1260 // We have sharp_s_count sharp-s characters, but none of the other special | |
| 1261 // characters. | |
| 1262 result = isolate->factory() | |
| 1263 ->NewRawOneByteString(length + sharp_s_count) | |
| 1264 .ToHandleChecked(); | |
| 1265 for (int32_t index = 0, dest_index = 0; index < length; ++index) { | |
| 1266 uint16_t ch = s->Get(index); | |
| 1267 if (ch == sharp_s) { | |
| 1268 result->SeqOneByteStringSet(dest_index++, 'S'); | |
| 1269 result->SeqOneByteStringSet(dest_index++, 'S'); | |
| 1270 } else { | |
| 1271 uint16_t upper = | |
| 1272 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch))); | |
| 1273 result->SeqOneByteStringSet(dest_index++, upper); | |
| 1274 } | |
| 1275 } | |
| 1276 | |
| 1277 return Handle<String>(*result); | |
| 1278 } | |
| 1279 | |
| 1280 upconvert: | |
| 1281 return ConvertCaseICU(s, isolate, u_strToUpper); | |
| 1282 } | |
| 1283 | |
| 1284 } // namespace | |
| 1285 #endif | |
| 1286 | |
| 1287 | |
| 1072 RUNTIME_FUNCTION(Runtime_StringToLowerCase) { | 1288 RUNTIME_FUNCTION(Runtime_StringToLowerCase) { |
| 1073 HandleScope scope(isolate); | 1289 HandleScope scope(isolate); |
| 1074 DCHECK(args.length() == 1); | 1290 DCHECK_EQ(args.length(), 1); |
| 1075 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); | 1291 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); |
| 1076 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping()); | 1292 #ifdef V8_I18N_SUPPORT |
| 1293 if (FLAG_icu_case_mapping) | |
|
Yang
2016/01/07 09:47:38
the convention with multiline conditionals is to u…
| |
| 1294 return *StringToLowerCase(s, isolate); | |
| 1295 else | |
| 1296 #endif | |
| 1297 return ConvertCase(s, isolate, | |
| 1298 isolate->runtime_state()->to_lower_mapping()); | |
| 1077 } | 1299 } |
| 1078 | 1300 |
| 1079 | 1301 |
| 1080 RUNTIME_FUNCTION(Runtime_StringToUpperCase) { | 1302 RUNTIME_FUNCTION(Runtime_StringToUpperCase) { |
| 1081 HandleScope scope(isolate); | 1303 HandleScope scope(isolate); |
| 1082 DCHECK(args.length() == 1); | 1304 DCHECK_EQ(args.length(), 1); |
| 1083 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); | 1305 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); |
| 1084 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping()); | 1306 #ifdef V8_I18N_SUPPORT |
| 1085 } | 1307 if (FLAG_icu_case_mapping) |
| 1086 | 1308 return *StringToUpperCase(s, isolate); |
| 1087 | 1309 else |
| 1310 #endif | |
| 1311 return ConvertCase(s, isolate, | |
| 1312 isolate->runtime_state()->to_upper_mapping()); | |
| 1313 } | |
| 1314 | |
| 1315 | |
| 1088 RUNTIME_FUNCTION(Runtime_StringTrim) { | 1316 RUNTIME_FUNCTION(Runtime_StringTrim) { |
| 1089 HandleScope scope(isolate); | 1317 HandleScope scope(isolate); |
| 1090 DCHECK(args.length() == 3); | 1318 DCHECK(args.length() == 3); |
| 1091 | 1319 |
| 1092 CONVERT_ARG_HANDLE_CHECKED(String, string, 0); | 1320 CONVERT_ARG_HANDLE_CHECKED(String, string, 0); |
| 1093 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1); | 1321 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1); |
| 1094 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2); | 1322 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2); |
| 1095 | 1323 |
| 1096 string = String::Flatten(string); | 1324 string = String::Flatten(string); |
| 1097 int length = string->length(); | 1325 int length = string->length(); |
| (...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after Loading... | |
| 1240 SealHandleScope shs(isolate); | 1468 SealHandleScope shs(isolate); |
| 1241 DCHECK(args.length() == 2); | 1469 DCHECK(args.length() == 2); |
| 1242 if (!args[0]->IsString()) return isolate->heap()->undefined_value(); | 1470 if (!args[0]->IsString()) return isolate->heap()->undefined_value(); |
| 1243 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value(); | 1471 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value(); |
| 1244 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value(); | 1472 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value(); |
| 1245 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate); | 1473 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate); |
| 1246 } | 1474 } |
| 1247 | 1475 |
| 1248 } // namespace internal | 1476 } // namespace internal |
| 1249 } // namespace v8 | 1477 } // namespace v8 |
| OLD | NEW |