Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(59)

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1544023002: Call out to ICU for case conversion Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Additional test case Created 4 years, 12 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/runtime/runtime-utils.h" 5 #include "src/runtime/runtime-utils.h"
6 6
7 #include "src/arguments.h" 7 #include "src/arguments.h"
8 #include "src/conversions-inl.h" 8 #include "src/conversions-inl.h"
9 #include "src/isolate-inl.h" 9 #include "src/isolate-inl.h"
10 #include "src/regexp/jsregexp-inl.h" 10 #include "src/regexp/jsregexp-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/string-builder.h" 12 #include "src/string-builder.h"
13 #include "src/string-search.h" 13 #include "src/string-search.h"
14 14
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/uchar.h"
17 #include "unicode/ustring.h"
18 #endif
19
15 namespace v8 { 20 namespace v8 {
16 namespace internal { 21 namespace internal {
17 22
18 23
19 // Perform string match of pattern on subject, starting at start index. 24 // Perform string match of pattern on subject, starting at start index.
20 // Caller must ensure that 0 <= start_index <= sub->length(), 25 // Caller must ensure that 0 <= start_index <= sub->length(),
21 // and should check that pat->length() + start_index <= sub->length(). 26 // and should check that pat->length() + start_index <= sub->length().
22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat, 27 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,
23 int start_index) { 28 int start_index) {
24 DCHECK(0 <= start_index); 29 DCHECK(0 <= start_index);
(...skipping 1037 matching lines...) Expand 10 before | Expand all | Expand 10 after
1062 isolate, result, isolate->factory()->NewRawOneByteString(length)); 1067 isolate, result, isolate->factory()->NewRawOneByteString(length));
1063 } else { 1068 } else {
1064 if (length < 0) length = -length; 1069 if (length < 0) length = -length;
1065 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1066 isolate, result, isolate->factory()->NewRawTwoByteString(length)); 1071 isolate, result, isolate->factory()->NewRawTwoByteString(length));
1067 } 1072 }
1068 return ConvertCaseHelper(isolate, *s, *result, length, mapping); 1073 return ConvertCaseHelper(isolate, *s, *result, length, mapping);
1069 } 1074 }
1070 1075
1071 1076
1077 #ifdef V8_I18N_SUPPORT
1078 namespace {
Yang 2016/01/07 09:47:38 Can we move all of that into its own file?
1079
1080 typedef int32_t (*case_conversion_fn)(UChar* dest, int32_t destCapacity,
1081 const UChar* src, int32_t srcLength,
1082 const char* locale,
1083 UErrorCode* pErrorCode);
1084
1085 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,
1086 Isolate* isolate,
1087 case_conversion_fn fn) {
1088 int32_t length = s->length();
1089 Handle<SeqTwoByteString> result =
1090 isolate->factory()->NewRawTwoByteString(length).ToHandleChecked();
1091
1092 // If we already have a UTF-16 string, use that, otherwise build it
1093 base::SmartArrayPointer<uc16> sap;
1094 const UChar* src;
1095 if (StringShape(*s).IsSequentialTwoByte()) {
1096 src =
1097 reinterpret_cast<const UChar*>(SeqTwoByteString::cast(*s)->GetChars());
Yang 2016/01/07 09:47:38 Please use String::FlatContent to access the flat
1098 } else {
1099 sap = s->ToWideCString(ROBUST_STRING_TRAVERSAL);
1100 src = reinterpret_cast<const UChar*>(sap.get());
1101 }
1102
1103 UErrorCode error = U_ZERO_ERROR;
1104 int32_t real_length = fn(reinterpret_cast<UChar*>(result->GetChars()), length,
1105 src, length, "", &error);
1106 // If the lengths are equal, ICU will be unable to put the terminating \0
1107 // but that's to be expected, as V8 strings are not null-terminated.
1108 if (error == U_STRING_NOT_TERMINATED_WARNING) {
1109 DCHECK_EQ(real_length, length);
1110 return Handle<String>(*result);
1111 }
1112 // These are the two expected error types from an oversized converted string
1113 // "buffer overflow" when needs more space; "success" when too long
1114 DCHECK(error == U_BUFFER_OVERFLOW_ERROR || error == U_ZERO_ERROR);
1115
1116 error = U_ZERO_ERROR;
1117 result =
1118 isolate->factory()->NewRawTwoByteString(real_length).ToHandleChecked();
1119 int32_t real_length_again = fn(reinterpret_cast<UChar*>(result->GetChars()),
Yang 2016/01/07 09:47:38 Instead of running the whole thing again, you coul
1120 real_length, src, length, "", &error);
Yang 2016/01/07 09:47:38 src can have moved already after the allocation of
1121 USE(real_length_again); // Shouldn't that be part of DCHECK_EQ?
1122 DCHECK_EQ(real_length, real_length_again);
1123 DCHECK_EQ(error, U_STRING_NOT_TERMINATED_WARNING);
1124 if (error != U_STRING_NOT_TERMINATED_WARNING) return s;
1125 return Handle<String>(*result);
1126 }
1127
1128
1129 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }
1130
1131
1132 inline uint16_t ToASCIILower(uint16_t ch) {
1133 return ch | ((ch >= 'A' && ch <= 'Z') << 5);
1134 }
1135
1136
1137 inline uint16_t ToASCIIUpper(uint16_t ch) {
1138 return ch & ~((ch >= 'a' && ch <= 'z') << 5);
1139 }
1140
1141
1142 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,
1143 Isolate* isolate) {
1144 // Note: This is a hot function in the Dromaeo benchmark, specifically the
1145 // no-op code path up through the first 'return' statement.
1146
1147 int length = s->length();
1148 // First scan the string for uppercase and non-ASCII characters:
1149 if (s->HasOnlyOneByteChars()) {
1150 unsigned first_index_to_lower = length;
1151 for (int index = 0; index < length; ++index) {
1152 // Blink specializes this path for one-byte strings, so it
1153 // does not need to do a generic get, but can do the equivalent
1154 // of SeqOneByteStringGet.
1155 uint16_t ch = s->Get(index);
Yang 2016/01/07 09:47:38 This can get pretty expensive if the string is a d
1156 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1157 first_index_to_lower = index;
1158 break;
1159 }
1160 }
1161
1162 // Nothing to do if the string is all ASCII with no uppercase.
1163 if (first_index_to_lower == length) return s;
1164
1165 // We depend here on the invariant that the length of a Latin1
1166 // string is invariant under ToLowerCase, and the result always
1167 // fits in the Latin1 range (untrue for ToUpperCase, and might
1168 // be untrue in some locales, but this is the root locale)
1169 Handle<SeqOneByteString> result =
1170 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1171 // In Blink, this is a simple memcpy, but in V8, the path applies in
1172 // more cases. The optimization here is conditional on whether the
1173 // source is actually a simple 8-bit string (always true in Blink).
1174 // The broader condition lets us eliminate a bunch of duplicate code
1175 // which Blink had in a separate section below.
1176 if (StringShape(*s).IsSequentialOneByte()) {
Yang 2016/01/07 09:47:38 Let's not use StringShape. Just check for s->IsSeq
1177 // In this path, we can use the one-byte-specific Get, and
1178 // memcpy until the first_index_to_lower.
1179 SeqOneByteString* source = SeqOneByteString::cast(*s);
1180 memcpy(result->GetChars(), source->GetChars(), first_index_to_lower);
Yang 2016/01/07 09:47:38 We should just use CopyChars for uniformity here.
1181 for (int index = first_index_to_lower; index < length; ++index) {
1182 uint16_t ch = source->SeqOneByteStringGet(index);
1183 result->SeqOneByteStringSet(index,
1184 V8_UNLIKELY(ch & ~0x7F)
1185 ? static_cast<uint16_t>(u_tolower(ch))
1186 : ToASCIILower(ch));
1187 }
1188 } else {
1189 // In this path, we start from the beginning of the string,
1190 // since there is nothing to memcpy from, and we have to
1191 // use the generic Get. Another option here would be to create
1192 // a two-byte string as output, and do a memcpy from that,
1193 // as Blink does, but there's also the ConsString case.
Yang 2016/01/07 09:47:38 As explained above, we should flatten upfront, so
1194 for (int index = 0; index < length; ++index) {
1195 uint16_t ch = s->Get(index);
1196 result->SeqOneByteStringSet(index,
1197 V8_UNLIKELY(ch & ~0x7F)
1198 ? static_cast<uint16_t>(u_tolower(ch))
1199 : ToASCIILower(ch));
1200 }
1201 }
1202
1203 return Handle<String>(*result);
1204 }
1205
1206 // Blink had an additional case here for ASCII 2-byte strings, but
1207 // that is subsumed by the above code (assuming there isn't a false
1208 // negative for HasOnlyOneByteChars).
1209
1210 // Do a slower implementation for cases that include non-ASCII characters.
1211 return ConvertCaseICU(s, isolate, u_strToLower);
1212 }
1213
1214
1215 const uint16_t sharp_s = L'\u00DF';
Yang 2016/01/07 09:47:38 0xDF should do the trick too, right?
1216
1217 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,
1218 Isolate* isolate) {
1219 // This function could be optimized for no-op cases the way lower() is,
1220 // but in empirical testing, few actual calls to upper() are no-ops, so
1221 // it wouldn't be worth the extra time for pre-scanning.
1222
1223 int32_t length = s->length();
1224
1225 if (s->HasOnlyOneByteChars()) {
1226 Handle<SeqOneByteString> result =
1227 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1228
1229 // Do a faster loop for the case where all the characters are ASCII.
1230 uint16_t ored = 0;
1231 for (int index = 0; index < length; ++index) {
1232 uint16_t ch = s->Get(index);
Yang 2016/01/07 09:47:38 Again, please flatten upfront and use String::Flat
1233 ored |= ch;
1234 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));
1235 }
1236 if (!(ored & ~0x7F)) return Handle<String>(*result);
1237
1238 // Do a slower implementation for cases that include non-ASCII Latin-1
1239 // characters.
1240 int sharp_s_count = 0;
1241
1242 // There are two special cases.
1243 // 1. latin-1 characters when converted to upper case are 16 bit
1244 // characters.
1245 // 2. Lower case sharp-S converts to "SS" (two characters)
1246 for (int32_t index = 0; index < length; ++index) {
1247 uint16_t ch = s->Get(index);
1248 if (V8_UNLIKELY(ch == sharp_s)) ++sharp_s_count;
1249 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1250 if (V8_UNLIKELY(upper > 0xff)) {
1251 // Since this upper-cased character does not fit in an 8-bit string, we
1252 // need to take the 16-bit path.
1253 goto upconvert;
Yang 2016/01/07 09:47:38 Can't we simply "return ConvertCaseICU(s, isolate,
1254 }
1255 result->SeqOneByteStringSet(index, upper);
1256 }
1257
1258 if (sharp_s_count == 0) return Handle<String>(*result);
1259
1260 // We have sharp_s_count sharp-s characters, but none of the other special
1261 // characters.
1262 result = isolate->factory()
1263 ->NewRawOneByteString(length + sharp_s_count)
1264 .ToHandleChecked();
1265 for (int32_t index = 0, dest_index = 0; index < length; ++index) {
1266 uint16_t ch = s->Get(index);
1267 if (ch == sharp_s) {
1268 result->SeqOneByteStringSet(dest_index++, 'S');
1269 result->SeqOneByteStringSet(dest_index++, 'S');
1270 } else {
1271 uint16_t upper =
1272 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1273 result->SeqOneByteStringSet(dest_index++, upper);
1274 }
1275 }
1276
1277 return Handle<String>(*result);
1278 }
1279
1280 upconvert:
1281 return ConvertCaseICU(s, isolate, u_strToUpper);
1282 }
1283
1284 } // namespace
1285 #endif
1286
1287
1072 RUNTIME_FUNCTION(Runtime_StringToLowerCase) { 1288 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
1073 HandleScope scope(isolate); 1289 HandleScope scope(isolate);
1074 DCHECK(args.length() == 1); 1290 DCHECK_EQ(args.length(), 1);
1075 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1291 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1076 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping()); 1292 #ifdef V8_I18N_SUPPORT
1293 if (FLAG_icu_case_mapping)
Yang 2016/01/07 09:47:38 the convention with multiline conditionals is to u
1294 return *StringToLowerCase(s, isolate);
1295 else
1296 #endif
1297 return ConvertCase(s, isolate,
1298 isolate->runtime_state()->to_lower_mapping());
1077 } 1299 }
1078 1300
1079 1301
1080 RUNTIME_FUNCTION(Runtime_StringToUpperCase) { 1302 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
1081 HandleScope scope(isolate); 1303 HandleScope scope(isolate);
1082 DCHECK(args.length() == 1); 1304 DCHECK_EQ(args.length(), 1);
1083 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1305 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1084 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping()); 1306 #ifdef V8_I18N_SUPPORT
1085 } 1307 if (FLAG_icu_case_mapping)
1086 1308 return *StringToUpperCase(s, isolate);
1087 1309 else
1310 #endif
1311 return ConvertCase(s, isolate,
1312 isolate->runtime_state()->to_upper_mapping());
1313 }
1314
1315
1088 RUNTIME_FUNCTION(Runtime_StringTrim) { 1316 RUNTIME_FUNCTION(Runtime_StringTrim) {
1089 HandleScope scope(isolate); 1317 HandleScope scope(isolate);
1090 DCHECK(args.length() == 3); 1318 DCHECK(args.length() == 3);
1091 1319
1092 CONVERT_ARG_HANDLE_CHECKED(String, string, 0); 1320 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);
1093 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1); 1321 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);
1094 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2); 1322 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);
1095 1323
1096 string = String::Flatten(string); 1324 string = String::Flatten(string);
1097 int length = string->length(); 1325 int length = string->length();
(...skipping 142 matching lines...) Expand 10 before | Expand all | Expand 10 after
1240 SealHandleScope shs(isolate); 1468 SealHandleScope shs(isolate);
1241 DCHECK(args.length() == 2); 1469 DCHECK(args.length() == 2);
1242 if (!args[0]->IsString()) return isolate->heap()->undefined_value(); 1470 if (!args[0]->IsString()) return isolate->heap()->undefined_value();
1243 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value(); 1471 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();
1244 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value(); 1472 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();
1245 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate); 1473 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);
1246 } 1474 }
1247 1475
1248 } // namespace internal 1476 } // namespace internal
1249 } // namespace v8 1477 } // namespace v8
OLDNEW
« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698