src/runtime/runtime-strings.cc - Issue 1544023002: Call out to ICU for case conversion

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1544023002: Call out to ICU for case conversion Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: Additional test case Created 4 years, 12 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/runtime/runtime-utils.h"	5 #include "src/runtime/runtime-utils.h"

6	6

7 #include "src/arguments.h"	7 #include "src/arguments.h"

8 #include "src/conversions-inl.h"	8 #include "src/conversions-inl.h"

9 #include "src/isolate-inl.h"	9 #include "src/isolate-inl.h"

10 #include "src/regexp/jsregexp-inl.h"	10 #include "src/regexp/jsregexp-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/string-builder.h"	12 #include "src/string-builder.h"

13 #include "src/string-search.h"	13 #include "src/string-search.h"

14	14

	15 #ifdef V8_I18N_SUPPORT

	16 #include "unicode/uchar.h"

	17 #include "unicode/ustring.h"

	18 #endif

	19

15 namespace v8 {	20 namespace v8 {

16 namespace internal {	21 namespace internal {

17	22

18	23

19 // Perform string match of pattern on subject, starting at start index.	24 // Perform string match of pattern on subject, starting at start index.

20 // Caller must ensure that 0 <= start_index <= sub->length(),	25 // Caller must ensure that 0 <= start_index <= sub->length(),

21 // and should check that pat->length() + start_index <= sub->length().	26 // and should check that pat->length() + start_index <= sub->length().

22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,	27 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,

23 int start_index) {	28 int start_index) {

24 DCHECK(0 <= start_index);	29 DCHECK(0 <= start_index);

(...skipping 1037 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1062 isolate, result, isolate->factory()->NewRawOneByteString(length));	1067 isolate, result, isolate->factory()->NewRawOneByteString(length));

1063 } else {	1068 } else {

1064 if (length < 0) length = -length;	1069 if (length < 0) length = -length;

1065 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1066 isolate, result, isolate->factory()->NewRawTwoByteString(length));	1071 isolate, result, isolate->factory()->NewRawTwoByteString(length));

1067 }	1072 }

1068 return ConvertCaseHelper(isolate, s, result, length, mapping);	1073 return ConvertCaseHelper(isolate, s, result, length, mapping);

1069 }	1074 }

1070	1075

1071	1076

	1077 #ifdef V8_I18N_SUPPORT

	1078 namespace {
	Yang 2016/01/07 09:47:38 Can we move all of that into its own file?
	1079

	1080 typedef int32_t (*case_conversion_fn)(UChar* dest, int32_t destCapacity,

	1081 const UChar* src, int32_t srcLength,

	1082 const char* locale,

	1083 UErrorCode* pErrorCode);

	1084

	1085 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,

	1086 Isolate* isolate,

	1087 case_conversion_fn fn) {

	1088 int32_t length = s->length();

	1089 Handle<SeqTwoByteString> result =

	1090 isolate->factory()->NewRawTwoByteString(length).ToHandleChecked();

	1091

	1092 // If we already have a UTF-16 string, use that, otherwise build it

	1093 base::SmartArrayPointer<uc16> sap;

	1094 const UChar* src;

	1095 if (StringShape(*s).IsSequentialTwoByte()) {

	1096 src =

	1097 reinterpret_cast<const UChar*>(SeqTwoByteString::cast(*s)->GetChars());
	Yang 2016/01/07 09:47:38 Please use String::FlatContent to access the flat content.
	1098 } else {

	1099 sap = s->ToWideCString(ROBUST_STRING_TRAVERSAL);

	1100 src = reinterpret_cast<const UChar*>(sap.get());

	1101 }

	1102

	1103 UErrorCode error = U_ZERO_ERROR;

	1104 int32_t real_length = fn(reinterpret_cast<UChar*>(result->GetChars()), length,

	1105 src, length, "", &error);

	1106 // If the lengths are equal, ICU will be unable to put the terminating \0

	1107 // but that's to be expected, as V8 strings are not null-terminated.

	1108 if (error == U_STRING_NOT_TERMINATED_WARNING) {

	1109 DCHECK_EQ(real_length, length);

	1110 return Handle<String>(*result);

	1111 }

	1112 // These are the two expected error types from an oversized converted string

	1113 // "buffer overflow" when needs more space; "success" when too long

	1114 DCHECK(error == U_BUFFER_OVERFLOW_ERROR \|\| error == U_ZERO_ERROR);

	1115

	1116 error = U_ZERO_ERROR;

	1117 result =

	1118 isolate->factory()->NewRawTwoByteString(real_length).ToHandleChecked();

	1119 int32_t real_length_again = fn(reinterpret_cast<UChar*>(result->GetChars()),
	Yang 2016/01/07 09:47:38 Instead of running the whole thing again, you could simply create a new string with the excess length, convert the remaining characters, and create a cons string for both parts.
	1120 real_length, src, length, "", &error);
	Yang 2016/01/07 09:47:38 src may already have moved after the allocation of the two-byte string. Please use a new instance of String::FlatContent to refetch the source.
	1121 USE(real_length_again); // Shouldn't that be part of DCHECK_EQ?

	1122 DCHECK_EQ(real_length, real_length_again);

	1123 DCHECK_EQ(error, U_STRING_NOT_TERMINATED_WARNING);

	1124 if (error != U_STRING_NOT_TERMINATED_WARNING) return s;

	1125 return Handle<String>(*result);

	1126 }

	1127

	1128

	1129 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	1130

	1131

	1132 inline uint16_t ToASCIILower(uint16_t ch) {

	1133 return ch \| ((ch >= 'A' && ch <= 'Z') << 5);

	1134 }

	1135

	1136

	1137 inline uint16_t ToASCIIUpper(uint16_t ch) {

	1138 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	1139 }

	1140

	1141

	1142 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,

	1143 Isolate* isolate) {

	1144 // Note: This is a hot function in the Dromaeo benchmark, specifically the

	1145 // no-op code path up through the first 'return' statement.

	1146

	1147 int length = s->length();

	1148 // First scan the string for uppercase and non-ASCII characters:

	1149 if (s->HasOnlyOneByteChars()) {

	1150 unsigned first_index_to_lower = length;

	1151 for (int index = 0; index < length; ++index) {

	1152 // Blink specializes this path for one-byte strings, so it

	1153 // does not need to do a generic get, but can do the equivalent

	1154 // of SeqOneByteStringGet.

	1155 uint16_t ch = s->Get(index);
	Yang 2016/01/07 09:47:38 This can get pretty expensive if the string is a deeply nested ConsString. Please flatten it and use String::FlatContent instead.
	1156 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	1157 first_index_to_lower = index;

	1158 break;

	1159 }

	1160 }

	1161

	1162 // Nothing to do if the string is all ASCII with no uppercase.

	1163 if (first_index_to_lower == length) return s;

	1164

	1165 // We depend here on the invariant that the length of a Latin1

	1166 // string is invariant under ToLowerCase, and the result always

	1167 // fits in the Latin1 range (untrue for ToUpperCase, and might

	1168 // be untrue in some locales, but this is the root locale)

	1169 Handle<SeqOneByteString> result =

	1170 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1171 // In Blink, this is a simple memcpy, but in V8, the path applies in

	1172 // more cases. The optimization here is conditional on whether the

	1173 // source is actually a simple 8-bit string (always true in Blink).

	1174 // The broader condition lets us eliminate a bunch of duplicate code

	1175 // which Blink had in a separate section below.

	1176 if (StringShape(*s).IsSequentialOneByte()) {
	Yang 2016/01/07 09:47:38 Let's not use StringShape. Just check for s->IsSeqOneByteString().
	1177 // In this path, we can use the one-byte-specific Get, and

	1178 // memcpy until the first_index_to_lower.

	1179 SeqOneByteString* source = SeqOneByteString::cast(*s);

	1180 memcpy(result->GetChars(), source->GetChars(), first_index_to_lower);
	Yang 2016/01/07 09:47:38 We should just use CopyChars for uniformity here. See below.
	1181 for (int index = first_index_to_lower; index < length; ++index) {

	1182 uint16_t ch = source->SeqOneByteStringGet(index);

	1183 result->SeqOneByteStringSet(index,

	1184 V8_UNLIKELY(ch & ~0x7F)

	1185 ? static_cast<uint16_t>(u_tolower(ch))

	1186 : ToASCIILower(ch));

	1187 }

	1188 } else {

	1189 // In this path, we start from the beginning of the string,

	1190 // since there is nothing to memcpy from, and we have to

	1191 // use the generic Get. Another option here would be to create

	1192 // a two-byte string as output, and do a memcpy from that,

	1193 // as Blink does, but there's also the ConsString case.
	Yang 2016/01/07 09:47:38 As explained above, we should flatten upfront, so no ConsStrings here. We could then do CopyChars up to first_index_to_lower, which handles the two-byte to one-byte copying.
	1194 for (int index = 0; index < length; ++index) {

	1195 uint16_t ch = s->Get(index);

	1196 result->SeqOneByteStringSet(index,

	1197 V8_UNLIKELY(ch & ~0x7F)

	1198 ? static_cast<uint16_t>(u_tolower(ch))

	1199 : ToASCIILower(ch));

	1200 }

	1201 }

	1202

	1203 return Handle<String>(*result);

	1204 }

	1205

	1206 // Blink had an additional case here for ASCII 2-byte strings, but

	1207 // that is subsumed by the above code (assuming there isn't a false

	1208 // negative for HasOnlyOneByteChars).

	1209

	1210 // Do a slower implementation for cases that include non-ASCII characters.

	1211 return ConvertCaseICU(s, isolate, u_strToLower);

	1212 }

	1213

	1214

	1215 const uint16_t sharp_s = L'\u00DF';
	Yang 2016/01/07 09:47:38 0xDF should do the trick too, right?
	1216

	1217 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,

	1218 Isolate* isolate) {

	1219 // This function could be optimized for no-op cases the way lower() is,

	1220 // but in empirical testing, few actual calls to upper() are no-ops, so

	1221 // it wouldn't be worth the extra time for pre-scanning.

	1222

	1223 int32_t length = s->length();

	1224

	1225 if (s->HasOnlyOneByteChars()) {

	1226 Handle<SeqOneByteString> result =

	1227 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1228

	1229 // Do a faster loop for the case where all the characters are ASCII.

	1230 uint16_t ored = 0;

	1231 for (int index = 0; index < length; ++index) {

	1232 uint16_t ch = s->Get(index);
	Yang 2016/01/07 09:47:38 Again, please flatten upfront and use String::FlatContent.
	1233 ored \|= ch;

	1234 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));

	1235 }

	1236 if (!(ored & ~0x7F)) return Handle<String>(*result);

	1237

	1238 // Do a slower implementation for cases that include non-ASCII Latin-1

	1239 // characters.

	1240 int sharp_s_count = 0;

	1241

	1242 // There are two special cases.

	1243 // 1. latin-1 characters when converted to upper case are 16 bit

	1244 // characters.

	1245 // 2. Lower case sharp-S converts to "SS" (two characters)

	1246 for (int32_t index = 0; index < length; ++index) {

	1247 uint16_t ch = s->Get(index);

	1248 if (V8_UNLIKELY(ch == sharp_s)) ++sharp_s_count;

	1249 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1250 if (V8_UNLIKELY(upper > 0xff)) {

	1251 // Since this upper-cased character does not fit in an 8-bit string, we

	1252 // need to take the 16-bit path.

	1253 goto upconvert;
	Yang 2016/01/07 09:47:38 Can't we simply "return ConvertCaseICU(s, isolate, u_strToUpper);"? I'd prefer this small code duplication way more than goto.
	1254 }

	1255 result->SeqOneByteStringSet(index, upper);

	1256 }

	1257

	1258 if (sharp_s_count == 0) return Handle<String>(*result);

	1259

	1260 // We have sharp_s_count sharp-s characters, but none of the other special

	1261 // characters.

	1262 result = isolate->factory()

	1263 ->NewRawOneByteString(length + sharp_s_count)

	1264 .ToHandleChecked();

	1265 for (int32_t index = 0, dest_index = 0; index < length; ++index) {

	1266 uint16_t ch = s->Get(index);

	1267 if (ch == sharp_s) {

	1268 result->SeqOneByteStringSet(dest_index++, 'S');

	1269 result->SeqOneByteStringSet(dest_index++, 'S');

	1270 } else {

	1271 uint16_t upper =

	1272 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1273 result->SeqOneByteStringSet(dest_index++, upper);

	1274 }

	1275 }

	1276

	1277 return Handle<String>(*result);

	1278 }

	1279

	1280 upconvert:

	1281 return ConvertCaseICU(s, isolate, u_strToUpper);

	1282 }

	1283

	1284 } // namespace

	1285 #endif

	1286

	1287

1072 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {	1288 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {

1073 HandleScope scope(isolate);	1289 HandleScope scope(isolate);

1074 DCHECK(args.length() == 1);	1290 DCHECK_EQ(args.length(), 1);

1075 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1291 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1076 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());	1292 #ifdef V8_I18N_SUPPORT

	1293 if (FLAG_icu_case_mapping)
	Yang 2016/01/07 09:47:38 The convention with multiline conditionals is to use brackets.
	1294 return *StringToLowerCase(s, isolate);

	1295 else

	1296 #endif

	1297 return ConvertCase(s, isolate,

	1298 isolate->runtime_state()->to_lower_mapping());

1077 }	1299 }

1078	1300

1079	1301

1080 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {	1302 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {

1081 HandleScope scope(isolate);	1303 HandleScope scope(isolate);

1082 DCHECK(args.length() == 1);	1304 DCHECK_EQ(args.length(), 1);

1083 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1305 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1084 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping());	1306 #ifdef V8_I18N_SUPPORT

1085 }	1307 if (FLAG_icu_case_mapping)

1086	1308 return *StringToUpperCase(s, isolate);

1087	1309 else

	1310 #endif

	1311 return ConvertCase(s, isolate,

	1312 isolate->runtime_state()->to_upper_mapping());

	1313 }

	1314

	1315

1088 RUNTIME_FUNCTION(Runtime_StringTrim) {	1316 RUNTIME_FUNCTION(Runtime_StringTrim) {

1089 HandleScope scope(isolate);	1317 HandleScope scope(isolate);

1090 DCHECK(args.length() == 3);	1318 DCHECK(args.length() == 3);

1091	1319

1092 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);	1320 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);

1093 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);	1321 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);

1094 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);	1322 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);

1095	1323

1096 string = String::Flatten(string);	1324 string = String::Flatten(string);

1097 int length = string->length();	1325 int length = string->length();

(...skipping 142 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1240 SealHandleScope shs(isolate);	1468 SealHandleScope shs(isolate);

1241 DCHECK(args.length() == 2);	1469 DCHECK(args.length() == 2);

1242 if (!args[0]->IsString()) return isolate->heap()->undefined_value();	1470 if (!args[0]->IsString()) return isolate->heap()->undefined_value();

1243 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();	1471 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();

1244 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();	1472 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();

1245 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);	1473 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);

1246 }	1474 }

1247	1475

1248 } // namespace internal	1476 } // namespace internal

1249 } // namespace v8	1477 } // namespace v8

OLD	NEW

« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »