src/runtime/runtime-strings.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: working now with HandleChecked Created 4 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/runtime/runtime-utils.h"	5 #include "src/runtime/runtime-utils.h"

6	6

7 #include "src/arguments.h"	7 #include "src/arguments.h"

8 #include "src/conversions-inl.h"	8 #include "src/conversions-inl.h"

9 #include "src/isolate-inl.h"	9 #include "src/isolate-inl.h"

10 #include "src/regexp/jsregexp-inl.h"	10 #include "src/regexp/jsregexp-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/string-builder.h"	12 #include "src/string-builder.h"

13 #include "src/string-search.h"	13 #include "src/string-search.h"

14	14

	#ifdef V8_I18N_SUPPORT
	#include "unicode/locid.h"
	#include "unicode/uchar.h"
	#include "unicode/unistr.h"
	#endif

	19

15 namespace v8 {	20 namespace v8 {

16 namespace internal {	21 namespace internal {

17	22

18	23

19 // Perform string match of pattern on subject, starting at start index.	24 // Perform string match of pattern on subject, starting at start index.

20 // Caller must ensure that 0 <= start_index <= sub->length(),	25 // Caller must ensure that 0 <= start_index <= sub->length(),

21 // and should check that pat->length() + start_index <= sub->length().	26 // and should check that pat->length() + start_index <= sub->length().

22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,	27 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,

23 int start_index) {	28 int start_index) {

24 DCHECK(0 <= start_index);	29 DCHECK(0 <= start_index);

(...skipping 1047 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1072 isolate, result, isolate->factory()->NewRawOneByteString(length));	1077 isolate, result, isolate->factory()->NewRawOneByteString(length));

1073 } else {	1078 } else {

1074 if (length < 0) length = -length;	1079 if (length < 0) length = -length;

1075 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1080 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1076 isolate, result, isolate->factory()->NewRawTwoByteString(length));	1081 isolate, result, isolate->factory()->NewRawTwoByteString(length));

1077 }	1082 }

1078 return ConvertCaseHelper(isolate, s, result, length, mapping);	1083 return ConvertCaseHelper(isolate, s, result, length, mapping);

1079 }	1084 }

1080	1085

1081	1086

	1087 #ifdef V8_I18N_SUPPORT

	1088 namespace {

	1089

	1090 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,

	1091 Isolate* isolate,

	1092 bool is_to_upper) {

	1093 int32_t length = s->length();

	1094

	1095 // If we already have a UTF-16 string, use that, otherwise build it

	1096 base::SmartArrayPointer<uc16> sap;

	1097 const UChar* src;

	1098 if (StringShape(*s).IsSequentialTwoByte()) {

	1099 src =

	1100 reinterpret_cast<const UChar>(SeqTwoByteString::cast(s)->GetChars());

	1101 } else {

	1102 sap = s->ToWideCString(ROBUST_STRING_TRAVERSAL);

	1103 src = reinterpret_cast<const UChar*>(sap.get());

	1104 }

	1105

	1106 // This UnicodeString ctor has copy-on-write semantics. It starts as a

	1107 // read-only alias but the buffer is copied when it's written to.

	1108 icu::UnicodeString converted(0, src, length);

	1109 if (is_to_upper)

	1110 converted.toUpper();

	1111 else

	1112 converted.toLower();

	1113

	1114 #if 0

	1115 Handle<String> result;

	1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1117 isolate, result,

	1118 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	1119 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	1120 converted.length())));

	1121 #endif

	1122 return isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(

	1123 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	1124 converted.length())).ToHandleChecked();

	1125 }

	1126

	1127

	// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
	inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	1129

	1130

	// Maps the ASCII capital letters 'A'..'Z' to 'a'..'z' and returns every
	// other code unit unchanged. Branch-free: the range test yields 0 or 1,
	// which is shifted into bit 5 (0x20), the bit that distinguishes ASCII
	// upper case from lower case.
	inline uint16_t ToASCIILower(uint16_t ch) {
	  return static_cast<uint16_t>(ch | ((ch >= 'A' && ch <= 'Z') << 5));
	}

	1134

	1135

	// Maps the ASCII small letters 'a'..'z' to 'A'..'Z' and returns every
	// other code unit unchanged. Branch-free: the range test yields 0 or 1,
	// which is shifted into bit 5 (0x20) and then cleared from |ch|.
	inline uint16_t ToASCIIUpper(uint16_t ch) {
	  return static_cast<uint16_t>(ch & ~((ch >= 'a' && ch <= 'z') << 5));
	}

	1139

	1140

	1141 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,

	1142 Isolate* isolate) {

	1143 // Note: This is a hot function in the Dromaeo benchmark, specifically the

	1144 // no-op code path up through the first 'return' statement.

	1145

	1146 int length = s->length();

	1147 // First scan the string for uppercase and non-ASCII characters:

	1148 if (s->HasOnlyOneByteChars()) {

	1149 unsigned first_index_to_lower = length;

	1150 for (int index = 0; index < length; ++index) {

	1151 // Blink specializes this path for one-byte strings, so it

	1152 // does not need to do a generic get, but can do the equivalent

	1153 // of SeqOneByteStringGet.

	1154 uint16_t ch = s->Get(index);

	1155 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	1156 first_index_to_lower = index;

	1157 break;

	1158 }

	1159 }

	1160

	1161 // Nothing to do if the string is all ASCII with no uppercase.

	1162 if (first_index_to_lower == length) return s;

	1163

	1164 // We depend here on the invariant that the length of a Latin1

	1165 // string is invariant under ToLowerCase, and the result always

	1166 // fits in the Latin1 range (untrue for ToUpperCase, and might

	1167 // be untrue in some locales, but this is the root locale)

	1168 Handle<SeqOneByteString> result =

	1169 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1170 // In Blink, this is a simple memcpy, but in V8, the path applies in

	1171 // more cases. The optimization here is conditional on whether the

	1172 // source is actually a simple 8-bit string (always true in Blink).

	1173 // The broader condition lets us eliminate a bunch of duplicate code

	1174 // which Blink had in a separate section below.

	1175 if (StringShape(*s).IsSequentialOneByte()) {

	1176 // In this path, we can use the one-byte-specific Get, and

	1177 // memcpy until the first_index_to_lower.

	1178 SeqOneByteString* source = SeqOneByteString::cast(*s);

	1179 memcpy(result->GetChars(), source->GetChars(), first_index_to_lower);

	1180 for (int index = first_index_to_lower; index < length; ++index) {

	1181 uint16_t ch = source->SeqOneByteStringGet(index);

	1182 result->SeqOneByteStringSet(index,

	1183 V8_UNLIKELY(ch & ~0x7F)

	1184 ? static_cast<uint16_t>(u_tolower(ch))

	1185 : ToASCIILower(ch));

	1186 }

	1187 } else {

	1188 // In this path, we start from the beginning of the string,

	1189 // since there is nothing to memcpy from, and we have to

	1190 // use the generic Get. Another option here would be to create

	1191 // a two-byte string as output, and do a memcpy from that,

	1192 // as Blink does, but there's also the ConsString case.

	1193 for (int index = 0; index < length; ++index) {

	1194 uint16_t ch = s->Get(index);

	1195 result->SeqOneByteStringSet(index,

	1196 V8_UNLIKELY(ch & ~0x7F)

	1197 ? static_cast<uint16_t>(u_tolower(ch))

	1198 : ToASCIILower(ch));

	1199 }

	1200 }

	1201

	1202 return Handle<String>(*result);

	1203 }

	1204

	1205 // Blink had an additional case here for ASCII 2-byte strings, but

	1206 // that is subsumed by the above code (assuming there isn't a false

	1207 // negative for HasOnlyOneByteChars).

	1208

	1209 // Do a slower implementation for cases that include non-ASCII characters.

	1210 return ConvertCaseICU(s, isolate, u_strToLower);

	1211 }

	1212

	1213

	// U+00DF LATIN SMALL LETTER SHARP S. Written as a plain 16-bit code unit
	// rather than a wchar_t literal, whose width differs between platforms.
	const uint16_t sharp_s = 0x00DF;

	1215

	1216 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,

	1217 Isolate* isolate) {

	1218 // This function could be optimized for no-op cases the way lower() is,

	1219 // but in empirical testing, few actual calls to upper() are no-ops, so

	1220 // it wouldn't be worth the extra time for pre-scanning.

	1221

	1222 int32_t length = s->length();

	1223

	1224 if (s->HasOnlyOneByteChars()) {

	1225 Handle<SeqOneByteString> result =

	1226 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1227

	1228 // Do a faster loop for the case where all the characters are ASCII.

	1229 uint16_t ored = 0;

	1230 for (int index = 0; index < length; ++index) {

	1231 uint16_t ch = s->Get(index);

	1232 ored \|= ch;

	1233 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));

	1234 }

	1235 if (!(ored & ~0x7F)) return Handle<String>(*result);

	1236

	1237 // Do a slower implementation for cases that include non-ASCII Latin-1

	1238 // characters.

	1239 int sharp_s_count = 0;

	1240

	1241 // There are two special cases.

	1242 // 1. latin-1 characters when converted to upper case are 16 bit

	1243 // characters.

	1244 // 2. Lower case sharp-S converts to "SS" (two characters)

	1245 for (int32_t index = 0; index < length; ++index) {

	1246 uint16_t ch = s->Get(index);

	1247 if (V8_UNLIKELY(ch == sharp_s)) ++sharp_s_count;

	1248 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1249 if (V8_UNLIKELY(upper > 0xff)) {

	1250 // Since this upper-cased character does not fit in an 8-bit string, we

	1251 // need to take the 16-bit path.

	1252 goto upconvert;

	1253 }

	1254 result->SeqOneByteStringSet(index, upper);

	1255 }

	1256

	1257 if (sharp_s_count == 0) return Handle<String>(*result);

	1258

	1259 // We have sharp_s_count sharp-s characters, but none of the other special

	1260 // characters.

	1261 result = isolate->factory()

	1262 ->NewRawOneByteString(length + sharp_s_count)

	1263 .ToHandleChecked();

	1264 for (int32_t index = 0, dest_index = 0; index < length; ++index) {

	1265 uint16_t ch = s->Get(index);

	1266 if (ch == sharp_s) {

	1267 result->SeqOneByteStringSet(dest_index++, 'S');

	1268 result->SeqOneByteStringSet(dest_index++, 'S');

	1269 } else {

	1270 uint16_t upper =

	1271 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1272 result->SeqOneByteStringSet(dest_index++, upper);

	1273 }

	1274 }

	1275

	1276 return Handle<String>(*result);

	1277 }

	1278

	1279 upconvert:

	1280 return ConvertCaseICU(s, isolate, u_strToUpper);

	1281 }

	1282

	1283 } // namespace

	1284 #endif

	1285

	1286

1082 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {	1287 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {

1083 HandleScope scope(isolate);	1288 HandleScope scope(isolate);

1084 DCHECK(args.length() == 1);	1289 DCHECK_EQ(args.length(), 1);

1085 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1290 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1086 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());	1291 #ifdef V8_I18N_SUPPORT

	1292 if (FLAG_icu_case_mapping)

	1293 return *StringToLowerCase(s, isolate);

	1294 else

	1295 #endif

	1296 return ConvertCase(s, isolate,

	1297 isolate->runtime_state()->to_lower_mapping());

1087 }	1298 }

1088	1299

1089	1300

1090 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {	1301 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {

1091 HandleScope scope(isolate);	1302 HandleScope scope(isolate);

1092 DCHECK(args.length() == 1);	1303 DCHECK_EQ(args.length(), 1);

1093 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1304 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1094 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping());	1305 #ifdef V8_I18N_SUPPORT

1095 }	1306 if (FLAG_icu_case_mapping)

1096	1307 return *StringToUpperCase(s, isolate);

1097	1308 else

	1309 #endif

	1310 return ConvertCase(s, isolate,

	1311 isolate->runtime_state()->to_upper_mapping());

	1312 }

	1313

	1314

1098 RUNTIME_FUNCTION(Runtime_StringTrim) {	1315 RUNTIME_FUNCTION(Runtime_StringTrim) {

1099 HandleScope scope(isolate);	1316 HandleScope scope(isolate);

1100 DCHECK(args.length() == 3);	1317 DCHECK(args.length() == 3);

1101	1318

1102 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);	1319 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);

1103 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);	1320 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);

1104 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);	1321 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);

1105	1322

1106 string = String::Flatten(string);	1323 string = String::Flatten(string);

1107 int length = string->length();	1324 int length = string->length();

(...skipping 211 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1319 SealHandleScope shs(isolate);	1536 SealHandleScope shs(isolate);

1320 DCHECK(args.length() == 2);	1537 DCHECK(args.length() == 2);

1321 if (!args[0]->IsString()) return isolate->heap()->undefined_value();	1538 if (!args[0]->IsString()) return isolate->heap()->undefined_value();

1322 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();	1539 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();

1323 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();	1540 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();

1324 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);	1541 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);

1325 }	1542 }

1326	1543

1327 } // namespace internal	1544 } // namespace internal

1328 } // namespace v8	1545 } // namespace v8

OLD	NEW

« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | test/intl/testcfg.py » ('J')