src/runtime/runtime-strings.cc - Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master

Patch Set: add a test with U+00FF Created 4 years, 8 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

View unified diff | Download patch

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5 #include "src/runtime/runtime-utils.h"	5 #include "src/runtime/runtime-utils.h"

6	6

7 #include "src/arguments.h"	7 #include "src/arguments.h"

8 #include "src/conversions-inl.h"	8 #include "src/conversions-inl.h"

9 #include "src/isolate-inl.h"	9 #include "src/isolate-inl.h"

10 #include "src/regexp/jsregexp-inl.h"	10 #include "src/regexp/jsregexp-inl.h"

11 #include "src/regexp/jsregexp.h"	11 #include "src/regexp/jsregexp.h"

12 #include "src/string-builder.h"	12 #include "src/string-builder.h"

13 #include "src/string-search.h"	13 #include "src/string-search.h"

14	14

	15 #ifdef V8_I18N_SUPPORT

	16 #include "unicode/locid.h"

	17 #include "unicode/uchar.h"

	18 #include "unicode/unistr.h"

	19 #endif

	20

15 namespace v8 {	21 namespace v8 {

16 namespace internal {	22 namespace internal {

17	23

18	24

19 // Perform string match of pattern on subject, starting at start index.	25 // Perform string match of pattern on subject, starting at start index.

20 // Caller must ensure that 0 <= start_index <= sub->length(),	26 // Caller must ensure that 0 <= start_index <= sub->length(),

21 // and should check that pat->length() + start_index <= sub->length().	27 // and should check that pat->length() + start_index <= sub->length().

22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,	28 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,

23 int start_index) {	29 int start_index) {

24 DCHECK(0 <= start_index);	30 DCHECK(0 <= start_index);

(...skipping 1045 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1076 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1071 isolate, result, isolate->factory()->NewRawOneByteString(length));	1077 isolate, result, isolate->factory()->NewRawOneByteString(length));

1072 } else {	1078 } else {

1073 if (length < 0) length = -length;	1079 if (length < 0) length = -length;

1074 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1080 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1075 isolate, result, isolate->factory()->NewRawTwoByteString(length));	1081 isolate, result, isolate->factory()->NewRawTwoByteString(length));

1076 }	1082 }

1077 return ConvertCaseHelper(isolate, s, result, length, mapping);	1083 return ConvertCaseHelper(isolate, s, result, length, mapping);

1078 }	1084 }

1079	1085

	1086 #ifdef V8_I18N_SUPPORT

	1087 namespace {

	1088

	1089 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,

	1090 Isolate* isolate,

	1091 bool is_to_upper) {

	1092 int32_t length = s->length();

	1093 icu::UnicodeString converted;

	1094 {

	1095 DisallowHeapAllocation no_gc;

	1096 DCHECK(s->IsFlat());

	1097 String::FlatContent flat = s->GetFlatContent();

	1098

	1099 const UChar* src;

	1100 if (flat.IsOneByte()) {

	1101 base::SmartArrayPointer<uc16> sap = s->ToWideCString();

	1102 src = reinterpret_cast<const UChar*>(sap.get());

	1103 converted = icu::UnicodeString(src, length);

	1104 } else {

	1105 src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	1106 converted = icu::UnicodeString(src, length);

	1107 }

	1108 }

	1109

	1110 const icu::Locale& root_locale = icu::Locale::getRoot();

	1111 if (is_to_upper)

	1112 converted.toUpper(root_locale);

	1113 else

	1114 converted.toLower(root_locale);

	1115

	1116 return isolate->factory()

	1117 ->NewStringFromTwoByte(Vector<const uint16_t>(

	1118 reinterpret_cast<const uint16_t*>(converted.getBuffer()),

	1119 converted.length()))

	1120 .ToHandleChecked();

	1121 }

	1122

	1123 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	1124

	1125 inline uint16_t ToASCIILower(uint16_t ch) {

	1126 return ch \| ((ch >= 'A' && ch <= 'Z') << 5);

	1127 }

	1128

	1129 inline uint16_t ToASCIIUpper(uint16_t ch) {

	1130 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	1131 }

	1132

	1133 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,

	1134 Isolate* isolate) {

	1135 // Note: This is a hot function in the Dromaeo benchmark, specifically the

	1136 // no-op code path up through the first 'return' statement.

	1137

	1138 int length = s->length();

	1139 s = String::Flatten(s);

	1140 // First scan the string for uppercase and non-ASCII characters:

	1141 if (s->HasOnlyOneByteChars()) {

	1142 unsigned first_index_to_lower = length;

	1143 for (int index = 0; index < length; ++index) {

	1144 // Blink specializes this path for one-byte strings, so it

	1145 // does not need to do a generic get, but can do the equivalent

	1146 // of SeqOneByteStringGet.

	1147 uint16_t ch = s->Get(index);

	1148 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	1149 first_index_to_lower = index;

	1150 break;

	1151 }

	1152 }

	1153

	1154 // Nothing to do if the string is all ASCII with no uppercase.

	1155 if (first_index_to_lower == length) return s;

	1156

	1157 // We depend here on the invariant that the length of a Latin1

	1158 // string is invariant under ToLowerCase, and the result always

	1159 // fits in the Latin1 range (untrue for ToUpperCase, and might

	1160 // be untrue in some locales, but this is the root locale)

	1161 Handle<SeqOneByteString> result =

	1162 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1163 if (s->IsSeqOneByteString()) {

	1164 SeqOneByteString* source = SeqOneByteString::cast(*s);

	1165 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);

	1166 } else {

	1167 // Do we have to worry about External{One,Two}ByteString?

	1168 DCHECK(s->IsSeqTwoByteString());

	1169 SeqTwoByteString* source = SeqTwoByteString::cast(*s);

	1170 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);

	1171 }

	1172

	1173 for (int index = first_index_to_lower; index < length; ++index) {

	1174 uint16_t ch = s->Get(index);

	1175 result->SeqOneByteStringSet(

	1176 index, V8_UNLIKELY(ch & ~0x7F) ? static_cast<uint16_t>(u_tolower(ch))

	1177 : ToASCIILower(ch));

	1178 }

	1179

	1180 return Handle<String>(*result);

	1181 }

	1182

	1183 // Blink had an additional case here for ASCII 2-byte strings, but

	1184 // that is subsumed by the above code (assuming there isn't a false

	1185 // negative for HasOnlyOneByteChars).

	1186

	1187 // Do a slower implementation for cases that include non-ASCII characters.

	1188 return ConvertCaseICU(s, isolate, false);

	1189 }

	1190

	1191 const uint16_t sharp_s = 0x00DFu;

	1192

	1193 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,

	1194 Isolate* isolate) {

	1195 // This function could be optimized for no-op cases the way lower() is,

	1196 // but in empirical testing, few actual calls to upper() are no-ops, so

	1197 // it wouldn't be worth the extra time for pre-scanning.

	1198

	1199 int32_t length = s->length();

	1200 s = String::Flatten(s);

	1201

	1202 if (s->HasOnlyOneByteChars()) {

	1203 Handle<SeqOneByteString> result =

	1204 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1205

	1206 // Do a faster loop for the case where all the characters are ASCII.

	1207 uint16_t ored = 0;

	1208 for (int index = 0; index < length; ++index) {

	1209 uint16_t ch = s->Get(index);

	1210 ored \|= ch;

	1211 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));

	1212 }

	1213 if (!(ored & ~0x7F)) return Handle<String>(*result);

	1214

	1215 // Do a slower implementation for cases that include non-ASCII Latin-1

	1216 // characters.

	1217 int sharp_s_count = 0;

	1218

	1219 // There are two special cases.

	1220 // 1. latin-1 characters when converted to upper case are 16 bit

	1221 // characters.

	1222 // 2. Lower case sharp-S converts to "SS" (two characters)

	1223 for (int32_t index = 0; index < length; ++index) {

	1224 uint16_t ch = s->Get(index);

	1225 if (V8_UNLIKELY(ch == sharp_s)) {

	1226 ++sharp_s_count;

	1227 continue;

	1228 }

	1229 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1230 if (V8_UNLIKELY(upper > 0xff)) {

	1231 // Since this upper-cased character does not fit in an 8-bit string, we

	1232 // need to take the 16-bit path.

	1233 return ConvertCaseICU(s, isolate, true);

	1234 }

	1235 result->SeqOneByteStringSet(index, upper);

	1236 }

	1237

	1238 if (sharp_s_count == 0) return Handle<String>(*result);

	1239

	1240 // We have sharp_s_count sharp-s characters, but none of the other special

	1241 // characters.

	1242 result = isolate->factory()

	1243 ->NewRawOneByteString(length + sharp_s_count)

	1244 .ToHandleChecked();

	1245 for (int32_t index = 0, dest_index = 0; index < length; ++index) {

	1246 uint16_t ch = s->Get(index);

	1247 if (ch == sharp_s) {

	1248 result->SeqOneByteStringSet(dest_index++, 'S');

	1249 result->SeqOneByteStringSet(dest_index++, 'S');

	1250 } else {

	1251 uint16_t upper =

	1252 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));

	1253 result->SeqOneByteStringSet(dest_index++, upper);

	1254 }

	1255 }

	1256

	1257 return Handle<String>(*result);

	1258 }

	1259

	1260 return ConvertCaseICU(s, isolate, true);

	1261 }

	1262

	1263 } // namespace

	1264 #endif

1080	1265

1081 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {	1266 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {

1082 HandleScope scope(isolate);	1267 HandleScope scope(isolate);

1083 DCHECK(args.length() == 1);	1268 DCHECK_EQ(args.length(), 1);

1084 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1269 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1085 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping());	1270 #ifdef V8_I18N_SUPPORT

	1271 if (FLAG_icu_case_mapping)
	jungshik at Google 2016/04/08 19:44:42: Do we need this flag once we're satisfied with perf etc.? Dan, what kind of micro-benchmark test did you run?
	Dan Ehrenberg 2016/04/08 21:13:46: On 2016/04/08 at 19:44:42, jshin (jungshik at google) wrote: > Do we need this flag once we're satisfied with perf etc? > Dan, what kind of micro benchmark test did you run? Microbenchmarks and results here: https://docs.google.com/spreadsheets/d/1xDpYTaFVE97rtqQ5KyZCk4T_QJsKp-S_lRhQZ... Agreed that we won't need this flag long-term, but it's good to develop new features under a flag.
	1272 return *StringToLowerCase(s, isolate);

	1273 else

	1274 #endif

	1275 return ConvertCase(s, isolate,

	1276 isolate->runtime_state()->to_lower_mapping());

1086 }	1277 }

1087	1278

1088	1279

1089 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {	1280 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {

1090 HandleScope scope(isolate);	1281 HandleScope scope(isolate);

1091 DCHECK(args.length() == 1);	1282 DCHECK_EQ(args.length(), 1);

1092 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1283 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1093 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping());	1284 #ifdef V8_I18N_SUPPORT

1094 }	1285 if (FLAG_icu_case_mapping)

1095	1286 return *StringToUpperCase(s, isolate);

1096	1287 else

	1288 #endif

	1289 return ConvertCase(s, isolate,

	1290 isolate->runtime_state()->to_upper_mapping());

	1291 }

	1292

	1293

1097 RUNTIME_FUNCTION(Runtime_StringTrim) {	1294 RUNTIME_FUNCTION(Runtime_StringTrim) {

1098 HandleScope scope(isolate);	1295 HandleScope scope(isolate);

1099 DCHECK(args.length() == 3);	1296 DCHECK(args.length() == 3);

1100	1297

1101 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);	1298 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);

1102 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);	1299 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);

1103 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);	1300 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);

1104	1301

1105 string = String::Flatten(string);	1302 string = String::Flatten(string);

1106 int length = string->length();	1303 int length = string->length();

(...skipping 211 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1318 SealHandleScope shs(isolate);	1515 SealHandleScope shs(isolate);

1319 DCHECK(args.length() == 2);	1516 DCHECK(args.length() == 2);

1320 if (!args[0]->IsString()) return isolate->heap()->undefined_value();	1517 if (!args[0]->IsString()) return isolate->heap()->undefined_value();

1321 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();	1518 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();

1322 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();	1519 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();

1323 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);	1520 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);

1324 }	1521 }

1325	1522

1326 } // namespace internal	1523 } // namespace internal

1327 } // namespace v8	1524 } // namespace v8

OLD	NEW

« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »