Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(3)

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: add a test with U+00FF Created 4 years, 8 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/runtime/runtime-utils.h" 5 #include "src/runtime/runtime-utils.h"
6 6
7 #include "src/arguments.h" 7 #include "src/arguments.h"
8 #include "src/conversions-inl.h" 8 #include "src/conversions-inl.h"
9 #include "src/isolate-inl.h" 9 #include "src/isolate-inl.h"
10 #include "src/regexp/jsregexp-inl.h" 10 #include "src/regexp/jsregexp-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/string-builder.h" 12 #include "src/string-builder.h"
13 #include "src/string-search.h" 13 #include "src/string-search.h"
14 14
15 #ifdef V8_I18N_SUPPORT
16 #include "unicode/locid.h"
17 #include "unicode/uchar.h"
18 #include "unicode/unistr.h"
19 #endif
20
15 namespace v8 { 21 namespace v8 {
16 namespace internal { 22 namespace internal {
17 23
18 24
19 // Perform string match of pattern on subject, starting at start index. 25 // Perform string match of pattern on subject, starting at start index.
20 // Caller must ensure that 0 <= start_index <= sub->length(), 26 // Caller must ensure that 0 <= start_index <= sub->length(),
21 // and should check that pat->length() + start_index <= sub->length(). 27 // and should check that pat->length() + start_index <= sub->length().
22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat, 28 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,
23 int start_index) { 29 int start_index) {
24 DCHECK(0 <= start_index); 30 DCHECK(0 <= start_index);
(...skipping 1045 matching lines...) Expand 10 before | Expand all | Expand 10 after
1070 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1076 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1071 isolate, result, isolate->factory()->NewRawOneByteString(length)); 1077 isolate, result, isolate->factory()->NewRawOneByteString(length));
1072 } else { 1078 } else {
1073 if (length < 0) length = -length; 1079 if (length < 0) length = -length;
1074 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1080 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1075 isolate, result, isolate->factory()->NewRawTwoByteString(length)); 1081 isolate, result, isolate->factory()->NewRawTwoByteString(length));
1076 } 1082 }
1077 return ConvertCaseHelper(isolate, *s, *result, length, mapping); 1083 return ConvertCaseHelper(isolate, *s, *result, length, mapping);
1078 } 1084 }
1079 1085
1086 #ifdef V8_I18N_SUPPORT
1087 namespace {
1088
1089 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,
1090 Isolate* isolate,
1091 bool is_to_upper) {
1092 int32_t length = s->length();
1093 icu::UnicodeString converted;
1094 {
1095 DisallowHeapAllocation no_gc;
1096 DCHECK(s->IsFlat());
1097 String::FlatContent flat = s->GetFlatContent();
1098
1099 const UChar* src;
1100 if (flat.IsOneByte()) {
1101 base::SmartArrayPointer<uc16> sap = s->ToWideCString();
1102 src = reinterpret_cast<const UChar*>(sap.get());
1103 converted = icu::UnicodeString(src, length);
1104 } else {
1105 src = reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
1106 converted = icu::UnicodeString(src, length);
1107 }
1108 }
1109
1110 const icu::Locale& root_locale = icu::Locale::getRoot();
1111 if (is_to_upper)
1112 converted.toUpper(root_locale);
1113 else
1114 converted.toLower(root_locale);
1115
1116 return isolate->factory()
1117 ->NewStringFromTwoByte(Vector<const uint16_t>(
1118 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
1119 converted.length()))
1120 .ToHandleChecked();
1121 }
1122
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  if (ch < 'A') return false;
  return ch <= 'Z';
}
1124
// Maps ASCII 'A'..'Z' to 'a'..'z' by setting bit 5 (0x20); every other code
// unit is returned unchanged.
inline uint16_t ToASCIILower(uint16_t ch) {
  const bool is_ascii_upper = 'A' <= ch && ch <= 'Z';
  return is_ascii_upper ? static_cast<uint16_t>(ch | 0x20) : ch;
}
1128
// Maps ASCII 'a'..'z' to 'A'..'Z' by clearing bit 5 (0x20); every other code
// unit is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = 'a' <= ch && ch <= 'z';
  return is_ascii_lower ? static_cast<uint16_t>(ch & ~0x20) : ch;
}
1132
1133 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,
1134 Isolate* isolate) {
1135 // Note: This is a hot function in the Dromaeo benchmark, specifically the
1136 // no-op code path up through the first 'return' statement.
1137
1138 int length = s->length();
1139 s = String::Flatten(s);
1140 // First scan the string for uppercase and non-ASCII characters:
1141 if (s->HasOnlyOneByteChars()) {
1142 unsigned first_index_to_lower = length;
1143 for (int index = 0; index < length; ++index) {
1144 // Blink specializes this path for one-byte strings, so it
1145 // does not need to do a generic get, but can do the equivalent
1146 // of SeqOneByteStringGet.
1147 uint16_t ch = s->Get(index);
1148 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1149 first_index_to_lower = index;
1150 break;
1151 }
1152 }
1153
1154 // Nothing to do if the string is all ASCII with no uppercase.
1155 if (first_index_to_lower == length) return s;
1156
1157 // We depend here on the invariant that the length of a Latin1
1158 // string is invariant under ToLowerCase, and the result always
1159 // fits in the Latin1 range (untrue for ToUpperCase, and might
1160 // be untrue in some locales, but this is the root locale)
1161 Handle<SeqOneByteString> result =
1162 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1163 if (s->IsSeqOneByteString()) {
1164 SeqOneByteString* source = SeqOneByteString::cast(*s);
1165 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);
1166 } else {
1167 // Do we have to worry about External{One,Two}ByteString?
1168 DCHECK(s->IsSeqTwoByteString());
1169 SeqTwoByteString* source = SeqTwoByteString::cast(*s);
1170 CopyChars(result->GetChars(), source->GetChars(), first_index_to_lower);
1171 }
1172
1173 for (int index = first_index_to_lower; index < length; ++index) {
1174 uint16_t ch = s->Get(index);
1175 result->SeqOneByteStringSet(
1176 index, V8_UNLIKELY(ch & ~0x7F) ? static_cast<uint16_t>(u_tolower(ch))
1177 : ToASCIILower(ch));
1178 }
1179
1180 return Handle<String>(*result);
1181 }
1182
1183 // Blink had an additional case here for ASCII 2-byte strings, but
1184 // that is subsumed by the above code (assuming there isn't a false
1185 // negative for HasOnlyOneByteChars).
1186
1187 // Do a slower implementation for cases that include non-ASCII characters.
1188 return ConvertCaseICU(s, isolate, false);
1189 }
1190
1191 const uint16_t sharp_s = 0x00DFu;
1192
1193 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,
1194 Isolate* isolate) {
1195 // This function could be optimized for no-op cases the way lower() is,
1196 // but in empirical testing, few actual calls to upper() are no-ops, so
1197 // it wouldn't be worth the extra time for pre-scanning.
1198
1199 int32_t length = s->length();
1200 s = String::Flatten(s);
1201
1202 if (s->HasOnlyOneByteChars()) {
1203 Handle<SeqOneByteString> result =
1204 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1205
1206 // Do a faster loop for the case where all the characters are ASCII.
1207 uint16_t ored = 0;
1208 for (int index = 0; index < length; ++index) {
1209 uint16_t ch = s->Get(index);
1210 ored |= ch;
1211 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));
1212 }
1213 if (!(ored & ~0x7F)) return Handle<String>(*result);
1214
1215 // Do a slower implementation for cases that include non-ASCII Latin-1
1216 // characters.
1217 int sharp_s_count = 0;
1218
1219 // There are two special cases.
1220 // 1. latin-1 characters when converted to upper case are 16 bit
1221 // characters.
1222 // 2. Lower case sharp-S converts to "SS" (two characters)
1223 for (int32_t index = 0; index < length; ++index) {
1224 uint16_t ch = s->Get(index);
1225 if (V8_UNLIKELY(ch == sharp_s)) {
1226 ++sharp_s_count;
1227 continue;
1228 }
1229 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1230 if (V8_UNLIKELY(upper > 0xff)) {
1231 // Since this upper-cased character does not fit in an 8-bit string, we
1232 // need to take the 16-bit path.
1233 return ConvertCaseICU(s, isolate, true);
1234 }
1235 result->SeqOneByteStringSet(index, upper);
1236 }
1237
1238 if (sharp_s_count == 0) return Handle<String>(*result);
1239
1240 // We have sharp_s_count sharp-s characters, but none of the other special
1241 // characters.
1242 result = isolate->factory()
1243 ->NewRawOneByteString(length + sharp_s_count)
1244 .ToHandleChecked();
1245 for (int32_t index = 0, dest_index = 0; index < length; ++index) {
1246 uint16_t ch = s->Get(index);
1247 if (ch == sharp_s) {
1248 result->SeqOneByteStringSet(dest_index++, 'S');
1249 result->SeqOneByteStringSet(dest_index++, 'S');
1250 } else {
1251 uint16_t upper =
1252 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1253 result->SeqOneByteStringSet(dest_index++, upper);
1254 }
1255 }
1256
1257 return Handle<String>(*result);
1258 }
1259
1260 return ConvertCaseICU(s, isolate, true);
1261 }
1262
1263 } // namespace
1264 #endif
1080 1265
1081 RUNTIME_FUNCTION(Runtime_StringToLowerCase) { 1266 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
1082 HandleScope scope(isolate); 1267 HandleScope scope(isolate);
1083 DCHECK(args.length() == 1); 1268 DCHECK_EQ(args.length(), 1);
1084 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1269 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1085 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping()); 1270 #ifdef V8_I18N_SUPPORT
1271 if (FLAG_icu_case_mapping)
jungshik at Google 2016/04/08 19:44:42 Do we need this flag once we're satisfied with per
Dan Ehrenberg 2016/04/08 21:13:46 Microbenchmarks and results here: https://docs.go
1272 return *StringToLowerCase(s, isolate);
1273 else
1274 #endif
1275 return ConvertCase(s, isolate,
1276 isolate->runtime_state()->to_lower_mapping());
1086 } 1277 }
1087 1278
1088 1279
1089 RUNTIME_FUNCTION(Runtime_StringToUpperCase) { 1280 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
1090 HandleScope scope(isolate); 1281 HandleScope scope(isolate);
1091 DCHECK(args.length() == 1); 1282 DCHECK_EQ(args.length(), 1);
1092 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1283 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1093 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping()); 1284 #ifdef V8_I18N_SUPPORT
1094 } 1285 if (FLAG_icu_case_mapping)
1095 1286 return *StringToUpperCase(s, isolate);
1096 1287 else
1288 #endif
1289 return ConvertCase(s, isolate,
1290 isolate->runtime_state()->to_upper_mapping());
1291 }
1292
1293
1097 RUNTIME_FUNCTION(Runtime_StringTrim) { 1294 RUNTIME_FUNCTION(Runtime_StringTrim) {
1098 HandleScope scope(isolate); 1295 HandleScope scope(isolate);
1099 DCHECK(args.length() == 3); 1296 DCHECK(args.length() == 3);
1100 1297
1101 CONVERT_ARG_HANDLE_CHECKED(String, string, 0); 1298 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);
1102 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1); 1299 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);
1103 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2); 1300 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);
1104 1301
1105 string = String::Flatten(string); 1302 string = String::Flatten(string);
1106 int length = string->length(); 1303 int length = string->length();
(...skipping 211 matching lines...) Expand 10 before | Expand all | Expand 10 after
1318 SealHandleScope shs(isolate); 1515 SealHandleScope shs(isolate);
1319 DCHECK(args.length() == 2); 1516 DCHECK(args.length() == 2);
1320 if (!args[0]->IsString()) return isolate->heap()->undefined_value(); 1517 if (!args[0]->IsString()) return isolate->heap()->undefined_value();
1321 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value(); 1518 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();
1322 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value(); 1519 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();
1323 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate); 1520 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);
1324 } 1521 }
1325 1522
1326 } // namespace internal 1523 } // namespace internal
1327 } // namespace v8 1524 } // namespace v8
OLDNEW
« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698