Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(102)

Side by Side Diff: src/runtime/runtime-strings.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: working now with HandleChecked Created 4 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 #include "src/runtime/runtime-utils.h" 5 #include "src/runtime/runtime-utils.h"
6 6
7 #include "src/arguments.h" 7 #include "src/arguments.h"
8 #include "src/conversions-inl.h" 8 #include "src/conversions-inl.h"
9 #include "src/isolate-inl.h" 9 #include "src/isolate-inl.h"
10 #include "src/regexp/jsregexp-inl.h" 10 #include "src/regexp/jsregexp-inl.h"
11 #include "src/regexp/jsregexp.h" 11 #include "src/regexp/jsregexp.h"
12 #include "src/string-builder.h" 12 #include "src/string-builder.h"
13 #include "src/string-search.h" 13 #include "src/string-search.h"
14 14
#ifdef V8_I18N_SUPPORT
#include "unicode/locid.h"
#include "unicode/uchar.h"
#include "unicode/unistr.h"
#endif
19
15 namespace v8 { 20 namespace v8 {
16 namespace internal { 21 namespace internal {
17 22
18 23
19 // Perform string match of pattern on subject, starting at start index. 24 // Perform string match of pattern on subject, starting at start index.
20 // Caller must ensure that 0 <= start_index <= sub->length(), 25 // Caller must ensure that 0 <= start_index <= sub->length(),
21 // and should check that pat->length() + start_index <= sub->length(). 26 // and should check that pat->length() + start_index <= sub->length().
22 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat, 27 int StringMatch(Isolate* isolate, Handle<String> sub, Handle<String> pat,
23 int start_index) { 28 int start_index) {
24 DCHECK(0 <= start_index); 29 DCHECK(0 <= start_index);
(...skipping 1047 matching lines...) Expand 10 before | Expand all | Expand 10 after
1072 isolate, result, isolate->factory()->NewRawOneByteString(length)); 1077 isolate, result, isolate->factory()->NewRawOneByteString(length));
1073 } else { 1078 } else {
1074 if (length < 0) length = -length; 1079 if (length < 0) length = -length;
1075 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1080 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1076 isolate, result, isolate->factory()->NewRawTwoByteString(length)); 1081 isolate, result, isolate->factory()->NewRawTwoByteString(length));
1077 } 1082 }
1078 return ConvertCaseHelper(isolate, *s, *result, length, mapping); 1083 return ConvertCaseHelper(isolate, *s, *result, length, mapping);
1079 } 1084 }
1080 1085
1081 1086
1087 #ifdef V8_I18N_SUPPORT
1088 namespace {
1089
1090 MUST_USE_RESULT static Handle<String> ConvertCaseICU(Handle<String> s,
1091 Isolate* isolate,
1092 bool is_to_upper) {
1093 int32_t length = s->length();
1094
1095 // If we already have a UTF-16 string, use that, otherwise build it
1096 base::SmartArrayPointer<uc16> sap;
1097 const UChar* src;
1098 if (StringShape(*s).IsSequentialTwoByte()) {
1099 src =
1100 reinterpret_cast<const UChar*>(SeqTwoByteString::cast(*s)->GetChars());
1101 } else {
1102 sap = s->ToWideCString(ROBUST_STRING_TRAVERSAL);
1103 src = reinterpret_cast<const UChar*>(sap.get());
1104 }
1105
1106 // This UnicodeString ctor has copy-on-write semantics. It starts as a
1107 // read-only alias but the buffer is copied when it's written to.
1108 icu::UnicodeString converted(0, src, length);
1109 if (is_to_upper)
1110 converted.toUpper();
1111 else
1112 converted.toLower();
1113
1114 #if 0
1115 Handle<String> result;
1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1117 isolate, result,
1118 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
1119 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
1120 converted.length())));
1121 #endif
1122 return isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
1123 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
1124 converted.length())).ToHandleChecked();
1125 }
1126
1127
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  // Values below 'A' wrap around to a large unsigned number, so a single
  // comparison covers both ends of the range.
  return static_cast<uint16_t>(ch - 'A') <= 'Z' - 'A';
}
1129
1130
// Maps the ASCII capital letters 'A'..'Z' to 'a'..'z' by setting bit 5
// (0x20); every other code unit is returned unchanged.
inline uint16_t ToASCIILower(uint16_t ch) {
  const bool is_ascii_upper = ch >= 'A' && ch <= 'Z';
  return is_ascii_upper ? static_cast<uint16_t>(ch | 0x20) : ch;
}
1134
1135
// Maps the ASCII small letters 'a'..'z' to 'A'..'Z' by clearing bit 5
// (0x20); every other code unit is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  return is_ascii_lower ? static_cast<uint16_t>(ch & ~0x20) : ch;
}
1139
1140
1141 MUST_USE_RESULT Handle<String> StringToLowerCase(Handle<String> s,
1142 Isolate* isolate) {
1143 // Note: This is a hot function in the Dromaeo benchmark, specifically the
1144 // no-op code path up through the first 'return' statement.
1145
1146 int length = s->length();
1147 // First scan the string for uppercase and non-ASCII characters:
1148 if (s->HasOnlyOneByteChars()) {
1149 unsigned first_index_to_lower = length;
1150 for (int index = 0; index < length; ++index) {
1151 // Blink specializes this path for one-byte strings, so it
1152 // does not need to do a generic get, but can do the equivalent
1153 // of SeqOneByteStringGet.
1154 uint16_t ch = s->Get(index);
1155 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1156 first_index_to_lower = index;
1157 break;
1158 }
1159 }
1160
1161 // Nothing to do if the string is all ASCII with no uppercase.
1162 if (first_index_to_lower == length) return s;
1163
1164 // We depend here on the invariant that the length of a Latin1
1165 // string is invariant under ToLowerCase, and the result always
1166 // fits in the Latin1 range (untrue for ToUpperCase, and might
1167 // be untrue in some locales, but this is the root locale)
1168 Handle<SeqOneByteString> result =
1169 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1170 // In Blink, this is a simple memcpy, but in V8, the path applies in
1171 // more cases. The optimization here is conditional on whether the
1172 // source is actually a simple 8-bit string (always true in Blink).
1173 // The broader condition lets us eliminate a bunch of duplicate code
1174 // which Blink had in a separate section below.
1175 if (StringShape(*s).IsSequentialOneByte()) {
1176 // In this path, we can use the one-byte-specific Get, and
1177 // memcpy until the first_index_to_lower.
1178 SeqOneByteString* source = SeqOneByteString::cast(*s);
1179 memcpy(result->GetChars(), source->GetChars(), first_index_to_lower);
1180 for (int index = first_index_to_lower; index < length; ++index) {
1181 uint16_t ch = source->SeqOneByteStringGet(index);
1182 result->SeqOneByteStringSet(index,
1183 V8_UNLIKELY(ch & ~0x7F)
1184 ? static_cast<uint16_t>(u_tolower(ch))
1185 : ToASCIILower(ch));
1186 }
1187 } else {
1188 // In this path, we start from the beginning of the string,
1189 // since there is nothing to memcpy from, and we have to
1190 // use the generic Get. Another option here would be to create
1191 // a two-byte string as output, and do a memcpy from that,
1192 // as Blink does, but there's also the ConsString case.
1193 for (int index = 0; index < length; ++index) {
1194 uint16_t ch = s->Get(index);
1195 result->SeqOneByteStringSet(index,
1196 V8_UNLIKELY(ch & ~0x7F)
1197 ? static_cast<uint16_t>(u_tolower(ch))
1198 : ToASCIILower(ch));
1199 }
1200 }
1201
1202 return Handle<String>(*result);
1203 }
1204
1205 // Blink had an additional case here for ASCII 2-byte strings, but
1206 // that is subsumed by the above code (assuming there isn't a false
1207 // negative for HasOnlyOneByteChars).
1208
1209 // Do a slower implementation for cases that include non-ASCII characters.
1210 return ConvertCaseICU(s, isolate, u_strToLower);
1211 }
1212
1213
1214 const uint16_t sharp_s = L'\u00DF';
1215
1216 MUST_USE_RESULT Handle<String> StringToUpperCase(Handle<String> s,
1217 Isolate* isolate) {
1218 // This function could be optimized for no-op cases the way lower() is,
1219 // but in empirical testing, few actual calls to upper() are no-ops, so
1220 // it wouldn't be worth the extra time for pre-scanning.
1221
1222 int32_t length = s->length();
1223
1224 if (s->HasOnlyOneByteChars()) {
1225 Handle<SeqOneByteString> result =
1226 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1227
1228 // Do a faster loop for the case where all the characters are ASCII.
1229 uint16_t ored = 0;
1230 for (int index = 0; index < length; ++index) {
1231 uint16_t ch = s->Get(index);
1232 ored |= ch;
1233 result->SeqOneByteStringSet(index, ToASCIIUpper(ch));
1234 }
1235 if (!(ored & ~0x7F)) return Handle<String>(*result);
1236
1237 // Do a slower implementation for cases that include non-ASCII Latin-1
1238 // characters.
1239 int sharp_s_count = 0;
1240
1241 // There are two special cases.
1242 // 1. latin-1 characters when converted to upper case are 16 bit
1243 // characters.
1244 // 2. Lower case sharp-S converts to "SS" (two characters)
1245 for (int32_t index = 0; index < length; ++index) {
1246 uint16_t ch = s->Get(index);
1247 if (V8_UNLIKELY(ch == sharp_s)) ++sharp_s_count;
1248 uint16_t upper = static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1249 if (V8_UNLIKELY(upper > 0xff)) {
1250 // Since this upper-cased character does not fit in an 8-bit string, we
1251 // need to take the 16-bit path.
1252 goto upconvert;
1253 }
1254 result->SeqOneByteStringSet(index, upper);
1255 }
1256
1257 if (sharp_s_count == 0) return Handle<String>(*result);
1258
1259 // We have sharp_s_count sharp-s characters, but none of the other special
1260 // characters.
1261 result = isolate->factory()
1262 ->NewRawOneByteString(length + sharp_s_count)
1263 .ToHandleChecked();
1264 for (int32_t index = 0, dest_index = 0; index < length; ++index) {
1265 uint16_t ch = s->Get(index);
1266 if (ch == sharp_s) {
1267 result->SeqOneByteStringSet(dest_index++, 'S');
1268 result->SeqOneByteStringSet(dest_index++, 'S');
1269 } else {
1270 uint16_t upper =
1271 static_cast<uint16_t>(u_toupper(static_cast<UChar>(ch)));
1272 result->SeqOneByteStringSet(dest_index++, upper);
1273 }
1274 }
1275
1276 return Handle<String>(*result);
1277 }
1278
1279 upconvert:
1280 return ConvertCaseICU(s, isolate, u_strToUpper);
1281 }
1282
1283 } // namespace
1284 #endif
1285
1286
// Runtime function that lower-cases its single String argument. Each line
// below shows the old revision followed by the new revision of the diff.
// Old revision: always calls ConvertCase() with to_lower_mapping().
// New revision: when V8_I18N_SUPPORT is defined and FLAG_icu_case_mapping is
// set, returns *StringToLowerCase(s, isolate) instead. The `else` that
// precedes `#endif` binds to the ConvertCase() return that follows it, so
// that return is the only path in builds without V8_I18N_SUPPORT.
1082 RUNTIME_FUNCTION(Runtime_StringToLowerCase) { 1287 RUNTIME_FUNCTION(Runtime_StringToLowerCase) {
1083 HandleScope scope(isolate); 1288 HandleScope scope(isolate);
1084 DCHECK(args.length() == 1); 1289 DCHECK_EQ(args.length(), 1);
1085 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1290 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1086 return ConvertCase(s, isolate, isolate->runtime_state()->to_lower_mapping()); 1291 #ifdef V8_I18N_SUPPORT
1292 if (FLAG_icu_case_mapping)
1293 return *StringToLowerCase(s, isolate);
1294 else
1295 #endif
1296 return ConvertCase(s, isolate,
1297 isolate->runtime_state()->to_lower_mapping());
1087 } 1298 }
1088 1299
1089 1300
// Runtime function that upper-cases its single String argument. Each line
// below shows the old revision followed by the new revision of the diff.
// Old revision: always calls ConvertCase() with to_upper_mapping().
// New revision: when V8_I18N_SUPPORT is defined and FLAG_icu_case_mapping is
// set, returns *StringToUpperCase(s, isolate) instead. The `else` that
// precedes `#endif` binds to the ConvertCase() return that follows it, so
// that return is the only path in builds without V8_I18N_SUPPORT.
1090 RUNTIME_FUNCTION(Runtime_StringToUpperCase) { 1301 RUNTIME_FUNCTION(Runtime_StringToUpperCase) {
1091 HandleScope scope(isolate); 1302 HandleScope scope(isolate);
1092 DCHECK(args.length() == 1); 1303 DCHECK_EQ(args.length(), 1);
1093 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1304 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1094 return ConvertCase(s, isolate, isolate->runtime_state()->to_upper_mapping()); 1305 #ifdef V8_I18N_SUPPORT
1095 } 1306 if (FLAG_icu_case_mapping)
1096 1307 return *StringToUpperCase(s, isolate);
1097 1308 else
1309 #endif
1310 return ConvertCase(s, isolate,
1311 isolate->runtime_state()->to_upper_mapping());
1312 }
1313
1314
1098 RUNTIME_FUNCTION(Runtime_StringTrim) { 1315 RUNTIME_FUNCTION(Runtime_StringTrim) {
1099 HandleScope scope(isolate); 1316 HandleScope scope(isolate);
1100 DCHECK(args.length() == 3); 1317 DCHECK(args.length() == 3);
1101 1318
1102 CONVERT_ARG_HANDLE_CHECKED(String, string, 0); 1319 CONVERT_ARG_HANDLE_CHECKED(String, string, 0);
1103 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1); 1320 CONVERT_BOOLEAN_ARG_CHECKED(trimLeft, 1);
1104 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2); 1321 CONVERT_BOOLEAN_ARG_CHECKED(trimRight, 2);
1105 1322
1106 string = String::Flatten(string); 1323 string = String::Flatten(string);
1107 int length = string->length(); 1324 int length = string->length();
(...skipping 211 matching lines...) Expand 10 before | Expand all | Expand 10 after
1319 SealHandleScope shs(isolate); 1536 SealHandleScope shs(isolate);
1320 DCHECK(args.length() == 2); 1537 DCHECK(args.length() == 2);
1321 if (!args[0]->IsString()) return isolate->heap()->undefined_value(); 1538 if (!args[0]->IsString()) return isolate->heap()->undefined_value();
1322 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value(); 1539 if (!args[1]->IsNumber()) return isolate->heap()->undefined_value();
1323 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value(); 1540 if (std::isinf(args.number_at(1))) return isolate->heap()->nan_value();
1324 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate); 1541 return __RT_impl_Runtime_StringCharCodeAtRT(args, isolate);
1325 } 1542 }
1326 1543
1327 } // namespace internal 1544 } // namespace internal
1328 } // namespace v8 1545 } // namespace v8
OLDNEW
« no previous file with comments | « src/flag-definitions.h ('k') | test/intl/general/case-mapping.js » ('j') | test/intl/testcfg.py » ('J')

Powered by Google App Engine
This is Rietveld 408576698