Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(151)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2533983006: Optimize case conversion with icu_case_mapping (Closed)
Patch Set: drop an unused variable: -Wunused-variable | Created 4 years ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/js/i18n.js ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments / Hide Comments ('s')
OLD | NEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include <memory> 9 #include <memory>
10 10
11 #include "src/api-natives.h"
11 #include "src/api.h" 12 #include "src/api.h"
12 #include "src/api-natives.h"
13 #include "src/arguments.h" 13 #include "src/arguments.h"
14 #include "src/factory.h" 14 #include "src/factory.h"
15 #include "src/i18n.h" 15 #include "src/i18n.h"
16 #include "src/isolate-inl.h" 16 #include "src/isolate-inl.h"
17 #include "src/messages.h" 17 #include "src/messages.h"
18 #include "src/string-case.h"
19 #include "src/utils.h"
18 20
19 #include "unicode/brkiter.h" 21 #include "unicode/brkiter.h"
20 #include "unicode/calendar.h" 22 #include "unicode/calendar.h"
21 #include "unicode/coll.h" 23 #include "unicode/coll.h"
22 #include "unicode/curramt.h" 24 #include "unicode/curramt.h"
23 #include "unicode/datefmt.h" 25 #include "unicode/datefmt.h"
24 #include "unicode/dcfmtsym.h" 26 #include "unicode/dcfmtsym.h"
25 #include "unicode/decimfmt.h" 27 #include "unicode/decimfmt.h"
26 #include "unicode/dtfmtsym.h" 28 #include "unicode/dtfmtsym.h"
27 #include "unicode/dtptngen.h" 29 #include "unicode/dtptngen.h"
(...skipping 1006 matching lines...) Expand 10 before | Expand all | Expand 10 after
1034 uint16_t ch = static_cast<uint16_t>(*it); 1036 uint16_t ch = static_cast<uint16_t>(*it);
1035 ored |= ch; 1037 ored |= ch;
1036 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch)); 1038 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
1037 } 1039 }
1038 return !(ored & ~0x7F); 1040 return !(ored & ~0x7F);
1039 } 1041 }
1040 1042
1041 const uint16_t sharp_s = 0xDF; 1043 const uint16_t sharp_s = 0xDF;
1042 1044
1043 template <typename Char> 1045 template <typename Char>
1044 bool ToUpperOneByte(const Vector<const Char>& src, 1046 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
1045 Handle<SeqOneByteString> result, int* sharp_s_count) { 1047 int* sharp_s_count) {
1046 // Still pretty-fast path for the input with non-ASCII Latin-1 characters. 1048 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
1047 1049
1048 // There are two special cases. 1050 // There are two special cases.
1049 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF. 1051 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
1050 // 2. Lower case sharp-S converts to "SS" (two characters) 1052 // 2. Lower case sharp-S converts to "SS" (two characters)
1051 *sharp_s_count = 0; 1053 *sharp_s_count = 0;
1052 int32_t index = 0;
1053 for (auto it = src.begin(); it != src.end(); ++it) { 1054 for (auto it = src.begin(); it != src.end(); ++it) {
1054 uint16_t ch = static_cast<uint16_t>(*it); 1055 uint16_t ch = static_cast<uint16_t>(*it);
1055 if (V8_UNLIKELY(ch == sharp_s)) { 1056 if (V8_UNLIKELY(ch == sharp_s)) {
1056 ++(*sharp_s_count); 1057 ++(*sharp_s_count);
1057 continue; 1058 continue;
1058 } 1059 }
1059 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) { 1060 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
1060 // Since this upper-cased character does not fit in an 8-bit string, we 1061 // Since this upper-cased character does not fit in an 8-bit string, we
1061 // need to take the 16-bit path. 1062 // need to take the 16-bit path.
1062 return false; 1063 return false;
1063 } 1064 }
1064 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch)); 1065 *dest++ = ToLatin1Upper(ch);
1065 } 1066 }
1066 1067
1067 return true; 1068 return true;
1068 } 1069 }
1069 1070
1070 template <typename Char> 1071 template <typename Char>
1071 void ToUpperWithSharpS(const Vector<const Char>& src, 1072 void ToUpperWithSharpS(const Vector<const Char>& src,
1072 Handle<SeqOneByteString> result) { 1073 Handle<SeqOneByteString> result) {
1073 int32_t dest_index = 0; 1074 int32_t dest_index = 0;
1074 for (auto it = src.begin(); it != src.end(); ++it) { 1075 for (auto it = src.begin(); it != src.end(); ++it) {
1075 uint16_t ch = static_cast<uint16_t>(*it); 1076 uint16_t ch = static_cast<uint16_t>(*it);
1076 if (ch == sharp_s) { 1077 if (ch == sharp_s) {
1077 result->SeqOneByteStringSet(dest_index++, 'S'); 1078 result->SeqOneByteStringSet(dest_index++, 'S');
1078 result->SeqOneByteStringSet(dest_index++, 'S'); 1079 result->SeqOneByteStringSet(dest_index++, 'S');
1079 } else { 1080 } else {
1080 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch)); 1081 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
1081 } 1082 }
1082 } 1083 }
1083 } 1084 }
1084 1085
1086 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
1087 for (int index = 0; index < length; ++index) {
1088 uint16_t ch = s->Get(index);
1089 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1090 return index;
1091 }
1092 }
1093 return length;
1094 }
1095
1085 } // namespace 1096 } // namespace
1086 1097
1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { 1098 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
1088 HandleScope scope(isolate); 1099 HandleScope scope(isolate);
1089 DCHECK_EQ(args.length(), 1); 1100 DCHECK_EQ(args.length(), 1);
1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1101 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1091 1102
1092 int length = s->length(); 1103 int length = s->length();
1093 s = String::Flatten(s); 1104 s = String::Flatten(s);
1094 // First scan the string for uppercase and non-ASCII characters:
1095 if (s->HasOnlyOneByteChars()) {
1096 int first_index_to_lower = length;
1097 for (int index = 0; index < length; ++index) {
1098 // Blink specializes this path for one-byte strings, so it
1099 // does not need to do a generic get, but can do the equivalent
1100 // of SeqOneByteStringGet.
1101 uint16_t ch = s->Get(index);
1102 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1103 first_index_to_lower = index;
1104 break;
1105 }
1106 }
1107 1105
1108 // Nothing to do if the string is all ASCII with no uppercase. 1106 if (!s->HasOnlyOneByteChars()) {
1109 if (first_index_to_lower == length) return *s; 1107 // Use a slower implementation for strings with characters beyond U+00FF.
1110 1108 return LocaleConvertCase(s, isolate, false, "");
1111 // We depend here on the invariant that the length of a Latin1
1112 // string is invariant under ToLowerCase, and the result always
1113 // fits in the Latin1 range in the *root locale*. It does not hold
1114 // for ToUpperCase even in the root locale.
1115 Handle<SeqOneByteString> result;
1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1117 isolate, result, isolate->factory()->NewRawOneByteString(length));
1118
1119 DisallowHeapAllocation no_gc;
1120 String::FlatContent flat = s->GetFlatContent();
1121 if (flat.IsOneByte()) {
1122 const uint8_t* src = flat.ToOneByteVector().start();
1123 CopyChars(result->GetChars(), src,
1124 static_cast<size_t>(first_index_to_lower));
1125 for (int index = first_index_to_lower; index < length; ++index) {
1126 uint16_t ch = static_cast<uint16_t>(src[index]);
1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1128 }
1129 } else {
1130 const uint16_t* src = flat.ToUC16Vector().start();
1131 CopyChars(result->GetChars(), src,
1132 static_cast<size_t>(first_index_to_lower));
1133 for (int index = first_index_to_lower; index < length; ++index) {
1134 uint16_t ch = src[index];
1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1136 }
1137 }
1138
1139 return *result;
1140 } 1109 }
1141 1110
1142 // Blink had an additional case here for ASCII 2-byte strings, but 1111 // We depend here on the invariant that the length of a Latin1
1143 // that is subsumed by the above code (assuming there isn't a false 1112 // string is invariant under ToLowerCase, and the result always
1144 // negative for HasOnlyOneByteChars). 1113 // fits in the Latin1 range in the *root locale*. It does not hold
1114 // for ToUpperCase even in the root locale.
1145 1115
1146 // Do a slower implementation for cases that include non-ASCII characters. 1116 // Scan the string for uppercase and non-ASCII characters for strings
1147 return LocaleConvertCase(s, isolate, false, ""); 1117 // shorter than a machine-word without any memory allocation overhead.
1118 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
1119 // to two parts, one for scanning the prefix with no change and the other for
1120 // handling ASCII-only characters.
1121 int index_to_first_unprocessed = length;
1122 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
1123 if (is_short) {
1124 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1125 // Nothing to do if the string is all ASCII with no uppercase.
1126 if (index_to_first_unprocessed == length) return *s;
1127 }
1128
1129 Handle<SeqOneByteString> result =
1130 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1131
1132 DisallowHeapAllocation no_gc;
1133 String::FlatContent flat = s->GetFlatContent();
1134 uint8_t* dest = result->GetChars();
1135 if (flat.IsOneByte()) {
1136 const uint8_t* src = flat.ToOneByteVector().start();
1137 bool has_changed_character = false;
1138 index_to_first_unprocessed = FastAsciiConvert<true>(
1139 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
1140 length, &has_changed_character);
1141 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1142 // process the rest.
1143 if (index_to_first_unprocessed == length)
1144 return has_changed_character ? *result : *s;
1145
1146 for (int index = index_to_first_unprocessed; index < length; ++index) {
1147 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1148 }
1149 } else {
1150 if (index_to_first_unprocessed == length) {
1151 DCHECK(!is_short);
1152 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1153 }
1154 // Nothing to do if the string is all ASCII with no uppercase.
1155 if (index_to_first_unprocessed == length) return *s;
1156 const uint16_t* src = flat.ToUC16Vector().start();
1157 CopyChars(dest, src, index_to_first_unprocessed);
1158 for (int index = index_to_first_unprocessed; index < length; ++index) {
1159 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1160 }
1161 }
1162
1163 return *result;
1148 } 1164 }
1149 1165
1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { 1166 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1151 HandleScope scope(isolate); 1167 HandleScope scope(isolate);
1152 DCHECK_EQ(args.length(), 1); 1168 DCHECK_EQ(args.length(), 1);
1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1169 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1154 1170
1155 // This function could be optimized for no-op cases the way lowercase
1156 // counterpart is, but in empirical testing, few actual calls to upper()
1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1158
1159 int32_t length = s->length(); 1171 int32_t length = s->length();
1160 s = String::Flatten(s); 1172 s = String::Flatten(s);
1161 1173
1162 if (s->HasOnlyOneByteChars()) { 1174 if (s->HasOnlyOneByteChars()) {
1163 Handle<SeqOneByteString> result; 1175 Handle<SeqOneByteString> result =
1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1176 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1165 isolate, result, isolate->factory()->NewRawOneByteString(length));
1166 1177
1167 int sharp_s_count; 1178 int sharp_s_count;
1168 bool is_result_single_byte; 1179 bool is_result_single_byte;
1169 { 1180 {
1170 DisallowHeapAllocation no_gc; 1181 DisallowHeapAllocation no_gc;
1171 String::FlatContent flat = s->GetFlatContent(); 1182 String::FlatContent flat = s->GetFlatContent();
1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII 1183 uint8_t* dest = result->GetChars();
1173 // could be removed because ToUpperOneByte is pretty fast now (it
1174 // does not call ICU API any more.).
1175 if (flat.IsOneByte()) { 1184 if (flat.IsOneByte()) {
1176 Vector<const uint8_t> src = flat.ToOneByteVector(); 1185 Vector<const uint8_t> src = flat.ToOneByteVector();
1177 if (ToUpperFastASCII(src, result)) return *result; 1186 bool has_changed_character = false;
1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); 1187 int index_to_first_unprocessed =
1188 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
1189 reinterpret_cast<const char*>(src.start()),
1190 length, &has_changed_character);
1191 if (index_to_first_unprocessed == length)
1192 return has_changed_character ? *result : *s;
1193 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1194 // process the rest.
1195 is_result_single_byte =
1196 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
1197 dest + index_to_first_unprocessed, &sharp_s_count);
1179 } else { 1198 } else {
1180 DCHECK(flat.IsTwoByte()); 1199 DCHECK(flat.IsTwoByte());
1181 Vector<const uint16_t> src = flat.ToUC16Vector(); 1200 Vector<const uint16_t> src = flat.ToUC16Vector();
1182 if (ToUpperFastASCII(src, result)) return *result; 1201 if (ToUpperFastASCII(src, result)) return *result;
1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); 1202 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
1184 } 1203 }
1185 } 1204 }
1186 1205
1187 // Go to the full Unicode path if there are characters whose uppercase 1206 // Go to the full Unicode path if there are characters whose uppercase
1188 // is beyond the Latin-1 range (cannot be represented in OneByteString). 1207 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1189 if (V8_UNLIKELY(!is_result_single_byte)) { 1208 if (V8_UNLIKELY(!is_result_single_byte)) {
1190 return LocaleConvertCase(s, isolate, true, ""); 1209 return LocaleConvertCase(s, isolate, true, "");
1191 } 1210 }
1192 1211
1193 if (sharp_s_count == 0) return *result; 1212 if (sharp_s_count == 0) return *result;
(...skipping 53 matching lines...) Expand 10 before | Expand all | Expand 10 after
1247 Handle<FixedArray> date_cache_version = 1266 Handle<FixedArray> date_cache_version =
1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton( 1267 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(
1249 EternalHandles::DATE_CACHE_VERSION)); 1268 EternalHandles::DATE_CACHE_VERSION));
1250 return date_cache_version->get(0); 1269 return date_cache_version->get(0);
1251 } 1270 }
1252 1271
1253 } // namespace internal 1272 } // namespace internal
1254 } // namespace v8 1273 } // namespace v8
1255 1274
1256 #endif // V8_I18N_SUPPORT 1275 #endif // V8_I18N_SUPPORT
OLD | NEW
« no previous file with comments | « src/js/i18n.js ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698