Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(141)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2533983006: Optimize case conversion with icu_case_mapping (Closed)
Patch Set: do not use ASSIGN_RETURN_FAILURE_ON_EXCEPTION in ToUpper (Created 4 years ago)
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include <memory> 9 #include <memory>
10 10
11 #include "src/api-natives.h"
11 #include "src/api.h" 12 #include "src/api.h"
12 #include "src/api-natives.h"
13 #include "src/arguments.h" 13 #include "src/arguments.h"
14 #include "src/factory.h" 14 #include "src/factory.h"
15 #include "src/i18n.h" 15 #include "src/i18n.h"
16 #include "src/isolate-inl.h" 16 #include "src/isolate-inl.h"
17 #include "src/messages.h" 17 #include "src/messages.h"
18 #include "src/utils.h"
18 19
19 #include "unicode/brkiter.h" 20 #include "unicode/brkiter.h"
20 #include "unicode/calendar.h" 21 #include "unicode/calendar.h"
21 #include "unicode/coll.h" 22 #include "unicode/coll.h"
22 #include "unicode/curramt.h" 23 #include "unicode/curramt.h"
23 #include "unicode/datefmt.h" 24 #include "unicode/datefmt.h"
24 #include "unicode/dcfmtsym.h" 25 #include "unicode/dcfmtsym.h"
25 #include "unicode/decimfmt.h" 26 #include "unicode/decimfmt.h"
26 #include "unicode/dtfmtsym.h" 27 #include "unicode/dtfmtsym.h"
27 #include "unicode/dtptngen.h" 28 #include "unicode/dtptngen.h"
(...skipping 1056 matching lines...) Expand 10 before | Expand all | Expand 10 after
1084 1085
1085 } // namespace 1086 } // namespace
1086 1087
1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { 1088 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
1088 HandleScope scope(isolate); 1089 HandleScope scope(isolate);
1089 DCHECK_EQ(args.length(), 1); 1090 DCHECK_EQ(args.length(), 1);
1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1091 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1091 1092
1092 int length = s->length(); 1093 int length = s->length();
1093 s = String::Flatten(s); 1094 s = String::Flatten(s);
1094 // First scan the string for uppercase and non-ASCII characters: 1095
1096 bool is_ascii = true;
1095 if (s->HasOnlyOneByteChars()) { 1097 if (s->HasOnlyOneByteChars()) {
1096 int first_index_to_lower = length; 1098 // Scan the string for uppercase and non-ASCII characters for strings
1097 for (int index = 0; index < length; ++index) { 1099 // shorter than a machine-word without any memory allocation overhead.
Yang 2016/12/05 19:19:29 What is the rationale for doing this only to short
1098 // Blink specializes this path for one-byte strings, so it 1100 int index_to_first_upper = length;
1099 // does not need to do a generic get, but can do the equivalent 1101 if (static_cast<size_t>(length) < sizeof(uintptr_t)) {
1100 // of SeqOneByteStringGet. 1102 for (int index = 0; index < length; ++index) {
1101 uint16_t ch = s->Get(index); 1103 uint16_t ch = s->Get(index);
1102 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { 1104 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1103 first_index_to_lower = index; 1105 is_ascii = !(ch & ~0x7F);
1104 break; 1106 index_to_first_upper = index;
1107 break;
1108 }
1105 } 1109 }
1110 // Nothing to do if the string is all ASCII with no uppercase.
1111 if (index_to_first_upper == length) return *s;
1106 } 1112 }
Yang 2016/12/05 19:19:29 So if the string is longer than a word, we always
1107 1113
1108 // Nothing to do if the string is all ASCII with no uppercase.
1109 if (first_index_to_lower == length) return *s;
1110
1111 // We depend here on the invariant that the length of a Latin1 1114 // We depend here on the invariant that the length of a Latin1
1112 // string is invariant under ToLowerCase, and the result always 1115 // string is invariant under ToLowerCase, and the result always
1113 // fits in the Latin1 range in the *root locale*. It does not hold 1116 // fits in the Latin1 range in the *root locale*. It does not hold
1114 // for ToUpperCase even in the root locale. 1117 // for ToUpperCase even in the root locale.
1115 Handle<SeqOneByteString> result; 1118 Handle<SeqOneByteString> result;
1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1119 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1117 isolate, result, isolate->factory()->NewRawOneByteString(length)); 1120 isolate, result, isolate->factory()->NewRawOneByteString(length));
1118 1121
1119 DisallowHeapAllocation no_gc; 1122 DisallowHeapAllocation no_gc;
1120 String::FlatContent flat = s->GetFlatContent(); 1123 String::FlatContent flat = s->GetFlatContent();
1124 uint8_t* dest = result->GetChars();
1125 // Instead of checking is_ascii here, we'd better modify FastAsciiConvert
Yang 2016/12/05 19:19:29 Is this a TODO?
1126 // to return the index to the first non-ASCII character.
1127 if (flat.IsOneByte() && is_ascii) {
1128 const uint8_t* src = flat.ToOneByteVector().start();
1129 bool has_changed_character = false;
1130 bool is_ascii = FastAsciiConvert<true>(reinterpret_cast<char*>(dest),
1131 reinterpret_cast<const char*>(src),
1132 length, &has_changed_character);
1133 // If not ASCII, we discard the result and start anew.
1134 if (is_ascii) return has_changed_character ? *result : *s;
1135 }
1136
1137 if (index_to_first_upper == length) {
1138 for (int index = 0; index < length; ++index) {
1139 uint16_t ch = s->Get(index);
1140 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1141 index_to_first_upper = index;
1142 break;
1143 }
1144 }
1145 }
1146
1121 if (flat.IsOneByte()) { 1147 if (flat.IsOneByte()) {
1148 // An ASCII input without any uppercase characters is already handled by
1149 // the short-string scanner and FastAsciiConvert.
1150 DCHECK(index_to_first_upper < length);
1122 const uint8_t* src = flat.ToOneByteVector().start(); 1151 const uint8_t* src = flat.ToOneByteVector().start();
1123 CopyChars(result->GetChars(), src, 1152 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));
1124 static_cast<size_t>(first_index_to_lower)); 1153 for (int index = index_to_first_upper; index < length; ++index) {
1125 for (int index = first_index_to_lower; index < length; ++index) {
1126 uint16_t ch = static_cast<uint16_t>(src[index]); 1154 uint16_t ch = static_cast<uint16_t>(src[index]);
1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); 1155 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1128 } 1156 }
1129 } else { 1157 } else {
1158 // Nothing to do if the string is all ASCII with no uppercase.
1159 if (index_to_first_upper == length) return *s;
1130 const uint16_t* src = flat.ToUC16Vector().start(); 1160 const uint16_t* src = flat.ToUC16Vector().start();
1131 CopyChars(result->GetChars(), src, 1161 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));
1132 static_cast<size_t>(first_index_to_lower)); 1162 for (int index = index_to_first_upper; index < length; ++index) {
1133 for (int index = first_index_to_lower; index < length; ++index) {
1134 uint16_t ch = src[index]; 1163 uint16_t ch = src[index];
1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); 1164 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1136 } 1165 }
1137 } 1166 }
1138 1167
1139 return *result; 1168 return *result;
1140 } 1169 }
1141 1170
1142 // Blink had an additional case here for ASCII 2-byte strings, but 1171 // Blink had an additional case here for ASCII 2-byte strings, but
1143 // that is subsumed by the above code (assuming there isn't a false 1172 // that is subsumed by the above code (assuming there isn't a false
1144 // negative for HasOnlyOneByteChars). 1173 // negative for HasOnlyOneByteChars).
1145 1174
1146 // Do a slower implementation for cases that include non-ASCII characters. 1175 // Do a slower implementation for cases that include non-ASCII characters.
1147 return LocaleConvertCase(s, isolate, false, ""); 1176 return LocaleConvertCase(s, isolate, false, "");
1148 } 1177 }
1149 1178
1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { 1179 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1151 HandleScope scope(isolate); 1180 HandleScope scope(isolate);
1152 DCHECK_EQ(args.length(), 1); 1181 DCHECK_EQ(args.length(), 1);
1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 1182 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1154 1183
1155 // This function could be optimized for no-op cases the way lowercase
1156 // counterpart is, but in empirical testing, few actual calls to upper()
1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1158
1159 int32_t length = s->length(); 1184 int32_t length = s->length();
1160 s = String::Flatten(s); 1185 s = String::Flatten(s);
1161 1186
1162 if (s->HasOnlyOneByteChars()) { 1187 if (s->HasOnlyOneByteChars()) {
1188 #if 0
1163 Handle<SeqOneByteString> result; 1189 Handle<SeqOneByteString> result;
1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( 1190 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1165 isolate, result, isolate->factory()->NewRawOneByteString(length)); 1191 isolate, result, isolate->factory()->NewRawOneByteString(length));
1192 #endif
1193 Handle<SeqOneByteString> result =
1194 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
jungshik at Google 2016/12/02 06:52:38 The first part of Runtime_StringToUpperCaseI18N fo
Dan Ehrenberg 2016/12/02 23:35:08 I think we would crash if out of memory, and the o
Yang 2016/12/05 19:19:30 Sounds right to me as well. No need to check for e
1166 1195
1167 int sharp_s_count; 1196 int sharp_s_count;
1168 bool is_result_single_byte; 1197 bool is_result_single_byte;
1169 { 1198 {
1170 DisallowHeapAllocation no_gc; 1199 DisallowHeapAllocation no_gc;
1171 String::FlatContent flat = s->GetFlatContent(); 1200 String::FlatContent flat = s->GetFlatContent();
1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
1173 // could be removed because ToUpperOneByte is pretty fast now (it
1174 // does not call ICU API any more.).
1175 if (flat.IsOneByte()) { 1201 if (flat.IsOneByte()) {
1176 Vector<const uint8_t> src = flat.ToOneByteVector(); 1202 Vector<const uint8_t> src = flat.ToOneByteVector();
1177 if (ToUpperFastASCII(src, result)) return *result; 1203 bool has_changed_character = false;
1204 bool is_ascii =
1205 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
1206 reinterpret_cast<const char*>(src.start()),
1207 length, &has_changed_character);
1208 // If not ASCII, we discard the result and use the table for Latin1.
1209 if (is_ascii) return has_changed_character ? *result : *s;
1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); 1210 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1179 } else { 1211 } else {
1180 DCHECK(flat.IsTwoByte()); 1212 DCHECK(flat.IsTwoByte());
1181 Vector<const uint16_t> src = flat.ToUC16Vector(); 1213 Vector<const uint16_t> src = flat.ToUC16Vector();
1182 if (ToUpperFastASCII(src, result)) return *result; 1214 if (ToUpperFastASCII(src, result)) return *result;
1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); 1215 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1184 } 1216 }
1185 } 1217 }
1186 1218
1187 // Go to the full Unicode path if there are characters whose uppercase 1219 // Go to the full Unicode path if there are characters whose uppercase
(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
1247 Handle<FixedArray> date_cache_version = 1279 Handle<FixedArray> date_cache_version =
1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton( 1280 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(
1249 EternalHandles::DATE_CACHE_VERSION)); 1281 EternalHandles::DATE_CACHE_VERSION));
1250 return date_cache_version->get(0); 1282 return date_cache_version->get(0);
1251 } 1283 }
1252 1284
1253 } // namespace internal 1285 } // namespace internal
1254 } // namespace v8 1286 } // namespace v8
1255 1287
1256 #endif // V8_I18N_SUPPORT 1288 #endif // V8_I18N_SUPPORT
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698