src/runtime/runtime-i18n.cc - Issue 2533983006: Optimize case conversion with icu_case_mapping

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2533983006: Optimize case conversion with icu_case_mapping (Closed)

Patch Set: do not use ASSIGN_RETURN_FAILURE_ON_EXCEPTION in ToUpper Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include <memory>	9 #include <memory>

10	10

	11 #include "src/api-natives.h"

11 #include "src/api.h"	12 #include "src/api.h"

12 #include "src/api-natives.h"

13 #include "src/arguments.h"	13 #include "src/arguments.h"

14 #include "src/factory.h"	14 #include "src/factory.h"

15 #include "src/i18n.h"	15 #include "src/i18n.h"

16 #include "src/isolate-inl.h"	16 #include "src/isolate-inl.h"

17 #include "src/messages.h"	17 #include "src/messages.h"

	18 #include "src/utils.h"

18	19

19 #include "unicode/brkiter.h"	20 #include "unicode/brkiter.h"

20 #include "unicode/calendar.h"	21 #include "unicode/calendar.h"

21 #include "unicode/coll.h"	22 #include "unicode/coll.h"

22 #include "unicode/curramt.h"	23 #include "unicode/curramt.h"

23 #include "unicode/datefmt.h"	24 #include "unicode/datefmt.h"

24 #include "unicode/dcfmtsym.h"	25 #include "unicode/dcfmtsym.h"

25 #include "unicode/decimfmt.h"	26 #include "unicode/decimfmt.h"

26 #include "unicode/dtfmtsym.h"	27 #include "unicode/dtfmtsym.h"

27 #include "unicode/dtptngen.h"	28 #include "unicode/dtptngen.h"

(...skipping 1056 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1084	1085

1085 } // namespace	1086 } // namespace

1086	1087

1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {	1088 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

1088 HandleScope scope(isolate);	1089 HandleScope scope(isolate);

1089 DCHECK_EQ(args.length(), 1);	1090 DCHECK_EQ(args.length(), 1);

1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1091 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1091	1092

1092 int length = s->length();	1093 int length = s->length();

1093 s = String::Flatten(s);	1094 s = String::Flatten(s);

1094 // First scan the string for uppercase and non-ASCII characters:	1095

	1096 bool is_ascii = true;

1095 if (s->HasOnlyOneByteChars()) {	1097 if (s->HasOnlyOneByteChars()) {

1096 int first_index_to_lower = length;	1098 // Scan the string for uppercase and non-ASCII characters for strings

1097 for (int index = 0; index < length; ++index) {	1099 // shorter than a machine-word without any memory allocation overhead.
	Yang 2016/12/05 19:19:29 What is the rationale for doing this only to short strings? Maybe elaborate a bit more?
1098 // Blink specializes this path for one-byte strings, so it	1100 int index_to_first_upper = length;

1099 // does not need to do a generic get, but can do the equivalent	1101 if (static_cast<size_t>(length) < sizeof(uintptr_t)) {

1100 // of SeqOneByteStringGet.	1102 for (int index = 0; index < length; ++index) {

1101 uint16_t ch = s->Get(index);	1103 uint16_t ch = s->Get(index);

1102 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {	1104 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

1103 first_index_to_lower = index;	1105 is_ascii = !(ch & ~0x7F);

1104 break;	1106 index_to_first_upper = index;

	1107 break;

	1108 }

1105 }	1109 }

	1110 // Nothing to do if the string is all ASCII with no uppercase.

	1111 if (index_to_first_upper == length) return *s;

1106 }	1112 }
	Yang 2016/12/05 19:19:29 So if the string is longer than a word, we always end up here with index_to_first_upper == length. This is somewhat misleading when reading this code.
1107	1113

1108 // Nothing to do if the string is all ASCII with no uppercase.

1109 if (first_index_to_lower == length) return *s;

1110

1111 // We depend here on the invariant that the length of a Latin1	1114 // We depend here on the invariant that the length of a Latin1

1112 // string is invariant under ToLowerCase, and the result always	1115 // string is invariant under ToLowerCase, and the result always

1113 // fits in the Latin1 range in the root locale. It does not hold	1116 // fits in the Latin1 range in the root locale. It does not hold

1114 // for ToUpperCase even in the root locale.	1117 // for ToUpperCase even in the root locale.

1115 Handle<SeqOneByteString> result;	1118 Handle<SeqOneByteString> result;

1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1119 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1117 isolate, result, isolate->factory()->NewRawOneByteString(length));	1120 isolate, result, isolate->factory()->NewRawOneByteString(length));

1118	1121

1119 DisallowHeapAllocation no_gc;	1122 DisallowHeapAllocation no_gc;

1120 String::FlatContent flat = s->GetFlatContent();	1123 String::FlatContent flat = s->GetFlatContent();

	1124 uint8_t* dest = result->GetChars();

	1125 // Instead of checking is_ascii here, we'd better modify FastAsciiConvert
	Yang 2016/12/05 19:19:29 Is this a TODO?
	1126 // to return the index to the first non-ASCII character.

	1127 if (flat.IsOneByte() && is_ascii) {

	1128 const uint8_t* src = flat.ToOneByteVector().start();

	1129 bool has_changed_character = false;

	1130 bool is_ascii = FastAsciiConvert<true>(reinterpret_cast<char*>(dest),

	1131 reinterpret_cast<const char*>(src),

	1132 length, &has_changed_character);

	1133 // If not ASCII, we discard the result and start anew.

	1134 if (is_ascii) return has_changed_character ? result : s;

	1135 }

	1136

	1137 if (index_to_first_upper == length) {

	1138 for (int index = 0; index < length; ++index) {

	1139 uint16_t ch = s->Get(index);

	1140 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	1141 index_to_first_upper = index;

	1142 break;

	1143 }

	1144 }

	1145 }

	1146

1121 if (flat.IsOneByte()) {	1147 if (flat.IsOneByte()) {

	1148 // An ASCII input without any uppercase characters is already handled by

	1149 // the short-string scanner and FastAsciiConvert.

	1150 DCHECK(index_to_first_upper < length);

1122 const uint8_t* src = flat.ToOneByteVector().start();	1151 const uint8_t* src = flat.ToOneByteVector().start();

1123 CopyChars(result->GetChars(), src,	1152 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

1124 static_cast<size_t>(first_index_to_lower));	1153 for (int index = index_to_first_upper; index < length; ++index) {

1125 for (int index = first_index_to_lower; index < length; ++index) {

1126 uint16_t ch = static_cast<uint16_t>(src[index]);	1154 uint16_t ch = static_cast<uint16_t>(src[index]);

1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));	1155 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1128 }	1156 }

1129 } else {	1157 } else {

	1158 // Nothing to do if the string is all ASCII with no uppercase.

	1159 if (index_to_first_upper == length) return *s;

1130 const uint16_t* src = flat.ToUC16Vector().start();	1160 const uint16_t* src = flat.ToUC16Vector().start();

1131 CopyChars(result->GetChars(), src,	1161 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

1132 static_cast<size_t>(first_index_to_lower));	1162 for (int index = index_to_first_upper; index < length; ++index) {

1133 for (int index = first_index_to_lower; index < length; ++index) {

1134 uint16_t ch = src[index];	1163 uint16_t ch = src[index];

1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));	1164 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1136 }	1165 }

1137 }	1166 }

1138	1167

1139 return *result;	1168 return *result;

1140 }	1169 }

1141	1170

1142 // Blink had an additional case here for ASCII 2-byte strings, but	1171 // Blink had an additional case here for ASCII 2-byte strings, but

1143 // that is subsumed by the above code (assuming there isn't a false	1172 // that is subsumed by the above code (assuming there isn't a false

1144 // negative for HasOnlyOneByteChars).	1173 // negative for HasOnlyOneByteChars).

1145	1174

1146 // Do a slower implementation for cases that include non-ASCII characters.	1175 // Do a slower implementation for cases that include non-ASCII characters.

1147 return LocaleConvertCase(s, isolate, false, "");	1176 return LocaleConvertCase(s, isolate, false, "");

1148 }	1177 }

1149	1178

1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {	1179 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

1151 HandleScope scope(isolate);	1180 HandleScope scope(isolate);

1152 DCHECK_EQ(args.length(), 1);	1181 DCHECK_EQ(args.length(), 1);

1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1182 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1154	1183

1155 // This function could be optimized for no-op cases the way lowercase

1156 // counterpart is, but in empirical testing, few actual calls to upper()

1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

1158

1159 int32_t length = s->length();	1184 int32_t length = s->length();

1160 s = String::Flatten(s);	1185 s = String::Flatten(s);

1161	1186

1162 if (s->HasOnlyOneByteChars()) {	1187 if (s->HasOnlyOneByteChars()) {

	1188 #if 0

1163 Handle<SeqOneByteString> result;	1189 Handle<SeqOneByteString> result;

1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1190 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1165 isolate, result, isolate->factory()->NewRawOneByteString(length));	1191 isolate, result, isolate->factory()->NewRawOneByteString(length));

	1192 #endif

	1193 Handle<SeqOneByteString> result =

	1194 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
	jungshik at Google 2016/12/02 06:52:38 The first part of Runtime_StringToUpperCaseI18N for pure ASCII input is almost identical to the corresponding part in ConvertCase() in runtime-strings.cc. The only difference is ASSIGN_RETURN_FAILURE_ON_EXCEPTION vs directly calling \|isolate->factory()->NewRawOneByte....\|. Somehow, that does make a difference in performance. When I directly use NewRawOneByte.... (without checking for exception), the perf for pure ASCII input got on par with Unibrow. Otherwise, i18n code is about one standard deviation slower than Unibrow. Is it ok NOT to use ASSIGN_RETURN_FAILURE_ON_EXCEPTION()? How can ConvertCase() in runtime-strings.cc get away with it? Dan Ehrenberg 2016/12/02 23:35:08 On 2016/12/02 06:52:38, jungshik at google wrote: > The first part of Runtime_StringToUpperCaseI18N for pure ASCII input is almost > identical to > the corresponding part in ConvertCase() in runtime-strings.cc. > > The only difference is ASSIGN_RETURN_FAILURE_ON_EXCEPTION vs directly calling > \|isolate->factory()->NewRawOneByte....\|. Somehow, that does make a difference > in performance. When I directly use NewRawOneByte.... (without checking for > exception), the perf for pure ASCII input got on par with Unibrow. Otherwise, > i18n code is about one standard deviation slower than Unibrow. > > Is it ok NOT to use ASSIGN_RETURN_FAILURE_ON_EXCEPTION()? How can ConvertCase() > in runtime-strings.cc get away with it? I think we would crash if out of memory, and the only way it would throw an exception is if the length is invalid. In this case, we know the length is valid because it is the length of another string. I see tons of instances of line 1194 in src/factory.cc, so I think it's OK here too, but I can understand authors who would want to use ASSIGN_RETURN_FAILURE_ON_EXCEPTION here because it's more obviously correct. Yang, any other thoughts?
Yang 2016/12/05 19:19:30 On 2016/12/02 23:35:08, Dan Ehrenberg wrote: > On 2016/12/02 06:52:38, jungshik at google wrote: > > The first part of Runtime_StringToUpperCaseI18N for pure ASCII input is almost > > identical to > > the corresponding part in ConvertCase() in runtime-strings.cc. > > > > The only difference is ASSIGN_RETURN_FAILURE_ON_EXCEPTION vs directly calling > > \|isolate->factory()->NewRawOneByte....\|. Somehow, that does make a > difference > > in performance. When I directly use NewRawOneByte.... (without checking for > > exception), the perf for pure ASCII input got on par with Unibrow. Otherwise, > > i18n code is about one standard deviation slower than Unibrow. > > > > Is it ok NOT to use ASSIGN_RETURN_FAILURE_ON_EXCEPTION()? How can > ConvertCase() > > in runtime-strings.cc get away with it? > > I think we would crash if out of memory, and the only way it would throw an > exception is if the length is invalid. In this case, we know the length is valid > because it is the length of another string. I see tons of instances of line 1194 > in src/factory.cc, so I think it's OK here too, but I can understand authors who > would want to use ASSIGN_RETURN_FAILURE_ON_EXCEPTION here because it's more > obviously correct. Yang, any other thoughts? Sounds right to me as well. No need to check for exceptions.
1166	1195

1167 int sharp_s_count;	1196 int sharp_s_count;

1168 bool is_result_single_byte;	1197 bool is_result_single_byte;

1169 {	1198 {

1170 DisallowHeapAllocation no_gc;	1199 DisallowHeapAllocation no_gc;

1171 String::FlatContent flat = s->GetFlatContent();	1200 String::FlatContent flat = s->GetFlatContent();

1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

1173 // could be removed because ToUpperOneByte is pretty fast now (it

1174 // does not call ICU API any more.).

1175 if (flat.IsOneByte()) {	1201 if (flat.IsOneByte()) {

1176 Vector<const uint8_t> src = flat.ToOneByteVector();	1202 Vector<const uint8_t> src = flat.ToOneByteVector();

1177 if (ToUpperFastASCII(src, result)) return *result;	1203 bool has_changed_character = false;

	1204 bool is_ascii =

	1205 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

	1206 reinterpret_cast<const char*>(src.start()),

	1207 length, &has_changed_character);

	1208 // If not ASCII, we discard the result and use the table for Latin1.

	1209 if (is_ascii) return has_changed_character ? result : s;

1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1210 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

1179 } else {	1211 } else {

1180 DCHECK(flat.IsTwoByte());	1212 DCHECK(flat.IsTwoByte());

1181 Vector<const uint16_t> src = flat.ToUC16Vector();	1213 Vector<const uint16_t> src = flat.ToUC16Vector();

1182 if (ToUpperFastASCII(src, result)) return *result;	1214 if (ToUpperFastASCII(src, result)) return *result;

1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1215 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

1184 }	1216 }

1185 }	1217 }

1186	1218

1187 // Go to the full Unicode path if there are characters whose uppercase	1219 // Go to the full Unicode path if there are characters whose uppercase

(...skipping 59 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1247 Handle<FixedArray> date_cache_version =	1279 Handle<FixedArray> date_cache_version =

1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(	1280 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(

1249 EternalHandles::DATE_CACHE_VERSION));	1281 EternalHandles::DATE_CACHE_VERSION));

1250 return date_cache_version->get(0);	1282 return date_cache_version->get(0);

1251 }	1283 }

1252	1284

1253 } // namespace internal	1285 } // namespace internal

1254 } // namespace v8	1286 } // namespace v8

1255	1287

1256 #endif // V8_I18N_SUPPORT	1288 #endif // V8_I18N_SUPPORT

OLD	NEW

« src/js/i18n.js ('K') | « src/js/i18n.js ('k') | src/runtime/runtime-strings.cc » ('j') | src/runtime/runtime-utils.cc » ('J')