src/runtime/runtime-i18n.cc - Issue 2533983006: Optimize case conversion with icu_case_mapping

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2533983006: Optimize case conversion with icu_case_mapping (Closed)

Patch Set: update comments with TODO Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include <memory>	9 #include <memory>

10	10

	11 #include "src/api-natives.h"

11 #include "src/api.h"	12 #include "src/api.h"

12 #include "src/api-natives.h"

13 #include "src/arguments.h"	13 #include "src/arguments.h"

14 #include "src/factory.h"	14 #include "src/factory.h"

15 #include "src/i18n.h"	15 #include "src/i18n.h"

16 #include "src/isolate-inl.h"	16 #include "src/isolate-inl.h"

17 #include "src/messages.h"	17 #include "src/messages.h"

	18 #include "src/string-case.h"

	19 #include "src/utils.h"

18	20

19 #include "unicode/brkiter.h"	21 #include "unicode/brkiter.h"

20 #include "unicode/calendar.h"	22 #include "unicode/calendar.h"

21 #include "unicode/coll.h"	23 #include "unicode/coll.h"

22 #include "unicode/curramt.h"	24 #include "unicode/curramt.h"

23 #include "unicode/datefmt.h"	25 #include "unicode/datefmt.h"

24 #include "unicode/dcfmtsym.h"	26 #include "unicode/dcfmtsym.h"

25 #include "unicode/decimfmt.h"	27 #include "unicode/decimfmt.h"

26 #include "unicode/dtfmtsym.h"	28 #include "unicode/dtfmtsym.h"

27 #include "unicode/dtptngen.h"	29 #include "unicode/dtptngen.h"

(...skipping 1006 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1034 uint16_t ch = static_cast<uint16_t>(*it);	1036 uint16_t ch = static_cast<uint16_t>(*it);

1035 ored \|= ch;	1037 ored \|= ch;

1036 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));	1038 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

1037 }	1039 }

1038 return !(ored & ~0x7F);	1040 return !(ored & ~0x7F);

1039 }	1041 }

1040	1042

1041 const uint16_t sharp_s = 0xDF;	1043 const uint16_t sharp_s = 0xDF;

1042	1044

1043 template <typename Char>	1045 template <typename Char>

1044 bool ToUpperOneByte(const Vector<const Char>& src,	1046 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,

1045 Handle<SeqOneByteString> result, int* sharp_s_count) {	1047 int* sharp_s_count) {
	Dan Ehrenberg 2016/12/15 19:24:25 Using a pointer rather than a handle seems valid s Using a pointer rather than a handle seems valid since it's within a DisallowHeapAllocation, but was this necessary for performance or something? jungshik at Google 2016/12/16 00:37:56 This change is triggered by a change below (call-s Show quoted text On 2016/12/15 19:24:25, Dan Ehrenberg wrote: > Using a pointer rather than a handle seems valid since it's within a > DisallowHeapAllocation, but was this necessary for performance or something? This change is triggered by a change below (call-site; see lines 1194 ~ 1198). Up to PS 14, on coming across non-ASCII characters, the result up to that point was discarded in ToUpper*. With the change in lines 1194 ~ 1198, I don't throw it away any more. That requires ToUpperOneByte to take a pointer to the middle of a string (the beginning of the unprocessed part of an input) instead of a whole string. That also has a side-benefit of line 1066 being simpler.
1046 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.	1048 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

1047	1049

1048 // There are two special cases.	1050 // There are two special cases.

1049 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.	1051 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

1050 // 2. Lower case sharp-S converts to "SS" (two characters)	1052 // 2. Lower case sharp-S converts to "SS" (two characters)

1051 *sharp_s_count = 0;	1053 *sharp_s_count = 0;

1052 int32_t index = 0;	1054 int32_t index = 0;

1053 for (auto it = src.begin(); it != src.end(); ++it) {	1055 for (auto it = src.begin(); it != src.end(); ++it) {

1054 uint16_t ch = static_cast<uint16_t>(*it);	1056 uint16_t ch = static_cast<uint16_t>(*it);

1055 if (V8_UNLIKELY(ch == sharp_s)) {	1057 if (V8_UNLIKELY(ch == sharp_s)) {

1056 ++(*sharp_s_count);	1058 ++(*sharp_s_count);

1057 continue;	1059 continue;

1058 }	1060 }

1059 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {	1061 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {

1060 // Since this upper-cased character does not fit in an 8-bit string, we	1062 // Since this upper-cased character does not fit in an 8-bit string, we

1061 // need to take the 16-bit path.	1063 // need to take the 16-bit path.

1062 return false;	1064 return false;

1063 }	1065 }

1064 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));	1066 *dest++ = ToLatin1Upper(ch);

1065 }	1067 }

1066	1068

1067 return true;	1069 return true;

1068 }	1070 }

1069	1071

1070 template <typename Char>	1072 template <typename Char>

1071 void ToUpperWithSharpS(const Vector<const Char>& src,	1073 void ToUpperWithSharpS(const Vector<const Char>& src,

1072 Handle<SeqOneByteString> result) {	1074 Handle<SeqOneByteString> result) {

1073 int32_t dest_index = 0;	1075 int32_t dest_index = 0;

1074 for (auto it = src.begin(); it != src.end(); ++it) {	1076 for (auto it = src.begin(); it != src.end(); ++it) {

1075 uint16_t ch = static_cast<uint16_t>(*it);	1077 uint16_t ch = static_cast<uint16_t>(*it);

1076 if (ch == sharp_s) {	1078 if (ch == sharp_s) {

1077 result->SeqOneByteStringSet(dest_index++, 'S');	1079 result->SeqOneByteStringSet(dest_index++, 'S');

1078 result->SeqOneByteStringSet(dest_index++, 'S');	1080 result->SeqOneByteStringSet(dest_index++, 'S');

1079 } else {	1081 } else {

1080 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));	1082 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

1081 }	1083 }

1082 }	1084 }

1083 }	1085 }

1084	1086

	1087 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {

	1088 for (int index = 0; index < length; ++index) {

	1089 uint16_t ch = s->Get(index);

	1090 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

	1091 return index;

	1092 }

	1093 }

	1094 return length;

	1095 }

	1096

1085 } // namespace	1097 } // namespace

1086	1098

1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {	1099 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

1088 HandleScope scope(isolate);	1100 HandleScope scope(isolate);

1089 DCHECK_EQ(args.length(), 1);	1101 DCHECK_EQ(args.length(), 1);

1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1102 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1091	1103

1092 int length = s->length();	1104 int length = s->length();

1093 s = String::Flatten(s);	1105 s = String::Flatten(s);

1094 // First scan the string for uppercase and non-ASCII characters:

1095 if (s->HasOnlyOneByteChars()) {

1096 int first_index_to_lower = length;

1097 for (int index = 0; index < length; ++index) {

1098 // Blink specializes this path for one-byte strings, so it

1099 // does not need to do a generic get, but can do the equivalent

1100 // of SeqOneByteStringGet.

1101 uint16_t ch = s->Get(index);

1102 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

1103 first_index_to_lower = index;

1104 break;

1105 }

1106 }

1107	1106

1108 // Nothing to do if the string is all ASCII with no uppercase.	1107 if (!s->HasOnlyOneByteChars()) {

1109 if (first_index_to_lower == length) return *s;	1108 // Use a slower implementation for strings with characters beyond U+00FF.

1110	1109 return LocaleConvertCase(s, isolate, false, "");

1111 // We depend here on the invariant that the length of a Latin1

1112 // string is invariant under ToLowerCase, and the result always

1113 // fits in the Latin1 range in the root locale. It does not hold

1114 // for ToUpperCase even in the root locale.

1115 Handle<SeqOneByteString> result;

1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1117 isolate, result, isolate->factory()->NewRawOneByteString(length));

1118

1119 DisallowHeapAllocation no_gc;

1120 String::FlatContent flat = s->GetFlatContent();

1121 if (flat.IsOneByte()) {

1122 const uint8_t* src = flat.ToOneByteVector().start();

1123 CopyChars(result->GetChars(), src,

1124 static_cast<size_t>(first_index_to_lower));

1125 for (int index = first_index_to_lower; index < length; ++index) {

1126 uint16_t ch = static_cast<uint16_t>(src[index]);

1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1128 }

1129 } else {

1130 const uint16_t* src = flat.ToUC16Vector().start();

1131 CopyChars(result->GetChars(), src,

1132 static_cast<size_t>(first_index_to_lower));

1133 for (int index = first_index_to_lower; index < length; ++index) {

1134 uint16_t ch = src[index];

1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1136 }

1137 }

1138

1139 return *result;

1140 }	1110 }

1141	1111

1142 // Blink had an additional case here for ASCII 2-byte strings, but	1112 // We depend here on the invariant that the length of a Latin1

1143 // that is subsumed by the above code (assuming there isn't a false	1113 // string is invariant under ToLowerCase, and the result always

1144 // negative for HasOnlyOneByteChars).	1114 // fits in the Latin1 range in the root locale. It does not hold

	1115 // for ToUpperCase even in the root locale.

1145	1116

1146 // Do a slower implementation for cases that include non-ASCII characters.	1117 // Scan the string for uppercase and non-ASCII characters for strings

1147 return LocaleConvertCase(s, isolate, false, "");	1118 // shorter than a machine-word without any memory allocation overhead.

	1119 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()

	1120 // to two parts, one for scanning the prefix with no change and the other for

	1121 // handling ASCII-only characters.

	1122 int index_to_first_unprocessed = length;

	1123 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));

	1124 if (is_short) {

	1125 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

	1126 // Nothing to do if the string is all ASCII with no uppercase.

	1127 if (index_to_first_unprocessed == length) return *s;

	1128 }

	1129

	1130 Handle<SeqOneByteString> result =

	1131 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1132

	1133 DisallowHeapAllocation no_gc;

	1134 String::FlatContent flat = s->GetFlatContent();

	1135 uint8_t* dest = result->GetChars();

	1136 if (flat.IsOneByte()) {

	1137 const uint8_t* src = flat.ToOneByteVector().start();

	1138 bool has_changed_character = false;

	1139 index_to_first_unprocessed = FastAsciiConvert<true>(

	1140 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),

	1141 length, &has_changed_character);

	1142 // If not ASCII, we keep the result up to index_to_first_unprocessed and

	1143 // process the rest.

	1144 if (index_to_first_unprocessed == length)

	1145 return has_changed_character ? *result : *s;

	1146

	1147 for (int index = index_to_first_unprocessed; index < length; ++index) {

	1148 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

	1149 }

	1150 } else {

	1151 if (index_to_first_unprocessed == length) {

	1152 DCHECK(!is_short);

	1153 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

	1154 }

	1155 // Nothing to do if the string is all ASCII with no uppercase.

	1156 if (index_to_first_unprocessed == length) return *s;

	1157 const uint16_t* src = flat.ToUC16Vector().start();

	1158 CopyChars(dest, src, index_to_first_unprocessed);

	1159 for (int index = index_to_first_unprocessed; index < length; ++index) {

	1160 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

	1161 }

	1162 }

	1163

	1164 return *result;

1148 }	1165 }

1149	1166

1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {	1167 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

1151 HandleScope scope(isolate);	1168 HandleScope scope(isolate);

1152 DCHECK_EQ(args.length(), 1);	1169 DCHECK_EQ(args.length(), 1);

1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1170 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1154	1171

1155 // This function could be optimized for no-op cases the way lowercase

1156 // counterpart is, but in empirical testing, few actual calls to upper()

1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

1158

1159 int32_t length = s->length();	1172 int32_t length = s->length();

1160 s = String::Flatten(s);	1173 s = String::Flatten(s);

1161	1174

1162 if (s->HasOnlyOneByteChars()) {	1175 if (s->HasOnlyOneByteChars()) {

1163 Handle<SeqOneByteString> result;	1176 Handle<SeqOneByteString> result =

1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1177 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

1165 isolate, result, isolate->factory()->NewRawOneByteString(length));

1166	1178

1167 int sharp_s_count;	1179 int sharp_s_count;

1168 bool is_result_single_byte;	1180 bool is_result_single_byte;

1169 {	1181 {

1170 DisallowHeapAllocation no_gc;	1182 DisallowHeapAllocation no_gc;

1171 String::FlatContent flat = s->GetFlatContent();	1183 String::FlatContent flat = s->GetFlatContent();

1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII	1184 uint8_t* dest = result->GetChars();

1173 // could be removed because ToUpperOneByte is pretty fast now (it

1174 // does not call ICU API any more.).

1175 if (flat.IsOneByte()) {	1185 if (flat.IsOneByte()) {

1176 Vector<const uint8_t> src = flat.ToOneByteVector();	1186 Vector<const uint8_t> src = flat.ToOneByteVector();

1177 if (ToUpperFastASCII(src, result)) return *result;	1187 bool has_changed_character = false;

1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1188 int index_to_first_unprocessed =

	1189 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

	1190 reinterpret_cast<const char*>(src.start()),

	1191 length, &has_changed_character);

	1192 if (index_to_first_unprocessed == length)

	1193 return has_changed_character ? *result : *s;

	1194 // If not ASCII, we keep the result up to index_to_first_unprocessed and

	1195 // process the rest.

	1196 is_result_single_byte =

	1197 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),

	1198 dest + index_to_first_unprocessed, &sharp_s_count);

1179 } else {	1199 } else {

1180 DCHECK(flat.IsTwoByte());	1200 DCHECK(flat.IsTwoByte());

1181 Vector<const uint16_t> src = flat.ToUC16Vector();	1201 Vector<const uint16_t> src = flat.ToUC16Vector();

1182 if (ToUpperFastASCII(src, result)) return *result;	1202 if (ToUpperFastASCII(src, result)) return *result;

1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1203 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);

1184 }	1204 }

1185 }	1205 }

1186	1206

1187 // Go to the full Unicode path if there are characters whose uppercase	1207 // Go to the full Unicode path if there are characters whose uppercase

1188 // is beyond the Latin-1 range (cannot be represented in OneByteString).	1208 // is beyond the Latin-1 range (cannot be represented in OneByteString).

1189 if (V8_UNLIKELY(!is_result_single_byte)) {	1209 if (V8_UNLIKELY(!is_result_single_byte)) {

1190 return LocaleConvertCase(s, isolate, true, "");	1210 return LocaleConvertCase(s, isolate, true, "");

1191 }	1211 }

1192	1212

1193 if (sharp_s_count == 0) return *result;	1213 if (sharp_s_count == 0) return *result;

(...skipping 53 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1247 Handle<FixedArray> date_cache_version =	1267 Handle<FixedArray> date_cache_version =

1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(	1268 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(

1249 EternalHandles::DATE_CACHE_VERSION));	1269 EternalHandles::DATE_CACHE_VERSION));

1250 return date_cache_version->get(0);	1270 return date_cache_version->get(0);

1251 }	1271 }

1252	1272

1253 } // namespace internal	1273 } // namespace internal

1254 } // namespace v8	1274 } // namespace v8

1255	1275

1256 #endif // V8_I18N_SUPPORT	1276 #endif // V8_I18N_SUPPORT

OLD	NEW

« src/js/i18n.js ('K') | « src/js/i18n.js ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »