src/runtime/runtime-i18n.cc - Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2533033003: Fix the uppercasing of U+00E7(ç) and U+00F7(÷) (Closed)

Patch Set: a bit more tweaks + tests Created 4 years ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include <memory>	9 #include <memory>

10	10

	11 #include "src/api-natives.h"

11 #include "src/api.h"	12 #include "src/api.h"

12 #include "src/api-natives.h"

13 #include "src/arguments.h"	13 #include "src/arguments.h"

14 #include "src/factory.h"	14 #include "src/factory.h"

15 #include "src/i18n.h"	15 #include "src/i18n.h"

16 #include "src/isolate-inl.h"	16 #include "src/isolate-inl.h"

17 #include "src/messages.h"	17 #include "src/messages.h"

	18 #include "src/utils.h"

18	19

19 #include "unicode/brkiter.h"	20 #include "unicode/brkiter.h"

20 #include "unicode/calendar.h"	21 #include "unicode/calendar.h"

21 #include "unicode/coll.h"	22 #include "unicode/coll.h"

22 #include "unicode/curramt.h"	23 #include "unicode/curramt.h"

23 #include "unicode/datefmt.h"	24 #include "unicode/datefmt.h"

24 #include "unicode/dcfmtsym.h"	25 #include "unicode/dcfmtsym.h"

25 #include "unicode/decimfmt.h"	26 #include "unicode/decimfmt.h"

26 #include "unicode/dtfmtsym.h"	27 #include "unicode/dtfmtsym.h"

27 #include "unicode/dtptngen.h"	28 #include "unicode/dtptngen.h"

(...skipping 1058 matching lines...) Expand 10 before | Expand all | Expand 10 after
1086	1087

1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {	1088 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

1088 HandleScope scope(isolate);	1089 HandleScope scope(isolate);

1089 DCHECK_EQ(args.length(), 1);	1090 DCHECK_EQ(args.length(), 1);

1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1091 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1091	1092

1092 int length = s->length();	1093 int length = s->length();

1093 s = String::Flatten(s);	1094 s = String::Flatten(s);

1094 // First scan the string for uppercase and non-ASCII characters:	1095 // First scan the string for uppercase and non-ASCII characters:

1095 if (s->HasOnlyOneByteChars()) {	1096 if (s->HasOnlyOneByteChars()) {

1096 int first_index_to_lower = length;

1097 for (int index = 0; index < length; ++index) {

1098 // Blink specializes this path for one-byte strings, so it

1099 // does not need to do a generic get, but can do the equivalent

1100 // of SeqOneByteStringGet.

1101 uint16_t ch = s->Get(index);

1102 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

1103 first_index_to_lower = index;

1104 break;

1105 }

1106 }

1107

1108 // Nothing to do if the string is all ASCII with no uppercase.

1109 if (first_index_to_lower == length) return *s;

1110

1111 // We depend here on the invariant that the length of a Latin1	1097 // We depend here on the invariant that the length of a Latin1

1112 // string is invariant under ToLowerCase, and the result always	1098 // string is invariant under ToLowerCase, and the result always

1113 // fits in the Latin1 range in the root locale. It does not hold	1099 // fits in the Latin1 range in the root locale. It does not hold

1114 // for ToUpperCase even in the root locale.	1100 // for ToUpperCase even in the root locale.

1115 Handle<SeqOneByteString> result;	1101 Handle<SeqOneByteString> result;

1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1102 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1117 isolate, result, isolate->factory()->NewRawOneByteString(length));	1103 isolate, result, isolate->factory()->NewRawOneByteString(length));

1118	1104

1119 DisallowHeapAllocation no_gc;	1105 DisallowHeapAllocation no_gc;

1120 String::FlatContent flat = s->GetFlatContent();	1106 String::FlatContent flat = s->GetFlatContent();

	1107 uint8_t* dest = result->GetChars();

	1108 const uint8_t* src = flat.ToOneByteVector().start();

	1109 if (flat.IsOneByte() && static_cast<size_t>(length) >= sizeof(uintptr_t)) {

	1110 bool has_changed_character = false;

	1111 bool is_ascii = FastAsciiConvert<true>(reinterpret_cast<char*>(dest),

	1112 reinterpret_cast<const char*>(src),

	1113 length, &has_changed_character);

	1114 // If not ASCII, we discard the result and start anew.

	1115 if (is_ascii) return has_changed_character ? result : s;

	1116 }

	1117

	1118 int index_to_first_upper = 0;

	1119 for (int index = 0; index < length; ++index) {

	1120 uint16_t ch = s->Get(index);

	1121 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

	1122 index_to_first_upper = index;

	1123 break;

	1124 }

	1125 }

	1126

	1127 // An ASCII input without any uppercase characters is already handled by

	1128 // FastAsciiConvert as long as the input is a machine-word or longer.

	1129 DCHECK(index_to_first_upper < length ||

	1130 static_cast<size_t>(length) < sizeof(uintptr_t));

	1131 // Nothing to do if the string is all ASCII with no uppercase.

	1132 if (index_to_first_upper == length) return *s;

1121 if (flat.IsOneByte()) {	1133 if (flat.IsOneByte()) {

1122 const uint8_t* src = flat.ToOneByteVector().start();	1134 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

1123 CopyChars(result->GetChars(), src,	1135 for (int index = index_to_first_upper; index < length; ++index) {

1124 static_cast<size_t>(first_index_to_lower));

1125 for (int index = first_index_to_lower; index < length; ++index) {

1126 uint16_t ch = static_cast<uint16_t>(src[index]);	1136 uint16_t ch = static_cast<uint16_t>(src[index]);

1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));	1137 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1128 }	1138 }

1129 } else {	1139 } else {

1130 const uint16_t* src = flat.ToUC16Vector().start();	1140 const uint16_t* src = flat.ToUC16Vector().start();

1131 CopyChars(result->GetChars(), src,	1141 CopyChars(dest, src, static_cast<size_t>(index_to_first_upper));

1132 static_cast<size_t>(first_index_to_lower));	1142 for (int index = index_to_first_upper; index < length; ++index) {

1133 for (int index = first_index_to_lower; index < length; ++index) {

1134 uint16_t ch = src[index];	1143 uint16_t ch = src[index];

1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));	1144 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));

1136 }	1145 }

1137 }	1146 }

1138	1147

1139 return *result;	1148 return *result;

1140 }	1149 }

1141	1150

1142 // Blink had an additional case here for ASCII 2-byte strings, but	1151 // Blink had an additional case here for ASCII 2-byte strings, but

1143 // that is subsumed by the above code (assuming there isn't a false	1152 // that is subsumed by the above code (assuming there isn't a false

1144 // negative for HasOnlyOneByteChars).	1153 // negative for HasOnlyOneByteChars).

1145	1154

1146 // Do a slower implementation for cases that include non-ASCII characters.	1155 // Do a slower implementation for cases that include non-ASCII characters.

1147 return LocaleConvertCase(s, isolate, false, "");	1156 return LocaleConvertCase(s, isolate, false, "");

1148 }	1157 }

1149	1158

1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {	1159 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

1151 HandleScope scope(isolate);	1160 HandleScope scope(isolate);

1152 DCHECK_EQ(args.length(), 1);	1161 DCHECK_EQ(args.length(), 1);

1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	1162 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1154	1163

1155 // This function could be optimized for no-op cases the way lowercase

1156 // counterpart is, but in empirical testing, few actual calls to upper()

1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.

1158

1159 int32_t length = s->length();	1164 int32_t length = s->length();

1160 s = String::Flatten(s);	1165 s = String::Flatten(s);

1161	1166

1162 if (s->HasOnlyOneByteChars()) {	1167 if (s->HasOnlyOneByteChars()) {

1163 Handle<SeqOneByteString> result;	1168 Handle<SeqOneByteString> result;

1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(	1169 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1165 isolate, result, isolate->factory()->NewRawOneByteString(length));	1170 isolate, result, isolate->factory()->NewRawOneByteString(length));

1166	1171

1167 int sharp_s_count;	1172 int sharp_s_count;

1168 bool is_result_single_byte;	1173 bool is_result_single_byte;

1169 {	1174 {

1170 DisallowHeapAllocation no_gc;	1175 DisallowHeapAllocation no_gc;

1171 String::FlatContent flat = s->GetFlatContent();	1176 String::FlatContent flat = s->GetFlatContent();

1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII

1173 // could be removed because ToUpperOneByte is pretty fast now (it

1174 // does not call ICU API any more.).

1175 if (flat.IsOneByte()) {	1177 if (flat.IsOneByte()) {

1176 Vector<const uint8_t> src = flat.ToOneByteVector();	1178 Vector<const uint8_t> src = flat.ToOneByteVector();

1177 if (ToUpperFastASCII(src, result)) return *result;	1179 bool has_changed_character = false;

	1180 bool is_ascii =

	1181 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

	1182 reinterpret_cast<const char*>(src.start()),

	1183 length, &has_changed_character);

	1184 // If not ASCII, we discard the result and use the table for Latin1.

	1185 if (is_ascii) return has_changed_character ? result : s;

1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1186 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

1179 } else {	1187 } else {

1180 DCHECK(flat.IsTwoByte());	1188 DCHECK(flat.IsTwoByte());

1181 Vector<const uint16_t> src = flat.ToUC16Vector();	1189 Vector<const uint16_t> src = flat.ToUC16Vector();

1182 if (ToUpperFastASCII(src, result)) return *result;	1190 if (ToUpperFastASCII(src, result)) return *result;

1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);	1191 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);

1184 }	1192 }

1185 }	1193 }

1186	1194

1187 // Go to the full Unicode path if there are characters whose uppercase	1195 // Go to the full Unicode path if there are characters whose uppercase

(...skipping 59 matching lines...) Expand 10 before | Expand all | Expand 10 after
1247 Handle<FixedArray> date_cache_version =	1255 Handle<FixedArray> date_cache_version =

1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(	1256 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(

1249 EternalHandles::DATE_CACHE_VERSION));	1257 EternalHandles::DATE_CACHE_VERSION));

1250 return date_cache_version->get(0);	1258 return date_cache_version->get(0);

1251 }	1259 }

1252	1260

1253 } // namespace internal	1261 } // namespace internal

1254 } // namespace v8	1262 } // namespace v8

1255	1263

1256 #endif // V8_I18N_SUPPORT	1264 #endif // V8_I18N_SUPPORT

OLD	NEW

« no previous file with comments | « BUILD.gn ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »