OLD | NEW |
---|---|
1 // Copyright 2014 the V8 project authors. All rights reserved. | 1 // Copyright 2014 the V8 project authors. All rights reserved. |
2 // Use of this source code is governed by a BSD-style license that can be | 2 // Use of this source code is governed by a BSD-style license that can be |
3 // found in the LICENSE file. | 3 // found in the LICENSE file. |
4 | 4 |
5 | 5 |
6 #ifdef V8_I18N_SUPPORT | 6 #ifdef V8_I18N_SUPPORT |
7 #include "src/runtime/runtime-utils.h" | 7 #include "src/runtime/runtime-utils.h" |
8 | 8 |
9 #include <memory> | 9 #include <memory> |
10 | 10 |
11 #include "src/api-natives.h" | |
11 #include "src/api.h" | 12 #include "src/api.h" |
12 #include "src/api-natives.h" | |
13 #include "src/arguments.h" | 13 #include "src/arguments.h" |
14 #include "src/factory.h" | 14 #include "src/factory.h" |
15 #include "src/i18n.h" | 15 #include "src/i18n.h" |
16 #include "src/isolate-inl.h" | 16 #include "src/isolate-inl.h" |
17 #include "src/messages.h" | 17 #include "src/messages.h" |
18 #include "src/string-case.h" | |
19 #include "src/utils.h" | |
18 | 20 |
19 #include "unicode/brkiter.h" | 21 #include "unicode/brkiter.h" |
20 #include "unicode/calendar.h" | 22 #include "unicode/calendar.h" |
21 #include "unicode/coll.h" | 23 #include "unicode/coll.h" |
22 #include "unicode/curramt.h" | 24 #include "unicode/curramt.h" |
23 #include "unicode/datefmt.h" | 25 #include "unicode/datefmt.h" |
24 #include "unicode/dcfmtsym.h" | 26 #include "unicode/dcfmtsym.h" |
25 #include "unicode/decimfmt.h" | 27 #include "unicode/decimfmt.h" |
26 #include "unicode/dtfmtsym.h" | 28 #include "unicode/dtfmtsym.h" |
27 #include "unicode/dtptngen.h" | 29 #include "unicode/dtptngen.h" |
(...skipping 1006 matching lines...) | |
1034 uint16_t ch = static_cast<uint16_t>(*it); | 1036 uint16_t ch = static_cast<uint16_t>(*it); |
1035 ored |= ch; | 1037 ored |= ch; |
1036 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch)); | 1038 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch)); |
1037 } | 1039 } |
1038 return !(ored & ~0x7F); | 1040 return !(ored & ~0x7F); |
1039 } | 1041 } |
1040 | 1042 |
1041 const uint16_t sharp_s = 0xDF; | 1043 const uint16_t sharp_s = 0xDF; |
1042 | 1044 |
1043 template <typename Char> | 1045 template <typename Char> |
1044 bool ToUpperOneByte(const Vector<const Char>& src, | 1046 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest, |
1045 Handle<SeqOneByteString> result, int* sharp_s_count) { | 1047 int* sharp_s_count) { |
Dan Ehrenberg
2016/12/15 19:24:25
Using a pointer rather than a handle seems valid s
jungshik at Google
2016/12/16 00:37:56
This change is triggered by a change below (call-s
| |
1046 // Still pretty-fast path for the input with non-ASCII Latin-1 characters. | 1048 // Still pretty-fast path for the input with non-ASCII Latin-1 characters. |
1047 | 1049 |
1048 // There are two special cases. | 1050 // There are two special cases. |
1049 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF. | 1051 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF. |
1050 // 2. Lower case sharp-S converts to "SS" (two characters) | 1052 // 2. Lower case sharp-S converts to "SS" (two characters) |
1051 *sharp_s_count = 0; | 1053 *sharp_s_count = 0; |
1052 int32_t index = 0; | 1054 int32_t index = 0; |
1053 for (auto it = src.begin(); it != src.end(); ++it) { | 1055 for (auto it = src.begin(); it != src.end(); ++it) { |
1054 uint16_t ch = static_cast<uint16_t>(*it); | 1056 uint16_t ch = static_cast<uint16_t>(*it); |
1055 if (V8_UNLIKELY(ch == sharp_s)) { | 1057 if (V8_UNLIKELY(ch == sharp_s)) { |
1056 ++(*sharp_s_count); | 1058 ++(*sharp_s_count); |
1057 continue; | 1059 continue; |
1058 } | 1060 } |
1059 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) { | 1061 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) { |
1060 // Since this upper-cased character does not fit in an 8-bit string, we | 1062 // Since this upper-cased character does not fit in an 8-bit string, we |
1061 // need to take the 16-bit path. | 1063 // need to take the 16-bit path. |
1062 return false; | 1064 return false; |
1063 } | 1065 } |
1064 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch)); | 1066 *dest++ = ToLatin1Upper(ch); |
1065 } | 1067 } |
1066 | 1068 |
1067 return true; | 1069 return true; |
1068 } | 1070 } |
1069 | 1071 |
1070 template <typename Char> | 1072 template <typename Char> |
1071 void ToUpperWithSharpS(const Vector<const Char>& src, | 1073 void ToUpperWithSharpS(const Vector<const Char>& src, |
1072 Handle<SeqOneByteString> result) { | 1074 Handle<SeqOneByteString> result) { |
1073 int32_t dest_index = 0; | 1075 int32_t dest_index = 0; |
1074 for (auto it = src.begin(); it != src.end(); ++it) { | 1076 for (auto it = src.begin(); it != src.end(); ++it) { |
1075 uint16_t ch = static_cast<uint16_t>(*it); | 1077 uint16_t ch = static_cast<uint16_t>(*it); |
1076 if (ch == sharp_s) { | 1078 if (ch == sharp_s) { |
1077 result->SeqOneByteStringSet(dest_index++, 'S'); | 1079 result->SeqOneByteStringSet(dest_index++, 'S'); |
1078 result->SeqOneByteStringSet(dest_index++, 'S'); | 1080 result->SeqOneByteStringSet(dest_index++, 'S'); |
1079 } else { | 1081 } else { |
1080 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch)); | 1082 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch)); |
1081 } | 1083 } |
1082 } | 1084 } |
1083 } | 1085 } |
1084 | 1086 |
1087 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) { | |
1088 for (int index = 0; index < length; ++index) { | |
1089 uint16_t ch = s->Get(index); | |
1090 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { | |
1091 return index; | |
1092 } | |
1093 } | |
1094 return length; | |
1095 } | |
1096 | |
1085 } // namespace | 1097 } // namespace |
1086 | 1098 |
1087 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { | 1099 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { |
1088 HandleScope scope(isolate); | 1100 HandleScope scope(isolate); |
1089 DCHECK_EQ(args.length(), 1); | 1101 DCHECK_EQ(args.length(), 1); |
1090 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); | 1102 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); |
1091 | 1103 |
1092 int length = s->length(); | 1104 int length = s->length(); |
1093 s = String::Flatten(s); | 1105 s = String::Flatten(s); |
1094 // First scan the string for uppercase and non-ASCII characters: | |
1095 if (s->HasOnlyOneByteChars()) { | |
1096 int first_index_to_lower = length; | |
1097 for (int index = 0; index < length; ++index) { | |
1098 // Blink specializes this path for one-byte strings, so it | |
1099 // does not need to do a generic get, but can do the equivalent | |
1100 // of SeqOneByteStringGet. | |
1101 uint16_t ch = s->Get(index); | |
1102 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) { | |
1103 first_index_to_lower = index; | |
1104 break; | |
1105 } | |
1106 } | |
1107 | 1106 |
1108 // Nothing to do if the string is all ASCII with no uppercase. | 1107 if (!s->HasOnlyOneByteChars()) { |
1109 if (first_index_to_lower == length) return *s; | 1108 // Use a slower implementation for strings with characters beyond U+00FF. |
1110 | 1109 return LocaleConvertCase(s, isolate, false, ""); |
1111 // We depend here on the invariant that the length of a Latin1 | |
1112 // string is invariant under ToLowerCase, and the result always | |
1113 // fits in the Latin1 range in the *root locale*. It does not hold | |
1114 // for ToUpperCase even in the root locale. | |
1115 Handle<SeqOneByteString> result; | |
1116 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( | |
1117 isolate, result, isolate->factory()->NewRawOneByteString(length)); | |
1118 | |
1119 DisallowHeapAllocation no_gc; | |
1120 String::FlatContent flat = s->GetFlatContent(); | |
1121 if (flat.IsOneByte()) { | |
1122 const uint8_t* src = flat.ToOneByteVector().start(); | |
1123 CopyChars(result->GetChars(), src, | |
1124 static_cast<size_t>(first_index_to_lower)); | |
1125 for (int index = first_index_to_lower; index < length; ++index) { | |
1126 uint16_t ch = static_cast<uint16_t>(src[index]); | |
1127 result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); | |
1128 } | |
1129 } else { | |
1130 const uint16_t* src = flat.ToUC16Vector().start(); | |
1131 CopyChars(result->GetChars(), src, | |
1132 static_cast<size_t>(first_index_to_lower)); | |
1133 for (int index = first_index_to_lower; index < length; ++index) { | |
1134 uint16_t ch = src[index]; | |
1135 result->SeqOneByteStringSet(index, ToLatin1Lower(ch)); | |
1136 } | |
1137 } | |
1138 | |
1139 return *result; | |
1140 } | 1110 } |
1141 | 1111 |
1142 // Blink had an additional case here for ASCII 2-byte strings, but | 1112 // We depend here on the invariant that the length of a Latin1 |
1143 // that is subsumed by the above code (assuming there isn't a false | 1113 // string is invariant under ToLowerCase, and the result always |
1144 // negative for HasOnlyOneByteChars). | 1114 // fits in the Latin1 range in the *root locale*. It does not hold |
1115 // for ToUpperCase even in the root locale. | |
1145 | 1116 |
1146 // Do a slower implementation for cases that include non-ASCII characters. | 1117 // Scan the string for uppercase and non-ASCII characters for strings |
1147 return LocaleConvertCase(s, isolate, false, ""); | 1118 // shorter than a machine-word without any memory allocation overhead. |
1119 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert() | |
1120 // into two parts, one for scanning the prefix with no change and the other | |
1121 // for handling ASCII-only characters. | |
1122 int index_to_first_unprocessed = length; | |
1123 const bool is_short = length < static_cast<int>(sizeof(uintptr_t)); | |
1124 if (is_short) { | |
1125 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length); | |
1126 // Nothing to do if the string is all ASCII with no uppercase. | |
1127 if (index_to_first_unprocessed == length) return *s; | |
1128 } | |
1129 | |
1130 Handle<SeqOneByteString> result = | |
1131 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); | |
1132 | |
1133 DisallowHeapAllocation no_gc; | |
1134 String::FlatContent flat = s->GetFlatContent(); | |
1135 uint8_t* dest = result->GetChars(); | |
1136 if (flat.IsOneByte()) { | |
1137 const uint8_t* src = flat.ToOneByteVector().start(); | |
1138 bool has_changed_character = false; | |
1139 index_to_first_unprocessed = FastAsciiConvert<true>( | |
1140 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src), | |
1141 length, &has_changed_character); | |
1142 // If not ASCII, we keep the result up to index_to_first_unprocessed and | |
1143 // process the rest. | |
1144 if (index_to_first_unprocessed == length) | |
1145 return has_changed_character ? *result : *s; | |
1146 | |
1147 for (int index = index_to_first_unprocessed; index < length; ++index) { | |
1148 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index])); | |
1149 } | |
1150 } else { | |
1151 if (index_to_first_unprocessed == length) { | |
1152 DCHECK(!is_short); | |
1153 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length); | |
1154 } | |
1155 // Nothing to do if the string is all ASCII with no uppercase. | |
1156 if (index_to_first_unprocessed == length) return *s; | |
1157 const uint16_t* src = flat.ToUC16Vector().start(); | |
1158 CopyChars(dest, src, index_to_first_unprocessed); | |
1159 for (int index = index_to_first_unprocessed; index < length; ++index) { | |
1160 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index])); | |
1161 } | |
1162 } | |
1163 | |
1164 return *result; | |
1148 } | 1165 } |
1149 | 1166 |
1150 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { | 1167 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { |
1151 HandleScope scope(isolate); | 1168 HandleScope scope(isolate); |
1152 DCHECK_EQ(args.length(), 1); | 1169 DCHECK_EQ(args.length(), 1); |
1153 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); | 1170 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); |
1154 | 1171 |
1155 // This function could be optimized for no-op cases the way lowercase | |
1156 // counterpart is, but in empirical testing, few actual calls to upper() | |
1157 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning. | |
1158 | |
1159 int32_t length = s->length(); | 1172 int32_t length = s->length(); |
1160 s = String::Flatten(s); | 1173 s = String::Flatten(s); |
1161 | 1174 |
1162 if (s->HasOnlyOneByteChars()) { | 1175 if (s->HasOnlyOneByteChars()) { |
1163 Handle<SeqOneByteString> result; | 1176 Handle<SeqOneByteString> result = |
1164 ASSIGN_RETURN_FAILURE_ON_EXCEPTION( | 1177 isolate->factory()->NewRawOneByteString(length).ToHandleChecked(); |
1165 isolate, result, isolate->factory()->NewRawOneByteString(length)); | |
1166 | 1178 |
1167 int sharp_s_count; | 1179 int sharp_s_count; |
1168 bool is_result_single_byte; | 1180 bool is_result_single_byte; |
1169 { | 1181 { |
1170 DisallowHeapAllocation no_gc; | 1182 DisallowHeapAllocation no_gc; |
1171 String::FlatContent flat = s->GetFlatContent(); | 1183 String::FlatContent flat = s->GetFlatContent(); |
1172 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII | 1184 uint8_t* dest = result->GetChars(); |
1173 // could be removed because ToUpperOneByte is pretty fast now (it | |
1174 // does not call ICU API any more.). | |
1175 if (flat.IsOneByte()) { | 1185 if (flat.IsOneByte()) { |
1176 Vector<const uint8_t> src = flat.ToOneByteVector(); | 1186 Vector<const uint8_t> src = flat.ToOneByteVector(); |
1177 if (ToUpperFastASCII(src, result)) return *result; | 1187 bool has_changed_character = false; |
1178 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); | 1188 int index_to_first_unprocessed = |
1189 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()), | |
1190 reinterpret_cast<const char*>(src.start()), | |
1191 length, &has_changed_character); | |
1192 if (index_to_first_unprocessed == length) | |
1193 return has_changed_character ? *result : *s; | |
1194 // If not ASCII, we keep the result up to index_to_first_unprocessed and | |
1195 // process the rest. | |
1196 is_result_single_byte = | |
1197 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length), | |
1198 dest + index_to_first_unprocessed, &sharp_s_count); | |
1179 } else { | 1199 } else { |
1180 DCHECK(flat.IsTwoByte()); | 1200 DCHECK(flat.IsTwoByte()); |
1181 Vector<const uint16_t> src = flat.ToUC16Vector(); | 1201 Vector<const uint16_t> src = flat.ToUC16Vector(); |
1182 if (ToUpperFastASCII(src, result)) return *result; | 1202 if (ToUpperFastASCII(src, result)) return *result; |
1183 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count); | 1203 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count); |
1184 } | 1204 } |
1185 } | 1205 } |
1186 | 1206 |
1187 // Go to the full Unicode path if there are characters whose uppercase | 1207 // Go to the full Unicode path if there are characters whose uppercase |
1188 // is beyond the Latin-1 range (cannot be represented in OneByteString). | 1208 // is beyond the Latin-1 range (cannot be represented in OneByteString). |
1189 if (V8_UNLIKELY(!is_result_single_byte)) { | 1209 if (V8_UNLIKELY(!is_result_single_byte)) { |
1190 return LocaleConvertCase(s, isolate, true, ""); | 1210 return LocaleConvertCase(s, isolate, true, ""); |
1191 } | 1211 } |
1192 | 1212 |
1193 if (sharp_s_count == 0) return *result; | 1213 if (sharp_s_count == 0) return *result; |
(...skipping 53 matching lines...) | |
1247 Handle<FixedArray> date_cache_version = | 1267 Handle<FixedArray> date_cache_version = |
1248 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton( | 1268 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton( |
1249 EternalHandles::DATE_CACHE_VERSION)); | 1269 EternalHandles::DATE_CACHE_VERSION)); |
1250 return date_cache_version->get(0); | 1270 return date_cache_version->get(0); |
1251 } | 1271 } |
1252 | 1272 |
1253 } // namespace internal | 1273 } // namespace internal |
1254 } // namespace v8 | 1274 } // namespace v8 |
1255 | 1275 |
1256 #endif // V8_I18N_SUPPORT | 1276 #endif // V8_I18N_SUPPORT |
OLD | NEW |