Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(625)

Side by Side Diff: src/i18n.cc

Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins (Closed)
Patch Set: rebase Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/i18n.h ('k') | src/runtime/runtime-i18n.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2013 the V8 project authors. All rights reserved. 1 // Copyright 2013 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 // limitations under the License. 4 // limitations under the License.
5 5
6 #include "src/i18n.h" 6 #include "src/i18n.h"
7 7
8 #include <memory> 8 #include <memory>
9 9
10 #include "src/api.h" 10 #include "src/api.h"
11 #include "src/factory.h" 11 #include "src/factory.h"
12 #include "src/isolate.h" 12 #include "src/isolate.h"
13 #include "src/objects-inl.h" 13 #include "src/objects-inl.h"
14 #include "src/string-case.h"
14 #include "unicode/brkiter.h" 15 #include "unicode/brkiter.h"
15 #include "unicode/calendar.h" 16 #include "unicode/calendar.h"
16 #include "unicode/coll.h" 17 #include "unicode/coll.h"
17 #include "unicode/curramt.h" 18 #include "unicode/curramt.h"
18 #include "unicode/dcfmtsym.h" 19 #include "unicode/dcfmtsym.h"
19 #include "unicode/decimfmt.h" 20 #include "unicode/decimfmt.h"
20 #include "unicode/dtfmtsym.h" 21 #include "unicode/dtfmtsym.h"
21 #include "unicode/dtptngen.h" 22 #include "unicode/dtptngen.h"
22 #include "unicode/gregocal.h" 23 #include "unicode/gregocal.h"
23 #include "unicode/locid.h" 24 #include "unicode/locid.h"
(...skipping 660 matching lines...) Expand 10 before | Expand all | Expand 10 after
684 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"), 685 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),
685 factory->NewStringFromAsciiChecked(result), 686 factory->NewStringFromAsciiChecked(result),
686 SLOPPY).Assert(); 687 SLOPPY).Assert();
687 } else { 688 } else {
688 // This would never happen, since we got the locale from ICU. 689 // This would never happen, since we got the locale from ICU.
689 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"), 690 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),
690 factory->NewStringFromStaticChars("und"), 691 factory->NewStringFromStaticChars("und"),
691 SLOPPY).Assert(); 692 SLOPPY).Assert();
692 } 693 }
693 } 694 }
694
695 } // namespace 695 } // namespace
696 696
697
698 // static 697 // static
699 icu::SimpleDateFormat* DateFormat::InitializeDateTimeFormat( 698 icu::SimpleDateFormat* DateFormat::InitializeDateTimeFormat(
700 Isolate* isolate, 699 Isolate* isolate,
701 Handle<String> locale, 700 Handle<String> locale,
702 Handle<JSObject> options, 701 Handle<JSObject> options,
703 Handle<JSObject> resolved) { 702 Handle<JSObject> resolved) {
704 // Convert BCP47 into ICU locale format. 703 // Convert BCP47 into ICU locale format.
705 UErrorCode status = U_ZERO_ERROR; 704 UErrorCode status = U_ZERO_ERROR;
706 icu::Locale icu_locale; 705 icu::Locale icu_locale;
707 char icu_result[ULOC_FULLNAME_CAPACITY]; 706 char icu_result[ULOC_FULLNAME_CAPACITY];
(...skipping 194 matching lines...) Expand 10 before | Expand all | Expand 10 after
902 return reinterpret_cast<icu::BreakIterator*>(obj->GetEmbedderField(0)); 901 return reinterpret_cast<icu::BreakIterator*>(obj->GetEmbedderField(0));
903 } 902 }
904 903
905 void V8BreakIterator::DeleteBreakIterator( 904 void V8BreakIterator::DeleteBreakIterator(
906 const v8::WeakCallbackInfo<void>& data) { 905 const v8::WeakCallbackInfo<void>& data) {
907 delete reinterpret_cast<icu::BreakIterator*>(data.GetInternalField(0)); 906 delete reinterpret_cast<icu::BreakIterator*>(data.GetInternalField(0));
908 delete reinterpret_cast<icu::UnicodeString*>(data.GetInternalField(1)); 907 delete reinterpret_cast<icu::UnicodeString*>(data.GetInternalField(1));
909 GlobalHandles::Destroy(reinterpret_cast<Object**>(data.GetParameter())); 908 GlobalHandles::Destroy(reinterpret_cast<Object**>(data.GetParameter()));
910 } 909 }
911 910
911 namespace {
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) { return !(ch < 'A' || ch > 'Z'); }
913
// Lookup table mapping every Latin-1 code unit (0x00..0xFF) to its lowercase
// form. Entries that have no distinct lowercase form map to themselves; note
// that 0xD7 and 0xDF sit inside the 0xC0..0xDF range but are left unchanged.
const uint8_t kToLower[256] = {
    // 0x00..0x3F: unchanged.
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
    // 0x40..0x5F: 'A'..'Z' (0x41..0x5A) become 'a'..'z' (0x61..0x7A).
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    // 0x60..0x7F: unchanged.
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
    // 0x80..0xBF: unchanged.
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    // 0xC0..0xDF: shifted up by 0x20, except 0xD7 and 0xDF (kept as-is).
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    // 0xE0..0xFF: unchanged.
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
938
939 inline uint16_t ToLatin1Lower(uint16_t ch) {
940 return static_cast<uint16_t>(kToLower[ch]);
941 }
942
// Upper-cases |ch| when it is an ASCII lowercase letter ('a'..'z') by
// clearing bit 5 (0x20). Every other value is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  return is_ascii_lower ? (ch & ~0x20) : ch;
}
946
947 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF.
948 inline uint16_t ToLatin1Upper(uint16_t ch) {
949 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
950 return ch &
951 ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
952 << 5);
953 }
954
955 template <typename Char>
956 bool ToUpperFastASCII(const Vector<const Char>& src,
957 Handle<SeqOneByteString> result) {
958 // Do a faster loop for the case where all the characters are ASCII.
959 uint16_t ored = 0;
960 int32_t index = 0;
961 for (auto it = src.begin(); it != src.end(); ++it) {
962 uint16_t ch = static_cast<uint16_t>(*it);
963 ored |= ch;
964 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
965 }
966 return !(ored & ~0x7F);
967 }
968
// Code unit of U+00DF (lowercase sharp-s), the one Latin-1 character that the
// upper-casing helpers in this file expand to the two characters "SS".
const uint16_t sharp_s = 0x00DF;
970
971 template <typename Char>
972 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
973 int* sharp_s_count) {
974 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
975
976 // There are two special cases.
977 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
978 // 2. Lower case sharp-S converts to "SS" (two characters)
979 *sharp_s_count = 0;
980 for (auto it = src.begin(); it != src.end(); ++it) {
981 uint16_t ch = static_cast<uint16_t>(*it);
982 if (V8_UNLIKELY(ch == sharp_s)) {
983 ++(*sharp_s_count);
984 continue;
985 }
986 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
987 // Since this upper-cased character does not fit in an 8-bit string, we
988 // need to take the 16-bit path.
989 return false;
990 }
991 *dest++ = ToLatin1Upper(ch);
992 }
993
994 return true;
995 }
996
997 template <typename Char>
998 void ToUpperWithSharpS(const Vector<const Char>& src,
999 Handle<SeqOneByteString> result) {
1000 int32_t dest_index = 0;
1001 for (auto it = src.begin(); it != src.end(); ++it) {
1002 uint16_t ch = static_cast<uint16_t>(*it);
1003 if (ch == sharp_s) {
1004 result->SeqOneByteStringSet(dest_index++, 'S');
1005 result->SeqOneByteStringSet(dest_index++, 'S');
1006 } else {
1007 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
1008 }
1009 }
1010 }
1011
1012 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
1013 for (int index = 0; index < length; ++index) {
1014 uint16_t ch = s->Get(index);
1015 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
1016 return index;
1017 }
1018 }
1019 return length;
1020 }
1021
1022 } // namespace
1023
1024 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
1025 std::unique_ptr<uc16[]>* dest,
1026 int32_t length) {
1027 DCHECK(flat.IsFlat());
1028 if (flat.IsOneByte()) {
1029 if (!*dest) {
1030 dest->reset(NewArray<uc16>(length));
1031 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
1032 }
1033 return reinterpret_cast<const UChar*>(dest->get());
1034 } else {
1035 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
1036 }
1037 }
1038
1039 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
1040 bool is_to_upper, const char* lang) {
1041 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
1042 int32_t src_length = s->length();
1043 int32_t dest_length = src_length;
1044 UErrorCode status;
1045 Handle<SeqTwoByteString> result;
1046 std::unique_ptr<uc16[]> sap;
1047
1048 if (dest_length == 0) return isolate->heap()->empty_string();
1049
1050 // This is not a real loop. It'll be executed only once (no overflow) or
1051 // twice (overflow).
1052 for (int i = 0; i < 2; ++i) {
1053 // Case conversion can increase the string length (e.g. sharp-S => SS) so
1054 // that we have to handle RangeError exceptions here.
1055 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1056 isolate, result, isolate->factory()->NewRawTwoByteString(dest_length));
1057 DisallowHeapAllocation no_gc;
1058 DCHECK(s->IsFlat());
1059 String::FlatContent flat = s->GetFlatContent();
1060 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
1061 status = U_ZERO_ERROR;
1062 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
1063 dest_length, src, src_length, lang, &status);
1064 if (status != U_BUFFER_OVERFLOW_ERROR) break;
1065 }
1066
1067 // In most cases, the output will fill the destination buffer completely
1068 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
1069 // Only in rare cases, it'll be shorter than the destination buffer and
1070 // |result| has to be truncated.
1071 DCHECK(U_SUCCESS(status));
1072 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
1073 DCHECK(dest_length == result->length());
1074 return *result;
1075 }
1076 if (U_SUCCESS(status)) {
1077 DCHECK(dest_length < result->length());
1078 return *Handle<SeqTwoByteString>::cast(
1079 SeqString::Truncate(result, dest_length));
1080 }
1081 return *s;
1082 }
1083
1084 MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {
1085 if (!s->HasOnlyOneByteChars()) {
1086 // Use a slower implementation for strings with characters beyond U+00FF.
1087 return LocaleConvertCase(s, isolate, false, "");
1088 }
1089
1090 int length = s->length();
1091
1092 // We depend here on the invariant that the length of a Latin1
1093 // string is invariant under ToLowerCase, and the result always
1094 // fits in the Latin1 range in the *root locale*. It does not hold
1095 // for ToUpperCase even in the root locale.
1096
1097 // Scan the string for uppercase and non-ASCII characters for strings
1098 // shorter than a machine-word without any memory allocation overhead.
1099 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
1100 // to two parts, one for scanning the prefix with no change and the other for
1101 // handling ASCII-only characters.
1102 int index_to_first_unprocessed = length;
1103 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
1104 if (is_short) {
1105 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1106 // Nothing to do if the string is all ASCII with no uppercase.
1107 if (index_to_first_unprocessed == length) return *s;
1108 }
1109
1110 Handle<SeqOneByteString> result =
1111 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1112
1113 DisallowHeapAllocation no_gc;
1114 DCHECK(s->IsFlat());
1115 String::FlatContent flat = s->GetFlatContent();
1116 uint8_t* dest = result->GetChars();
1117 if (flat.IsOneByte()) {
1118 const uint8_t* src = flat.ToOneByteVector().start();
1119 bool has_changed_character = false;
1120 index_to_first_unprocessed = FastAsciiConvert<true>(
1121 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
1122 length, &has_changed_character);
1123 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1124 // process the rest.
1125 if (index_to_first_unprocessed == length)
1126 return has_changed_character ? *result : *s;
1127
1128 for (int index = index_to_first_unprocessed; index < length; ++index) {
1129 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1130 }
1131 } else {
1132 if (index_to_first_unprocessed == length) {
1133 DCHECK(!is_short);
1134 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1135 }
1136 // Nothing to do if the string is all ASCII with no uppercase.
1137 if (index_to_first_unprocessed == length) return *s;
1138 const uint16_t* src = flat.ToUC16Vector().start();
1139 CopyChars(dest, src, index_to_first_unprocessed);
1140 for (int index = index_to_first_unprocessed; index < length; ++index) {
1141 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1142 }
1143 }
1144
1145 return *result;
1146 }
1147
1148 MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate) {
1149 int32_t length = s->length();
1150 if (s->HasOnlyOneByteChars() && length > 0) {
1151 Handle<SeqOneByteString> result =
1152 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1153
1154 DCHECK(s->IsFlat());
1155 int sharp_s_count;
1156 bool is_result_single_byte;
1157 {
1158 DisallowHeapAllocation no_gc;
1159 String::FlatContent flat = s->GetFlatContent();
1160 uint8_t* dest = result->GetChars();
1161 if (flat.IsOneByte()) {
1162 Vector<const uint8_t> src = flat.ToOneByteVector();
1163 bool has_changed_character = false;
1164 int index_to_first_unprocessed =
1165 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
1166 reinterpret_cast<const char*>(src.start()),
1167 length, &has_changed_character);
1168 if (index_to_first_unprocessed == length)
1169 return has_changed_character ? *result : *s;
1170 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1171 // process the rest.
1172 is_result_single_byte =
1173 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
1174 dest + index_to_first_unprocessed, &sharp_s_count);
1175 } else {
1176 DCHECK(flat.IsTwoByte());
1177 Vector<const uint16_t> src = flat.ToUC16Vector();
1178 if (ToUpperFastASCII(src, result)) return *result;
1179 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
1180 }
1181 }
1182
1183 // Go to the full Unicode path if there are characters whose uppercase
1184 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1185 if (V8_UNLIKELY(!is_result_single_byte)) {
1186 return LocaleConvertCase(s, isolate, true, "");
1187 }
1188
1189 if (sharp_s_count == 0) return *result;
1190
1191 // We have sharp_s_count sharp-s characters, but the result is still
1192 // in the Latin-1 range.
1193 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1194 isolate, result,
1195 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1196 DisallowHeapAllocation no_gc;
1197 String::FlatContent flat = s->GetFlatContent();
1198 if (flat.IsOneByte()) {
1199 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1200 } else {
1201 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1202 }
1203
1204 return *result;
1205 }
1206
1207 return LocaleConvertCase(s, isolate, true, "");
1208 }
1209
1210 MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,
1211 Isolate* isolate) {
1212 return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate);
1213 }
1214
912 } // namespace internal 1215 } // namespace internal
913 } // namespace v8 1216 } // namespace v8
OLDNEW
« no previous file with comments | « src/i18n.h ('k') | src/runtime/runtime-i18n.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698