src/i18n.cc - Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins

Side by Side Diff: src/i18n.cc

Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins (Closed)

Patch Set: rebase Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2013 the V8 project authors. All rights reserved.	1 // Copyright 2013 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4 // limitations under the License.	4 // limitations under the License.

5	5

6 #include "src/i18n.h"	6 #include "src/i18n.h"

7	7

8 #include <memory>	8 #include <memory>

9	9

10 #include "src/api.h"	10 #include "src/api.h"

11 #include "src/factory.h"	11 #include "src/factory.h"

12 #include "src/isolate.h"	12 #include "src/isolate.h"

13 #include "src/objects-inl.h"	13 #include "src/objects-inl.h"

	14 #include "src/string-case.h"

14 #include "unicode/brkiter.h"	15 #include "unicode/brkiter.h"

15 #include "unicode/calendar.h"	16 #include "unicode/calendar.h"

16 #include "unicode/coll.h"	17 #include "unicode/coll.h"

17 #include "unicode/curramt.h"	18 #include "unicode/curramt.h"

18 #include "unicode/dcfmtsym.h"	19 #include "unicode/dcfmtsym.h"

19 #include "unicode/decimfmt.h"	20 #include "unicode/decimfmt.h"

20 #include "unicode/dtfmtsym.h"	21 #include "unicode/dtfmtsym.h"

21 #include "unicode/dtptngen.h"	22 #include "unicode/dtptngen.h"

22 #include "unicode/gregocal.h"	23 #include "unicode/gregocal.h"

23 #include "unicode/locid.h"	24 #include "unicode/locid.h"

(...skipping 660 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
684 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),	685 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),

685 factory->NewStringFromAsciiChecked(result),	686 factory->NewStringFromAsciiChecked(result),

686 SLOPPY).Assert();	687 SLOPPY).Assert();

687 } else {	688 } else {

688 // This would never happen, since we got the locale from ICU.	689 // This would never happen, since we got the locale from ICU.

689 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),	690 JSObject::SetProperty(resolved, factory->NewStringFromStaticChars("locale"),

690 factory->NewStringFromStaticChars("und"),	691 factory->NewStringFromStaticChars("und"),

691 SLOPPY).Assert();	692 SLOPPY).Assert();

692 }	693 }

693 }	694 }

694

695 } // namespace	695 } // namespace

696	696

697

698 // static	697 // static

699 icu::SimpleDateFormat* DateFormat::InitializeDateTimeFormat(	698 icu::SimpleDateFormat* DateFormat::InitializeDateTimeFormat(

700 Isolate* isolate,	699 Isolate* isolate,

701 Handle<String> locale,	700 Handle<String> locale,

702 Handle<JSObject> options,	701 Handle<JSObject> options,

703 Handle<JSObject> resolved) {	702 Handle<JSObject> resolved) {

704 // Convert BCP47 into ICU locale format.	703 // Convert BCP47 into ICU locale format.

705 UErrorCode status = U_ZERO_ERROR;	704 UErrorCode status = U_ZERO_ERROR;

706 icu::Locale icu_locale;	705 icu::Locale icu_locale;

707 char icu_result[ULOC_FULLNAME_CAPACITY];	706 char icu_result[ULOC_FULLNAME_CAPACITY];

(...skipping 194 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
902 return reinterpret_cast<icu::BreakIterator*>(obj->GetEmbedderField(0));	901 return reinterpret_cast<icu::BreakIterator*>(obj->GetEmbedderField(0));

903 }	902 }

904	903

905 void V8BreakIterator::DeleteBreakIterator(	904 void V8BreakIterator::DeleteBreakIterator(

906 const v8::WeakCallbackInfo<void>& data) {	905 const v8::WeakCallbackInfo<void>& data) {

907 delete reinterpret_cast<icu::BreakIterator*>(data.GetInternalField(0));	906 delete reinterpret_cast<icu::BreakIterator*>(data.GetInternalField(0));

908 delete reinterpret_cast<icu::UnicodeString*>(data.GetInternalField(1));	907 delete reinterpret_cast<icu::UnicodeString*>(data.GetInternalField(1));

909 GlobalHandles::Destroy(reinterpret_cast<Object**>(data.GetParameter()));	908 GlobalHandles::Destroy(reinterpret_cast<Object**>(data.GetParameter()));

910 }	909 }

911	910

	911 namespace {

	912 inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

	913

	914 const uint8_t kToLower[256] = {

	915 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,

	916 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,

	917 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,

	918 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,

	919 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,

	920 0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,

	921 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,

	922 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,

	923 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,

	924 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,

	925 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,

	926 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,

	927 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,

	928 0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,

	929 0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,

	930 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,

	931 0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,

	932 0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,

	933 0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,

	934 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,

	935 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,

	936 0xFC, 0xFD, 0xFE, 0xFF,

	937 };

	938

	939 inline uint16_t ToLatin1Lower(uint16_t ch) {

	940 return static_cast<uint16_t>(kToLower[ch]);

	941 }

	942

	943 inline uint16_t ToASCIIUpper(uint16_t ch) {

	944 return ch & ~((ch >= 'a' && ch <= 'z') << 5);

	945 }

	946

	947 // Does not work for U+00DF (sharp-s), U+00B5 (micro sign), U+00FF.

	948 inline uint16_t ToLatin1Upper(uint16_t ch) {

	949 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);

	950 return ch &

	951 ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))

	952 << 5);

	953 }

	954

	955 template <typename Char>

	956 bool ToUpperFastASCII(const Vector<const Char>& src,

	957 Handle<SeqOneByteString> result) {

	958 // Do a faster loop for the case where all the characters are ASCII.

	959 uint16_t ored = 0;

	960 int32_t index = 0;

	961 for (auto it = src.begin(); it != src.end(); ++it) {

	962 uint16_t ch = static_cast<uint16_t>(*it);

	963 ored |= ch;

	964 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

	965 }

	966 return !(ored & ~0x7F);

	967 }

	968

	969 const uint16_t sharp_s = 0xDF;

	970

	971 template <typename Char>

	972 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,

	973 int* sharp_s_count) {

	974 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

	975

	976 // There are two special cases.

	977 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

	978 // 2. Lower case sharp-S converts to "SS" (two characters)

	979 *sharp_s_count = 0;

	980 for (auto it = src.begin(); it != src.end(); ++it) {

	981 uint16_t ch = static_cast<uint16_t>(*it);

	982 if (V8_UNLIKELY(ch == sharp_s)) {

	983 ++(*sharp_s_count);

	984 continue;

	985 }

	986 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {

	987 // Since this upper-cased character does not fit in an 8-bit string, we

	988 // need to take the 16-bit path.

	989 return false;

	990 }

	991 *dest++ = ToLatin1Upper(ch);

	992 }

	993

	994 return true;

	995 }

	996

	997 template <typename Char>

	998 void ToUpperWithSharpS(const Vector<const Char>& src,

	999 Handle<SeqOneByteString> result) {

	1000 int32_t dest_index = 0;

	1001 for (auto it = src.begin(); it != src.end(); ++it) {

	1002 uint16_t ch = static_cast<uint16_t>(*it);

	1003 if (ch == sharp_s) {

	1004 result->SeqOneByteStringSet(dest_index++, 'S');

	1005 result->SeqOneByteStringSet(dest_index++, 'S');

	1006 } else {

	1007 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

	1008 }

	1009 }

	1010 }

	1011

	1012 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {

	1013 for (int index = 0; index < length; ++index) {

	1014 uint16_t ch = s->Get(index);

	1015 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {

	1016 return index;

	1017 }

	1018 }

	1019 return length;

	1020 }

	1021

	1022 } // namespace

	1023

	1024 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

	1025 std::unique_ptr<uc16[]>* dest,

	1026 int32_t length) {

	1027 DCHECK(flat.IsFlat());

	1028 if (flat.IsOneByte()) {

	1029 if (!*dest) {

	1030 dest->reset(NewArray<uc16>(length));

	1031 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

	1032 }

	1033 return reinterpret_cast<const UChar*>(dest->get());

	1034 } else {

	1035 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

	1036 }

	1037 }

	1038

	1039 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

	1040 bool is_to_upper, const char* lang) {

	1041 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

	1042 int32_t src_length = s->length();

	1043 int32_t dest_length = src_length;

	1044 UErrorCode status;

	1045 Handle<SeqTwoByteString> result;

	1046 std::unique_ptr<uc16[]> sap;

	1047

	1048 if (dest_length == 0) return isolate->heap()->empty_string();

	1049

	1050 // This is not a real loop. It'll be executed only once (no overflow) or

	1051 // twice (overflow).

	1052 for (int i = 0; i < 2; ++i) {

	1053 // Case conversion can increase the string length (e.g. sharp-S => SS) so

	1054 // that we have to handle RangeError exceptions here.

	1055 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1056 isolate, result, isolate->factory()->NewRawTwoByteString(dest_length));

	1057 DisallowHeapAllocation no_gc;

	1058 DCHECK(s->IsFlat());

	1059 String::FlatContent flat = s->GetFlatContent();

	1060 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

	1061 status = U_ZERO_ERROR;

	1062 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

	1063 dest_length, src, src_length, lang, &status);

	1064 if (status != U_BUFFER_OVERFLOW_ERROR) break;

	1065 }

	1066

	1067 // In most cases, the output will fill the destination buffer completely

	1068 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

	1069 // Only in rare cases, it'll be shorter than the destination buffer and

	1070 // |result| has to be truncated.

	1071 DCHECK(U_SUCCESS(status));

	1072 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {

	1073 DCHECK(dest_length == result->length());

	1074 return *result;

	1075 }

	1076 if (U_SUCCESS(status)) {

	1077 DCHECK(dest_length < result->length());

	1078 return *Handle<SeqTwoByteString>::cast(

	1079 SeqString::Truncate(result, dest_length));

	1080 }

	1081 return *s;

	1082 }

	1083

	1084 MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {

	1085 if (!s->HasOnlyOneByteChars()) {

	1086 // Use a slower implementation for strings with characters beyond U+00FF.

	1087 return LocaleConvertCase(s, isolate, false, "");

	1088 }

	1089

	1090 int length = s->length();

	1091

	1092 // We depend here on the invariant that the length of a Latin1

	1093 // string is invariant under ToLowerCase, and the result always

	1094 // fits in the Latin1 range in the root locale. It does not hold

	1095 // for ToUpperCase even in the root locale.

	1096

	1097 // Scan the string for uppercase and non-ASCII characters for strings

	1098 // shorter than a machine-word without any memory allocation overhead.

	1099 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()

	1100 // to two parts, one for scanning the prefix with no change and the other for

	1101 // handling ASCII-only characters.

	1102 int index_to_first_unprocessed = length;

	1103 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));

	1104 if (is_short) {

	1105 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

	1106 // Nothing to do if the string is all ASCII with no uppercase.

	1107 if (index_to_first_unprocessed == length) return *s;

	1108 }

	1109

	1110 Handle<SeqOneByteString> result =

	1111 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1112

	1113 DisallowHeapAllocation no_gc;

	1114 DCHECK(s->IsFlat());

	1115 String::FlatContent flat = s->GetFlatContent();

	1116 uint8_t* dest = result->GetChars();

	1117 if (flat.IsOneByte()) {

	1118 const uint8_t* src = flat.ToOneByteVector().start();

	1119 bool has_changed_character = false;

	1120 index_to_first_unprocessed = FastAsciiConvert<true>(

	1121 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),

	1122 length, &has_changed_character);

	1123 // If not ASCII, we keep the result up to index_to_first_unprocessed and

	1124 // process the rest.

	1125 if (index_to_first_unprocessed == length)

	1126 return has_changed_character ? *result : *s;

	1127

	1128 for (int index = index_to_first_unprocessed; index < length; ++index) {

	1129 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

	1130 }

	1131 } else {

	1132 if (index_to_first_unprocessed == length) {

	1133 DCHECK(!is_short);

	1134 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

	1135 }

	1136 // Nothing to do if the string is all ASCII with no uppercase.

	1137 if (index_to_first_unprocessed == length) return *s;

	1138 const uint16_t* src = flat.ToUC16Vector().start();

	1139 CopyChars(dest, src, index_to_first_unprocessed);

	1140 for (int index = index_to_first_unprocessed; index < length; ++index) {

	1141 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

	1142 }

	1143 }

	1144

	1145 return *result;

	1146 }

	1147

	1148 MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate) {

	1149 int32_t length = s->length();

	1150 if (s->HasOnlyOneByteChars() && length > 0) {

	1151 Handle<SeqOneByteString> result =

	1152 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

	1153

	1154 DCHECK(s->IsFlat());

	1155 int sharp_s_count;

	1156 bool is_result_single_byte;

	1157 {

	1158 DisallowHeapAllocation no_gc;

	1159 String::FlatContent flat = s->GetFlatContent();

	1160 uint8_t* dest = result->GetChars();

	1161 if (flat.IsOneByte()) {

	1162 Vector<const uint8_t> src = flat.ToOneByteVector();

	1163 bool has_changed_character = false;

	1164 int index_to_first_unprocessed =

	1165 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

	1166 reinterpret_cast<const char*>(src.start()),

	1167 length, &has_changed_character);

	1168 if (index_to_first_unprocessed == length)

	1169 return has_changed_character ? *result : *s;

	1170 // If not ASCII, we keep the result up to index_to_first_unprocessed and

	1171 // process the rest.

	1172 is_result_single_byte =

	1173 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),

	1174 dest + index_to_first_unprocessed, &sharp_s_count);

	1175 } else {

	1176 DCHECK(flat.IsTwoByte());

	1177 Vector<const uint16_t> src = flat.ToUC16Vector();

	1178 if (ToUpperFastASCII(src, result)) return *result;

	1179 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);

	1180 }

	1181 }

	1182

	1183 // Go to the full Unicode path if there are characters whose uppercase

	1184 // is beyond the Latin-1 range (cannot be represented in OneByteString).

	1185 if (V8_UNLIKELY(!is_result_single_byte)) {

	1186 return LocaleConvertCase(s, isolate, true, "");

	1187 }

	1188

	1189 if (sharp_s_count == 0) return *result;

	1190

	1191 // We have sharp_s_count sharp-s characters, but the result is still

	1192 // in the Latin-1 range.

	1193 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

	1194 isolate, result,

	1195 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

	1196 DisallowHeapAllocation no_gc;

	1197 String::FlatContent flat = s->GetFlatContent();

	1198 if (flat.IsOneByte()) {

	1199 ToUpperWithSharpS(flat.ToOneByteVector(), result);

	1200 } else {

	1201 ToUpperWithSharpS(flat.ToUC16Vector(), result);

	1202 }

	1203

	1204 return *result;

	1205 }

	1206

	1207 return LocaleConvertCase(s, isolate, true, "");

	1208 }

	1209

	1210 MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,

	1211 Isolate* isolate) {

	1212 return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate);

	1213 }

	1214

912 } // namespace internal	1215 } // namespace internal

913 } // namespace v8	1216 } // namespace v8

OLD	NEW

« no previous file with comments | « src/i18n.h ('k') | src/runtime/runtime-i18n.cc » ('j') | no next file with comments »