src/runtime/runtime-i18n.cc - Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins (Closed)

Patch Set: rebase Created 3 years, 9 months ago

Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.

Jump to:

OLD	NEW
1 // Copyright 2014 the V8 project authors. All rights reserved.	1 // Copyright 2014 the V8 project authors. All rights reserved.

2 // Use of this source code is governed by a BSD-style license that can be	2 // Use of this source code is governed by a BSD-style license that can be

3 // found in the LICENSE file.	3 // found in the LICENSE file.

4	4

5	5

6 #ifdef V8_I18N_SUPPORT	6 #ifdef V8_I18N_SUPPORT

7 #include "src/runtime/runtime-utils.h"	7 #include "src/runtime/runtime-utils.h"

8	8

9 #include <memory>	9 #include <memory>

10	10

11 #include "src/api-natives.h"	11 #include "src/api-natives.h"

12 #include "src/api.h"	12 #include "src/api.h"

13 #include "src/arguments.h"	13 #include "src/arguments.h"

14 #include "src/factory.h"	14 #include "src/factory.h"

15 #include "src/i18n.h"	15 #include "src/i18n.h"

16 #include "src/isolate-inl.h"	16 #include "src/isolate-inl.h"

17 #include "src/messages.h"	17 #include "src/messages.h"

18 #include "src/string-case.h"

19 #include "src/utils.h"	18 #include "src/utils.h"

20	19

21 #include "unicode/brkiter.h"	20 #include "unicode/brkiter.h"

22 #include "unicode/calendar.h"	21 #include "unicode/calendar.h"

23 #include "unicode/coll.h"	22 #include "unicode/coll.h"

24 #include "unicode/curramt.h"	23 #include "unicode/curramt.h"

25 #include "unicode/datefmt.h"	24 #include "unicode/datefmt.h"

26 #include "unicode/dcfmtsym.h"	25 #include "unicode/dcfmtsym.h"

27 #include "unicode/decimfmt.h"	26 #include "unicode/decimfmt.h"

28 #include "unicode/dtfmtsym.h"	27 #include "unicode/dtfmtsym.h"

(...skipping 13 matching lines...) Expand all Loading...
42 #include "unicode/ucurr.h"	41 #include "unicode/ucurr.h"

43 #include "unicode/uloc.h"	42 #include "unicode/uloc.h"

44 #include "unicode/unistr.h"	43 #include "unicode/unistr.h"

45 #include "unicode/unum.h"	44 #include "unicode/unum.h"

46 #include "unicode/ustring.h"	45 #include "unicode/ustring.h"

47 #include "unicode/uversion.h"	46 #include "unicode/uversion.h"

48	47

49	48

50 namespace v8 {	49 namespace v8 {

51 namespace internal {	50 namespace internal {

52 namespace {

53

54 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,

55 std::unique_ptr<uc16[]>* dest,

56 int32_t length) {

57 DCHECK(flat.IsFlat());

58 if (flat.IsOneByte()) {

59 if (!*dest) {

60 dest->reset(NewArray<uc16>(length));

61 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);

62 }

63 return reinterpret_cast<const UChar*>(dest->get());

64 } else {

65 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());

66 }

67 }

68

69 } // namespace

70	51

71 // ECMA 402 6.2.3	52 // ECMA 402 6.2.3

72 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {	53 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {

73 HandleScope scope(isolate);	54 HandleScope scope(isolate);

74 Factory* factory = isolate->factory();	55 Factory* factory = isolate->factory();

75	56

76 DCHECK_EQ(1, args.length());	57 DCHECK_EQ(1, args.length());

77 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0);	58 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0);

78	59

79 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str));	60 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str));

(...skipping 736 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
816 return *isolate->factory()->NewStringFromStaticChars("letter");	797 return *isolate->factory()->NewStringFromStaticChars("letter");

817 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {	798 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {

818 return *isolate->factory()->NewStringFromStaticChars("kana");	799 return *isolate->factory()->NewStringFromStaticChars("kana");

819 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {	800 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {

820 return *isolate->factory()->NewStringFromStaticChars("ideo");	801 return *isolate->factory()->NewStringFromStaticChars("ideo");

821 } else {	802 } else {

822 return *isolate->factory()->NewStringFromStaticChars("unknown");	803 return *isolate->factory()->NewStringFromStaticChars("unknown");

823 }	804 }

824 }	805 }

825	806

826 namespace {

827 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,

828 bool is_to_upper, const char* lang) {

829 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;

830 int32_t src_length = s->length();

831 int32_t dest_length = src_length;

832 UErrorCode status;

833 Handle<SeqTwoByteString> result;

834 std::unique_ptr<uc16[]> sap;

835

836 if (dest_length == 0) return isolate->heap()->empty_string();

837

838 // This is not a real loop. It'll be executed only once (no overflow) or

839 // twice (overflow).

840 for (int i = 0; i < 2; ++i) {

841 // Case conversion can increase the string length (e.g. sharp-S => SS) so

842 // that we have to handle RangeError exceptions here.

843 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

844 isolate, result, isolate->factory()->NewRawTwoByteString(dest_length));

845 DisallowHeapAllocation no_gc;

846 DCHECK(s->IsFlat());

847 String::FlatContent flat = s->GetFlatContent();

848 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);

849 status = U_ZERO_ERROR;

850 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),

851 dest_length, src, src_length, lang, &status);

852 if (status != U_BUFFER_OVERFLOW_ERROR) break;

853 }

854

855 // In most cases, the output will fill the destination buffer completely

856 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).

857 // Only in rare cases, it'll be shorter than the destination buffer and

858 // \|result\| has to be truncated.

859 DCHECK(U_SUCCESS(status));

860 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {

861 DCHECK(dest_length == result->length());

862 return *result;

863 }

864 if (U_SUCCESS(status)) {

865 DCHECK(dest_length < result->length());

866 return *Handle<SeqTwoByteString>::cast(

867 SeqString::Truncate(result, dest_length));

868 }

869 return *s;

870 }

871

// Returns true iff |ch| is an ASCII uppercase letter ('A'..'Z').
inline bool IsASCIIUpper(uint16_t ch) { return ch >= 'A' && ch <= 'Z'; }

873

// Lower-casing table for the Latin-1 range, indexed by code point (0..255).
// 'A'..'Z' (0x41..0x5A) map to 'a'..'z', and 0xC0..0xDE map to 0xE0..0xFE,
// except 0xD7 which maps to itself. Every other entry is the identity.
const uint8_t kToLower[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B,
    0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23,
    0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B,
    0x3C, 0x3D, 0x3E, 0x3F, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73,
    0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B,
    0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F, 0x80, 0x81, 0x82, 0x83,
    0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9A, 0x9B,
    0x9C, 0x9D, 0x9E, 0x9F, 0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF, 0xB0, 0xB1, 0xB2, 0xB3,
    0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB,
    0xEC, 0xED, 0xEE, 0xEF, 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF, 0xE0, 0xE1, 0xE2, 0xE3,
    0xE4, 0xE5, 0xE6, 0xE7, 0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8, 0xF9, 0xFA, 0xFB,
    0xFC, 0xFD, 0xFE, 0xFF,
};

898

899 inline uint16_t ToLatin1Lower(uint16_t ch) {

900 return static_cast<uint16_t>(kToLower[ch]);

901 }

902

// Upper-cases |ch| if it is an ASCII lowercase letter ('a'..'z') and returns
// every other value unchanged. The comparison yields 0 or 1; shifted left by
// five it becomes the 0x20 bit that separates ASCII lower case from upper
// case, and that bit is cleared branch-free.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  return ch & ~((ch >= 'a' && ch <= 'z') << 5);
}

906

907 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF.

908 inline uint16_t ToLatin1Upper(uint16_t ch) {

909 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);

910 return ch &

911 ~(((ch >= 'a' && ch <= 'z') \|\| (((ch & 0xE0) == 0xE0) && ch != 0xF7))

912 << 5);

913 }

914

915 template <typename Char>

916 bool ToUpperFastASCII(const Vector<const Char>& src,

917 Handle<SeqOneByteString> result) {

918 // Do a faster loop for the case where all the characters are ASCII.

919 uint16_t ored = 0;

920 int32_t index = 0;

921 for (auto it = src.begin(); it != src.end(); ++it) {

922 uint16_t ch = static_cast<uint16_t>(*it);

923 ored \|= ch;

924 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));

925 }

926 return !(ored & ~0x7F);

927 }

928

// U+00DF, lower case sharp-S. Its upper-case form is the two characters "SS".
const uint16_t sharp_s = 0xDF;

930

931 template <typename Char>

932 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,

933 int* sharp_s_count) {

934 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.

935

936 // There are two special cases.

937 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.

938 // 2. Lower case sharp-S converts to "SS" (two characters)

939 *sharp_s_count = 0;

940 for (auto it = src.begin(); it != src.end(); ++it) {

941 uint16_t ch = static_cast<uint16_t>(*it);

942 if (V8_UNLIKELY(ch == sharp_s)) {

943 ++(*sharp_s_count);

944 continue;

945 }

946 if (V8_UNLIKELY(ch == 0xB5 \|\| ch == 0xFF)) {

947 // Since this upper-cased character does not fit in an 8-bit string, we

948 // need to take the 16-bit path.

949 return false;

950 }

951 *dest++ = ToLatin1Upper(ch);

952 }

953

954 return true;

955 }

956

957 template <typename Char>

958 void ToUpperWithSharpS(const Vector<const Char>& src,

959 Handle<SeqOneByteString> result) {

960 int32_t dest_index = 0;

961 for (auto it = src.begin(); it != src.end(); ++it) {

962 uint16_t ch = static_cast<uint16_t>(*it);

963 if (ch == sharp_s) {

964 result->SeqOneByteStringSet(dest_index++, 'S');

965 result->SeqOneByteStringSet(dest_index++, 'S');

966 } else {

967 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));

968 }

969 }

970 }

971

972 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {

973 for (int index = 0; index < length; ++index) {

974 uint16_t ch = s->Get(index);

975 if (V8_UNLIKELY(IsASCIIUpper(ch) \|\| ch & ~0x7F)) {

976 return index;

977 }

978 }

979 return length;

980 }

981

982 MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {

983 if (!s->HasOnlyOneByteChars()) {

984 // Use a slower implementation for strings with characters beyond U+00FF.

985 return LocaleConvertCase(s, isolate, false, "");

986 }

987

988 int length = s->length();

989

990 // We depend here on the invariant that the length of a Latin1

991 // string is invariant under ToLowerCase, and the result always

992 // fits in the Latin1 range in the root locale. It does not hold

993 // for ToUpperCase even in the root locale.

994

995 // Scan the string for uppercase and non-ASCII characters for strings

996 // shorter than a machine-word without any memory allocation overhead.

997 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()

998 // to two parts, one for scanning the prefix with no change and the other for

999 // handling ASCII-only characters.

1000 int index_to_first_unprocessed = length;

1001 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));

1002 if (is_short) {

1003 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

1004 // Nothing to do if the string is all ASCII with no uppercase.

1005 if (index_to_first_unprocessed == length) return *s;

1006 }

1007

1008 Handle<SeqOneByteString> result =

1009 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

1010

1011 DisallowHeapAllocation no_gc;

1012 DCHECK(s->IsFlat());

1013 String::FlatContent flat = s->GetFlatContent();

1014 uint8_t* dest = result->GetChars();

1015 if (flat.IsOneByte()) {

1016 const uint8_t* src = flat.ToOneByteVector().start();

1017 bool has_changed_character = false;

1018 index_to_first_unprocessed = FastAsciiConvert<true>(

1019 reinterpret_cast<char>(dest), reinterpret_cast<const char>(src),

1020 length, &has_changed_character);

1021 // If not ASCII, we keep the result up to index_to_first_unprocessed and

1022 // process the rest.

1023 if (index_to_first_unprocessed == length)

1024 return has_changed_character ? result : s;

1025

1026 for (int index = index_to_first_unprocessed; index < length; ++index) {

1027 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

1028 }

1029 } else {

1030 if (index_to_first_unprocessed == length) {

1031 DCHECK(!is_short);

1032 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);

1033 }

1034 // Nothing to do if the string is all ASCII with no uppercase.

1035 if (index_to_first_unprocessed == length) return *s;

1036 const uint16_t* src = flat.ToUC16Vector().start();

1037 CopyChars(dest, src, index_to_first_unprocessed);

1038 for (int index = index_to_first_unprocessed; index < length; ++index) {

1039 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));

1040 }

1041 }

1042

1043 return *result;

1044 }

1045

1046 MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate) {

1047 int32_t length = s->length();

1048 if (s->HasOnlyOneByteChars() && length > 0) {

1049 Handle<SeqOneByteString> result =

1050 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();

1051

1052 DCHECK(s->IsFlat());

1053 int sharp_s_count;

1054 bool is_result_single_byte;

1055 {

1056 DisallowHeapAllocation no_gc;

1057 String::FlatContent flat = s->GetFlatContent();

1058 uint8_t* dest = result->GetChars();

1059 if (flat.IsOneByte()) {

1060 Vector<const uint8_t> src = flat.ToOneByteVector();

1061 bool has_changed_character = false;

1062 int index_to_first_unprocessed =

1063 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),

1064 reinterpret_cast<const char*>(src.start()),

1065 length, &has_changed_character);

1066 if (index_to_first_unprocessed == length)

1067 return has_changed_character ? result : s;

1068 // If not ASCII, we keep the result up to index_to_first_unprocessed and

1069 // process the rest.

1070 is_result_single_byte =

1071 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),

1072 dest + index_to_first_unprocessed, &sharp_s_count);

1073 } else {

1074 DCHECK(flat.IsTwoByte());

1075 Vector<const uint16_t> src = flat.ToUC16Vector();

1076 if (ToUpperFastASCII(src, result)) return *result;

1077 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);

1078 }

1079 }

1080

1081 // Go to the full Unicode path if there are characters whose uppercase

1082 // is beyond the Latin-1 range (cannot be represented in OneByteString).

1083 if (V8_UNLIKELY(!is_result_single_byte)) {

1084 return LocaleConvertCase(s, isolate, true, "");

1085 }

1086

1087 if (sharp_s_count == 0) return *result;

1088

1089 // We have sharp_s_count sharp-s characters, but the result is still

1090 // in the Latin-1 range.

1091 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(

1092 isolate, result,

1093 isolate->factory()->NewRawOneByteString(length + sharp_s_count));

1094 DisallowHeapAllocation no_gc;

1095 String::FlatContent flat = s->GetFlatContent();

1096 if (flat.IsOneByte()) {

1097 ToUpperWithSharpS(flat.ToOneByteVector(), result);

1098 } else {

1099 ToUpperWithSharpS(flat.ToUC16Vector(), result);

1100 }

1101

1102 return *result;

1103 }

1104

1105 return LocaleConvertCase(s, isolate, true, "");

1106 }

1107

1108 MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,

1109 Isolate* isolate) {

1110 return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate);

1111 }

1112

1113 } // namespace

1114

1115 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {	807 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {

1116 HandleScope scope(isolate);	808 HandleScope scope(isolate);

1117 DCHECK_EQ(args.length(), 1);	809 DCHECK_EQ(args.length(), 1);

1118 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);	810 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);

1119 s = String::Flatten(s);	811 s = String::Flatten(s);

1120 return ConvertToLower(s, isolate);	812 return ConvertToLower(s, isolate);

1121 }	813 }

1122	814

1123 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {	815 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {

1124 HandleScope scope(isolate);	816 HandleScope scope(isolate);

(...skipping 57 matching lines...) Expand 10 before \| Expand all \| Expand 10 after Loading...
1182 Handle<FixedArray> date_cache_version =	874 Handle<FixedArray> date_cache_version =

1183 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(	875 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(

1184 EternalHandles::DATE_CACHE_VERSION));	876 EternalHandles::DATE_CACHE_VERSION));

1185 return date_cache_version->get(0);	877 return date_cache_version->get(0);

1186 }	878 }

1187	879

1188 } // namespace internal	880 } // namespace internal

1189 } // namespace v8	881 } // namespace v8

1190	882

1191 #endif // V8_I18N_SUPPORT	883 #endif // V8_I18N_SUPPORT

OLD	NEW

« no previous file with comments | « src/i18n.cc ('k') | src/v8.gyp » ('j') | no next file with comments »