Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(711)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 2728763006: Migrate some case conversion functions from JS to CPP builtins (Closed)
Patch Set: rebase Created 3 years, 9 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/i18n.cc ('k') | src/v8.gyp » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include <memory> 9 #include <memory>
10 10
11 #include "src/api-natives.h" 11 #include "src/api-natives.h"
12 #include "src/api.h" 12 #include "src/api.h"
13 #include "src/arguments.h" 13 #include "src/arguments.h"
14 #include "src/factory.h" 14 #include "src/factory.h"
15 #include "src/i18n.h" 15 #include "src/i18n.h"
16 #include "src/isolate-inl.h" 16 #include "src/isolate-inl.h"
17 #include "src/messages.h" 17 #include "src/messages.h"
18 #include "src/string-case.h"
19 #include "src/utils.h" 18 #include "src/utils.h"
20 19
21 #include "unicode/brkiter.h" 20 #include "unicode/brkiter.h"
22 #include "unicode/calendar.h" 21 #include "unicode/calendar.h"
23 #include "unicode/coll.h" 22 #include "unicode/coll.h"
24 #include "unicode/curramt.h" 23 #include "unicode/curramt.h"
25 #include "unicode/datefmt.h" 24 #include "unicode/datefmt.h"
26 #include "unicode/dcfmtsym.h" 25 #include "unicode/dcfmtsym.h"
27 #include "unicode/decimfmt.h" 26 #include "unicode/decimfmt.h"
28 #include "unicode/dtfmtsym.h" 27 #include "unicode/dtfmtsym.h"
(...skipping 13 matching lines...) Expand all
42 #include "unicode/ucurr.h" 41 #include "unicode/ucurr.h"
43 #include "unicode/uloc.h" 42 #include "unicode/uloc.h"
44 #include "unicode/unistr.h" 43 #include "unicode/unistr.h"
45 #include "unicode/unum.h" 44 #include "unicode/unum.h"
46 #include "unicode/ustring.h" 45 #include "unicode/ustring.h"
47 #include "unicode/uversion.h" 46 #include "unicode/uversion.h"
48 47
49 48
50 namespace v8 { 49 namespace v8 {
51 namespace internal { 50 namespace internal {
52 namespace {
53
54 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
55 std::unique_ptr<uc16[]>* dest,
56 int32_t length) {
57 DCHECK(flat.IsFlat());
58 if (flat.IsOneByte()) {
59 if (!*dest) {
60 dest->reset(NewArray<uc16>(length));
61 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
62 }
63 return reinterpret_cast<const UChar*>(dest->get());
64 } else {
65 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
66 }
67 }
68
69 } // namespace
70 51
71 // ECMA 402 6.2.3 52 // ECMA 402 6.2.3
72 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) { 53 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {
73 HandleScope scope(isolate); 54 HandleScope scope(isolate);
74 Factory* factory = isolate->factory(); 55 Factory* factory = isolate->factory();
75 56
76 DCHECK_EQ(1, args.length()); 57 DCHECK_EQ(1, args.length());
77 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0); 58 CONVERT_ARG_HANDLE_CHECKED(String, locale_id_str, 0);
78 59
79 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str)); 60 v8::String::Utf8Value locale_id(v8::Utils::ToLocal(locale_id_str));
(...skipping 736 matching lines...) Expand 10 before | Expand all | Expand 10 after
816 return *isolate->factory()->NewStringFromStaticChars("letter"); 797 return *isolate->factory()->NewStringFromStaticChars("letter");
817 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) { 798 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {
818 return *isolate->factory()->NewStringFromStaticChars("kana"); 799 return *isolate->factory()->NewStringFromStaticChars("kana");
819 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) { 800 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {
820 return *isolate->factory()->NewStringFromStaticChars("ideo"); 801 return *isolate->factory()->NewStringFromStaticChars("ideo");
821 } else { 802 } else {
822 return *isolate->factory()->NewStringFromStaticChars("unknown"); 803 return *isolate->factory()->NewStringFromStaticChars("unknown");
823 } 804 }
824 } 805 }
825 806
826 namespace {
827 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
828 bool is_to_upper, const char* lang) {
829 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
830 int32_t src_length = s->length();
831 int32_t dest_length = src_length;
832 UErrorCode status;
833 Handle<SeqTwoByteString> result;
834 std::unique_ptr<uc16[]> sap;
835
836 if (dest_length == 0) return isolate->heap()->empty_string();
837
838 // This is not a real loop. It'll be executed only once (no overflow) or
839 // twice (overflow).
840 for (int i = 0; i < 2; ++i) {
841 // Case conversion can increase the string length (e.g. sharp-S => SS) so
842 // that we have to handle RangeError exceptions here.
843 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
844 isolate, result, isolate->factory()->NewRawTwoByteString(dest_length));
845 DisallowHeapAllocation no_gc;
846 DCHECK(s->IsFlat());
847 String::FlatContent flat = s->GetFlatContent();
848 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
849 status = U_ZERO_ERROR;
850 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
851 dest_length, src, src_length, lang, &status);
852 if (status != U_BUFFER_OVERFLOW_ERROR) break;
853 }
854
855 // In most cases, the output will fill the destination buffer completely
856 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
857 // Only in rare cases, it'll be shorter than the destination buffer and
858 // |result| has to be truncated.
859 DCHECK(U_SUCCESS(status));
860 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
861 DCHECK(dest_length == result->length());
862 return *result;
863 }
864 if (U_SUCCESS(status)) {
865 DCHECK(dest_length < result->length());
866 return *Handle<SeqTwoByteString>::cast(
867 SeqString::Truncate(result, dest_length));
868 }
869 return *s;
870 }
871
// True iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  return !(ch < 'A' || ch > 'Z');
}
873
// Lower-casing table for Latin-1 code units, indexed by the code unit.
// Laid out 16 entries per row, so the row is the high nibble of the index.
// 0x41..0x5A and 0xC0..0xDE (except 0xD7) map to the value 0x20 higher;
// every other entry maps to itself.
const uint8_t kToLower[256] = {
    // 0x00 - 0x3F: unchanged.
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
    // 0x40 - 0x5F: 0x41..0x5A become 0x61..0x7A.
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    // 0x60 - 0xBF: unchanged.
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    // 0xC0 - 0xDF: become 0xE0..0xFE, except 0xD7 and 0xDF which stay put.
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    // 0xE0 - 0xFF: unchanged.
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
898
899 inline uint16_t ToLatin1Lower(uint16_t ch) {
900 return static_cast<uint16_t>(kToLower[ch]);
901 }
902
// Upper-cases |ch| if it is an ASCII letter 'a'..'z' by clearing bit 5;
// every other value is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  if (!is_ascii_lower) return ch;
  return static_cast<uint16_t>(ch & ~0x20);
}
906
907 // Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF.
908 inline uint16_t ToLatin1Upper(uint16_t ch) {
909 DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
910 return ch &
911 ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
912 << 5);
913 }
914
915 template <typename Char>
916 bool ToUpperFastASCII(const Vector<const Char>& src,
917 Handle<SeqOneByteString> result) {
918 // Do a faster loop for the case where all the characters are ASCII.
919 uint16_t ored = 0;
920 int32_t index = 0;
921 for (auto it = src.begin(); it != src.end(); ++it) {
922 uint16_t ch = static_cast<uint16_t>(*it);
923 ored |= ch;
924 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
925 }
926 return !(ored & ~0x7F);
927 }
928
929 const uint16_t sharp_s = 0xDF;
930
931 template <typename Char>
932 bool ToUpperOneByte(const Vector<const Char>& src, uint8_t* dest,
933 int* sharp_s_count) {
934 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
935
936 // There are two special cases.
937 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
938 // 2. Lower case sharp-S converts to "SS" (two characters)
939 *sharp_s_count = 0;
940 for (auto it = src.begin(); it != src.end(); ++it) {
941 uint16_t ch = static_cast<uint16_t>(*it);
942 if (V8_UNLIKELY(ch == sharp_s)) {
943 ++(*sharp_s_count);
944 continue;
945 }
946 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
947 // Since this upper-cased character does not fit in an 8-bit string, we
948 // need to take the 16-bit path.
949 return false;
950 }
951 *dest++ = ToLatin1Upper(ch);
952 }
953
954 return true;
955 }
956
957 template <typename Char>
958 void ToUpperWithSharpS(const Vector<const Char>& src,
959 Handle<SeqOneByteString> result) {
960 int32_t dest_index = 0;
961 for (auto it = src.begin(); it != src.end(); ++it) {
962 uint16_t ch = static_cast<uint16_t>(*it);
963 if (ch == sharp_s) {
964 result->SeqOneByteStringSet(dest_index++, 'S');
965 result->SeqOneByteStringSet(dest_index++, 'S');
966 } else {
967 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
968 }
969 }
970 }
971
972 inline int FindFirstUpperOrNonAscii(Handle<String> s, int length) {
973 for (int index = 0; index < length; ++index) {
974 uint16_t ch = s->Get(index);
975 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
976 return index;
977 }
978 }
979 return length;
980 }
981
982 MUST_USE_RESULT Object* ConvertToLower(Handle<String> s, Isolate* isolate) {
983 if (!s->HasOnlyOneByteChars()) {
984 // Use a slower implementation for strings with characters beyond U+00FF.
985 return LocaleConvertCase(s, isolate, false, "");
986 }
987
988 int length = s->length();
989
990 // We depend here on the invariant that the length of a Latin1
991 // string is invariant under ToLowerCase, and the result always
992 // fits in the Latin1 range in the *root locale*. It does not hold
993 // for ToUpperCase even in the root locale.
994
995 // Scan the string for uppercase and non-ASCII characters for strings
996 // shorter than a machine-word without any memory allocation overhead.
997 // TODO(jshin): Apply this to a longer input by breaking FastAsciiConvert()
998 // to two parts, one for scanning the prefix with no change and the other for
999 // handling ASCII-only characters.
1000 int index_to_first_unprocessed = length;
1001 const bool is_short = length < static_cast<int>(sizeof(uintptr_t));
1002 if (is_short) {
1003 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1004 // Nothing to do if the string is all ASCII with no uppercase.
1005 if (index_to_first_unprocessed == length) return *s;
1006 }
1007
1008 Handle<SeqOneByteString> result =
1009 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1010
1011 DisallowHeapAllocation no_gc;
1012 DCHECK(s->IsFlat());
1013 String::FlatContent flat = s->GetFlatContent();
1014 uint8_t* dest = result->GetChars();
1015 if (flat.IsOneByte()) {
1016 const uint8_t* src = flat.ToOneByteVector().start();
1017 bool has_changed_character = false;
1018 index_to_first_unprocessed = FastAsciiConvert<true>(
1019 reinterpret_cast<char*>(dest), reinterpret_cast<const char*>(src),
1020 length, &has_changed_character);
1021 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1022 // process the rest.
1023 if (index_to_first_unprocessed == length)
1024 return has_changed_character ? *result : *s;
1025
1026 for (int index = index_to_first_unprocessed; index < length; ++index) {
1027 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1028 }
1029 } else {
1030 if (index_to_first_unprocessed == length) {
1031 DCHECK(!is_short);
1032 index_to_first_unprocessed = FindFirstUpperOrNonAscii(s, length);
1033 }
1034 // Nothing to do if the string is all ASCII with no uppercase.
1035 if (index_to_first_unprocessed == length) return *s;
1036 const uint16_t* src = flat.ToUC16Vector().start();
1037 CopyChars(dest, src, index_to_first_unprocessed);
1038 for (int index = index_to_first_unprocessed; index < length; ++index) {
1039 dest[index] = ToLatin1Lower(static_cast<uint16_t>(src[index]));
1040 }
1041 }
1042
1043 return *result;
1044 }
1045
1046 MUST_USE_RESULT Object* ConvertToUpper(Handle<String> s, Isolate* isolate) {
1047 int32_t length = s->length();
1048 if (s->HasOnlyOneByteChars() && length > 0) {
1049 Handle<SeqOneByteString> result =
1050 isolate->factory()->NewRawOneByteString(length).ToHandleChecked();
1051
1052 DCHECK(s->IsFlat());
1053 int sharp_s_count;
1054 bool is_result_single_byte;
1055 {
1056 DisallowHeapAllocation no_gc;
1057 String::FlatContent flat = s->GetFlatContent();
1058 uint8_t* dest = result->GetChars();
1059 if (flat.IsOneByte()) {
1060 Vector<const uint8_t> src = flat.ToOneByteVector();
1061 bool has_changed_character = false;
1062 int index_to_first_unprocessed =
1063 FastAsciiConvert<false>(reinterpret_cast<char*>(result->GetChars()),
1064 reinterpret_cast<const char*>(src.start()),
1065 length, &has_changed_character);
1066 if (index_to_first_unprocessed == length)
1067 return has_changed_character ? *result : *s;
1068 // If not ASCII, we keep the result up to index_to_first_unprocessed and
1069 // process the rest.
1070 is_result_single_byte =
1071 ToUpperOneByte(src.SubVector(index_to_first_unprocessed, length),
1072 dest + index_to_first_unprocessed, &sharp_s_count);
1073 } else {
1074 DCHECK(flat.IsTwoByte());
1075 Vector<const uint16_t> src = flat.ToUC16Vector();
1076 if (ToUpperFastASCII(src, result)) return *result;
1077 is_result_single_byte = ToUpperOneByte(src, dest, &sharp_s_count);
1078 }
1079 }
1080
1081 // Go to the full Unicode path if there are characters whose uppercase
1082 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1083 if (V8_UNLIKELY(!is_result_single_byte)) {
1084 return LocaleConvertCase(s, isolate, true, "");
1085 }
1086
1087 if (sharp_s_count == 0) return *result;
1088
1089 // We have sharp_s_count sharp-s characters, but the result is still
1090 // in the Latin-1 range.
1091 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1092 isolate, result,
1093 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1094 DisallowHeapAllocation no_gc;
1095 String::FlatContent flat = s->GetFlatContent();
1096 if (flat.IsOneByte()) {
1097 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1098 } else {
1099 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1100 }
1101
1102 return *result;
1103 }
1104
1105 return LocaleConvertCase(s, isolate, true, "");
1106 }
1107
1108 MUST_USE_RESULT Object* ConvertCase(Handle<String> s, bool is_upper,
1109 Isolate* isolate) {
1110 return is_upper ? ConvertToUpper(s, isolate) : ConvertToLower(s, isolate);
1111 }
1112
1113 } // namespace
1114
// Runtime entry point for lower-casing: expects exactly one String argument,
// flattens it, and returns the result of ConvertToLower.
1115 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) { 807 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
1116 HandleScope scope(isolate); 808 HandleScope scope(isolate);
1117 DCHECK_EQ(args.length(), 1); 809 DCHECK_EQ(args.length(), 1);
1118 CONVERT_ARG_HANDLE_CHECKED(String, s, 0); 810 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1119 s = String::Flatten(s); 811 s = String::Flatten(s);
1120 return ConvertToLower(s, isolate); 812 return ConvertToLower(s, isolate);
1121 } 813 }
1122 814
1123 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) { 815 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1124 HandleScope scope(isolate); 816 HandleScope scope(isolate);
(...skipping 57 matching lines...) Expand 10 before | Expand all | Expand 10 after
1182 Handle<FixedArray> date_cache_version = 874 Handle<FixedArray> date_cache_version =
1183 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton( 875 Handle<FixedArray>::cast(isolate->eternal_handles()->GetSingleton(
1184 EternalHandles::DATE_CACHE_VERSION)); 876 EternalHandles::DATE_CACHE_VERSION));
1185 return date_cache_version->get(0); 877 return date_cache_version->get(0);
1186 } 878 }
1187 879
1188 } // namespace internal 880 } // namespace internal
1189 } // namespace v8 881 } // namespace v8
1190 882
1191 #endif // V8_I18N_SUPPORT 883 #endif // V8_I18N_SUPPORT
OLDNEW
« no previous file with comments | « src/i18n.cc ('k') | src/v8.gyp » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698