Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(26)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Yang's comment addressed - return right away for no-change Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include "src/api.h" 9 #include "src/api.h"
10 #include "src/api-natives.h" 10 #include "src/api-natives.h"
(...skipping 11 matching lines...) Expand all
22 #include "unicode/dcfmtsym.h" 22 #include "unicode/dcfmtsym.h"
23 #include "unicode/decimfmt.h" 23 #include "unicode/decimfmt.h"
24 #include "unicode/dtfmtsym.h" 24 #include "unicode/dtfmtsym.h"
25 #include "unicode/dtptngen.h" 25 #include "unicode/dtptngen.h"
26 #include "unicode/locid.h" 26 #include "unicode/locid.h"
27 #include "unicode/numfmt.h" 27 #include "unicode/numfmt.h"
28 #include "unicode/numsys.h" 28 #include "unicode/numsys.h"
29 #include "unicode/rbbi.h" 29 #include "unicode/rbbi.h"
30 #include "unicode/smpdtfmt.h" 30 #include "unicode/smpdtfmt.h"
31 #include "unicode/timezone.h" 31 #include "unicode/timezone.h"
32 #include "unicode/translit.h"
32 #include "unicode/uchar.h" 33 #include "unicode/uchar.h"
33 #include "unicode/ucol.h" 34 #include "unicode/ucol.h"
34 #include "unicode/ucurr.h" 35 #include "unicode/ucurr.h"
35 #include "unicode/uloc.h" 36 #include "unicode/uloc.h"
37 #include "unicode/unistr.h"
36 #include "unicode/unum.h" 38 #include "unicode/unum.h"
37 #include "unicode/uversion.h" 39 #include "unicode/uversion.h"
38 40
39 41
40 namespace v8 { 42 namespace v8 {
41 namespace internal { 43 namespace internal {
42 44
43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) { 45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {
44 HandleScope scope(isolate); 46 HandleScope scope(isolate);
45 Factory* factory = isolate->factory(); 47 Factory* factory = isolate->factory();
(...skipping 696 matching lines...) Expand 10 before | Expand all | Expand 10 after
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) { 744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {
743 return *isolate->factory()->NewStringFromStaticChars("letter"); 745 return *isolate->factory()->NewStringFromStaticChars("letter");
744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) { 746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {
745 return *isolate->factory()->NewStringFromStaticChars("kana"); 747 return *isolate->factory()->NewStringFromStaticChars("kana");
746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) { 748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {
747 return *isolate->factory()->NewStringFromStaticChars("ideo"); 749 return *isolate->factory()->NewStringFromStaticChars("ideo");
748 } else { 750 } else {
749 return *isolate->factory()->NewStringFromStaticChars("unknown"); 751 return *isolate->factory()->NewStringFromStaticChars("unknown");
750 } 752 }
751 } 753 }
754
srl295 2016/07/27 18:53:39 filed ICU bug http://bugs.icu-project.org/trac/tic
755 namespace {
756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,
757 const char* transliterator_id) {
758 UErrorCode status = U_ZERO_ERROR;
759 base::SmartPointer<icu::Transliterator> translit(
760 icu::Transliterator::createInstance(
761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,
762 status));
763 if (U_FAILURE(status)) return;
764 translit->transliterate(*input);
765 }
766
767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
768 base::SmartArrayPointer<uc16>* dest,
769 int32_t length) {
770 DCHECK(flat.IsFlat());
771 if (flat.IsOneByte()) {
772 if (dest->is_empty()) {
773 dest->Reset(NewArray<uc16>(length));
774 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
775 }
776 return reinterpret_cast<const UChar*>(dest->get());
777 } else {
778 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
779 }
780 }
781
782 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
783 bool is_to_upper, const char* lang) {
784 int32_t src_length = s->length();
785
786 // Greek uppercasing has to be done via transliteration.
787 // TODO(jshin): Drop this special-casing once ICU's regular case conversion
788 // API supports Greek uppercasing. See
789 // http://bugs.icu-project.org/trac/ticket/10582 .
790 // In the meantime, if there's no Greek character in |s|, call this
791 // function again with the root locale (lang="").
792 // ICU's C API for transliteration is nasty and we just use C++ API.
793 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {
794 icu::UnicodeString converted;
795 base::SmartArrayPointer<uc16> sap;
796 {
797 DisallowHeapAllocation no_gc;
798 String::FlatContent flat = s->GetFlatContent();
799 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
800 // Starts with the source string (read-only alias with copy-on-write
801 // semantics) and will be modified to contain the converted result.
802 // Using read-only alias at first saves one copy operation if
803 // transliteration does not change the input, which is rather rare.
804 // Moreover, transliteration takes rather long so that saving one copy
805 // helps only a little bit.
806 converted.setTo(false, src, src_length);
807 ConvertCaseWithTransliterator(&converted, "el-Upper");
808 // If no change is made, just return |s|.
809 if (converted.getBuffer() == src) return *s;
810 }
811 Handle<String> result;
812 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
813 isolate, result,
814 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
815 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
816 converted.length())));
817 return *result;
818 }
819
820 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
821
822 int32_t dest_length = src_length;
823 UErrorCode status;
824 Handle<SeqTwoByteString> result;
825 base::SmartArrayPointer<uc16> sap;
826
827 // This is not a real loop. It'll be executed only once (no overflow) or
828 // twice (overflow).
829 for (int i = 0; i < 2; ++i) {
830 result =
831 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();
832 DisallowHeapAllocation no_gc;
833 String::FlatContent flat = s->GetFlatContent();
834 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
835 status = U_ZERO_ERROR;
836 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
837 dest_length, src, src_length, lang, &status);
838 if (status != U_BUFFER_OVERFLOW_ERROR) break;
839 }
840
841 // In most cases, the output will fill the destination buffer completely
842 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
843 // Only in rare cases, it'll be shorter than the destination buffer and
844 // |result| has to be truncated.
845 DCHECK(U_SUCCESS(status));
846 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
847 DCHECK(dest_length == result->length());
848 return *result;
849 }
850 if (U_SUCCESS(status)) {
851 DCHECK(dest_length < result->length());
852 return *Handle<SeqTwoByteString>::cast(
853 SeqString::Truncate(result, dest_length));
854 }
855 return *s;
856 }
857
// Returns true iff |ch| is one of the ASCII capital letters 'A' through 'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  const bool outside_range = ch < 'A' || ch > 'Z';
  return !outside_range;
}
859
// Lower-case mapping for every Latin-1 code unit, indexed by the code unit.
// Only 'A'-'Z' (0x41-0x5A) and 0xC0-0xDE except 0xD7 (multiplication sign)
// map to a different value, namely the code unit 0x20 above; every other
// entry maps to itself. Eight entries per row; the trailing comment gives the
// index of the first entry in the row.
const uint8_t kToLower[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  // 0x00
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  // 0x08
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,  // 0x10
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,  // 0x18
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,  // 0x20
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,  // 0x28
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,  // 0x30
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,  // 0x38
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  // 0x40
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x48
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,  // 0x50
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,  // 0x58
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  // 0x60
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x68
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,  // 0x70
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,  // 0x78
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,  // 0x80
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,  // 0x88
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,  // 0x90
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,  // 0x98
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,  // 0xA0
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,  // 0xA8
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,  // 0xB0
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,  // 0xB8
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,  // 0xC0
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xC8
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,  // 0xD0
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,  // 0xD8
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,  // 0xE0
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xE8
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,  // 0xF0
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,  // 0xF8
};
884
885 inline uint16_t ToLatin1Lower(uint16_t ch) {
886 return static_cast<uint16_t>(kToLower[ch]);
887 }
888
// Maps 'a'-'z' to 'A'-'Z'; any other code unit is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = 'a' <= ch && ch <= 'z';
  // For 'a'-'z', subtracting 0x20 is the same as clearing bit 5.
  return is_ascii_lower ? static_cast<uint16_t>(ch - ('a' - 'A')) : ch;
}
892
// Upper-cases a single Latin-1 code unit by clearing bit 5 (0x20) when |ch|
// is 'a'-'z' or lies in 0xE0-0xFF, and returns every other code unit
// unchanged.
//
// U+00F7 (division sign) sits inside the 0xE0-0xFF block but is not a letter,
// so it is excluded explicitly; otherwise it would turn into U+00D7
// (multiplication sign).
//
// Does not work for U+00DF (sharp-s), U+00B5 (micro sign) and U+00FF; callers
// must handle those three before calling.
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  return ch &
         ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
           << 5);
}
900
901 template <typename Char>
902 bool ToUpperFastASCII(const Vector<const Char>& src,
903 Handle<SeqOneByteString> result) {
904 // Do a faster loop for the case where all the characters are ASCII.
905 uint16_t ored = 0;
906 int32_t index = 0;
907 for (auto it = src.begin(); it != src.end(); ++it) {
908 uint16_t ch = static_cast<uint16_t>(*it);
909 ored |= ch;
910 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
911 }
912 return !(ored & ~0x7F);
913 }
914
// U+00DF LATIN SMALL LETTER SHARP S, which upper-cases to the two letters
// "SS".
const uint16_t sharp_s = 0x00DF;
916
917 template <typename Char>
918 bool ToUpperOneByte(const Vector<const Char>& src,
919 Handle<SeqOneByteString> result, int* sharp_s_count) {
920 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
921
922 // There are two special cases.
923 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
924 // 2. Lower case sharp-S converts to "SS" (two characters)
925 *sharp_s_count = 0;
926 int32_t index = 0;
927 for (auto it = src.begin(); it != src.end(); ++it) {
928 uint16_t ch = static_cast<uint16_t>(*it);
929 if (V8_UNLIKELY(ch == sharp_s)) {
930 ++(*sharp_s_count);
931 continue;
932 }
933 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
934 // Since this upper-cased character does not fit in an 8-bit string, we
935 // need to take the 16-bit path.
936 return false;
937 }
938 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
939 }
940
941 return true;
942 }
943
944 template <typename Char>
945 void ToUpperWithSharpS(const Vector<const Char>& src,
946 Handle<SeqOneByteString> result) {
947 int32_t dest_index = 0;
948 for (auto it = src.begin(); it != src.end(); ++it) {
949 uint16_t ch = static_cast<uint16_t>(*it);
950 if (ch == sharp_s) {
951 result->SeqOneByteStringSet(dest_index++, 'S');
952 result->SeqOneByteStringSet(dest_index++, 'S');
953 } else {
954 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
955 }
956 }
957 }
958
959 } // namespace
960
961 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
962 HandleScope scope(isolate);
963 DCHECK_EQ(args.length(), 1);
964 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
965
966 int length = s->length();
967 s = String::Flatten(s);
968 // First scan the string for uppercase and non-ASCII characters:
969 if (s->HasOnlyOneByteChars()) {
970 unsigned first_index_to_lower = length;
971 for (int index = 0; index < length; ++index) {
972 // Blink specializes this path for one-byte strings, so it
973 // does not need to do a generic get, but can do the equivalent
974 // of SeqOneByteStringGet.
975 uint16_t ch = s->Get(index);
976 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
977 first_index_to_lower = index;
978 break;
979 }
980 }
981
982 // Nothing to do if the string is all ASCII with no uppercase.
983 if (first_index_to_lower == length) return *s;
984
985 // We depend here on the invariant that the length of a Latin1
986 // string is invariant under ToLowerCase, and the result always
987 // fits in the Latin1 range in the *root locale*. It does not hold
988 // for ToUpperCase even in the root locale.
989 Handle<SeqOneByteString> result;
990 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
991 isolate, result, isolate->factory()->NewRawOneByteString(length));
992
993 DisallowHeapAllocation no_gc;
994 String::FlatContent flat = s->GetFlatContent();
995 if (flat.IsOneByte()) {
996 const uint8_t* src = flat.ToOneByteVector().start();
997 CopyChars(result->GetChars(), src, first_index_to_lower);
998 for (int index = first_index_to_lower; index < length; ++index) {
999 uint16_t ch = static_cast<uint16_t>(src[index]);
1000 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1001 }
1002 } else {
1003 const uint16_t* src = flat.ToUC16Vector().start();
1004 CopyChars(result->GetChars(), src, first_index_to_lower);
1005 for (int index = first_index_to_lower; index < length; ++index) {
1006 uint16_t ch = src[index];
1007 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1008 }
1009 }
1010
1011 return *result;
1012 }
1013
1014 // Blink had an additional case here for ASCII 2-byte strings, but
1015 // that is subsumed by the above code (assuming there isn't a false
1016 // negative for HasOnlyOneByteChars).
1017
1018 // Do a slower implementation for cases that include non-ASCII characters.
1019 return LocaleConvertCase(s, isolate, false, "");
1020 }
1021
1022 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1023 HandleScope scope(isolate);
1024 DCHECK_EQ(args.length(), 1);
1025 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1026
1027 // This function could be optimized for no-op cases the way lowercase
1028 // counterpart is, but in empirical testing, few actual calls to upper()
1029 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1030
1031 int32_t length = s->length();
1032 s = String::Flatten(s);
1033
1034 if (s->HasOnlyOneByteChars()) {
1035 Handle<SeqOneByteString> result;
1036 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1037 isolate, result, isolate->factory()->NewRawOneByteString(length));
1038
1039 int sharp_s_count;
1040 bool is_result_single_byte;
1041 {
1042 DisallowHeapAllocation no_gc;
1043 String::FlatContent flat = s->GetFlatContent();
1044 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
1045 // could be removed because ToUpperOneByte is pretty fast now (it
1046 // does not call ICU API any more.).
1047 if (flat.IsOneByte()) {
1048 Vector<const uint8_t> src = flat.ToOneByteVector();
1049 if (ToUpperFastASCII(src, result)) return *result;
1050 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1051 } else {
1052 DCHECK(flat.IsTwoByte());
1053 Vector<const uint16_t> src = flat.ToUC16Vector();
1054 if (ToUpperFastASCII(src, result)) return *result;
1055 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1056 }
1057 }
1058
1059 // Go to the full Unicode path if there are characters whose uppercase
1060 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1061 if (V8_UNLIKELY(!is_result_single_byte)) {
1062 return LocaleConvertCase(s, isolate, true, "");
1063 }
1064
1065 if (sharp_s_count == 0) return *result;
1066
1067 // We have sharp_s_count sharp-s characters, but the result is still
1068 // in the Latin-1 range.
1069 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1070 isolate, result,
1071 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1072 DisallowHeapAllocation no_gc;
1073 String::FlatContent flat = s->GetFlatContent();
1074 if (flat.IsOneByte()) {
1075 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1076 } else {
1077 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1078 }
1079
1080 return *result;
1081 }
1082
1083 return LocaleConvertCase(s, isolate, true, "");
1084 }
1085
1086 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {
1087 HandleScope scope(isolate);
1088 DCHECK_EQ(args.length(), 3);
1089 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1090 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);
1091 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);
1092
1093 // All the languages requiring special handling ("az", "el", "lt", "tr")
1094 // have a 2-letter language code.
1095 DCHECK(lang->length() == 2);
1096 uint8_t lang_str[3];
1097 memcpy(lang_str, lang->GetChars(), 2);
1098 lang_str[2] = 0;
1099 s = String::Flatten(s);
1100 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath
1101 // in the root locale needs to be adjusted for az, lt and tr because even case
1102 // mapping of ASCII range characters are different in those locales.
1103 // Greek (el) does not require any adjustment, though.
1104 return LocaleConvertCase(s, isolate, is_upper,
1105 reinterpret_cast<const char*>(lang_str));
1106 }
1107
752 } // namespace internal 1108 } // namespace internal
753 } // namespace v8 1109 } // namespace v8
754 1110
755 #endif // V8_I18N_SUPPORT 1111 #endif // V8_I18N_SUPPORT
OLDNEW
« src/js/i18n.js ('K') | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698