Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(272)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: working with icu-case-mapping.js manually compiled in; gyp change not picking it up Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include "src/api.h" 9 #include "src/api.h"
10 #include "src/api-natives.h" 10 #include "src/api-natives.h"
(...skipping 11 matching lines...) Expand all
22 #include "unicode/dcfmtsym.h" 22 #include "unicode/dcfmtsym.h"
23 #include "unicode/decimfmt.h" 23 #include "unicode/decimfmt.h"
24 #include "unicode/dtfmtsym.h" 24 #include "unicode/dtfmtsym.h"
25 #include "unicode/dtptngen.h" 25 #include "unicode/dtptngen.h"
26 #include "unicode/locid.h" 26 #include "unicode/locid.h"
27 #include "unicode/numfmt.h" 27 #include "unicode/numfmt.h"
28 #include "unicode/numsys.h" 28 #include "unicode/numsys.h"
29 #include "unicode/rbbi.h" 29 #include "unicode/rbbi.h"
30 #include "unicode/smpdtfmt.h" 30 #include "unicode/smpdtfmt.h"
31 #include "unicode/timezone.h" 31 #include "unicode/timezone.h"
32 #include "unicode/translit.h"
32 #include "unicode/uchar.h" 33 #include "unicode/uchar.h"
33 #include "unicode/ucol.h" 34 #include "unicode/ucol.h"
34 #include "unicode/ucurr.h" 35 #include "unicode/ucurr.h"
35 #include "unicode/uloc.h" 36 #include "unicode/uloc.h"
37 #include "unicode/unistr.h"
36 #include "unicode/unum.h" 38 #include "unicode/unum.h"
37 #include "unicode/uversion.h" 39 #include "unicode/uversion.h"
38 40
39 41
40 namespace v8 { 42 namespace v8 {
41 namespace internal { 43 namespace internal {
42 44
43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) { 45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {
44 HandleScope scope(isolate); 46 HandleScope scope(isolate);
45 Factory* factory = isolate->factory(); 47 Factory* factory = isolate->factory();
(...skipping 696 matching lines...) Expand 10 before | Expand all | Expand 10 after
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) { 744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {
743 return *isolate->factory()->NewStringFromStaticChars("letter"); 745 return *isolate->factory()->NewStringFromStaticChars("letter");
744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) { 746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {
745 return *isolate->factory()->NewStringFromStaticChars("kana"); 747 return *isolate->factory()->NewStringFromStaticChars("kana");
746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) { 748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {
747 return *isolate->factory()->NewStringFromStaticChars("ideo"); 749 return *isolate->factory()->NewStringFromStaticChars("ideo");
748 } else { 750 } else {
749 return *isolate->factory()->NewStringFromStaticChars("unknown"); 751 return *isolate->factory()->NewStringFromStaticChars("unknown");
750 } 752 }
751 } 753 }
754
755 namespace {
756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,
757 const char* transliterator_id) {
758 UErrorCode status = U_ZERO_ERROR;
759 base::SmartPointer<icu::Transliterator> translit(
760 icu::Transliterator::createInstance(
761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,
762 status));
763 if (U_FAILURE(status)) return;
764 translit->transliterate(*input);
765 }
766
767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat, uc16* dest,
768 int32_t length) {
769 DCHECK(flat.IsFlat());
770 if (flat.IsOneByte()) {
771 CopyChars(dest, flat.ToOneByteVector().start(), length);
772 return static_cast<const UChar*>(dest);
773 } else {
774 return static_cast<const UChar*>(flat.ToUC16Vector().start());
775 }
776 }
777
778 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
779 bool is_to_upper, const char* lang) {
780 int32_t src_length = s->length();
781
782 // Greek uppercasing has to be done via transliteration.
783 // TODO(jshin): Drop this special-casing once ICU's regular case conversion
784 // API supports Greek uppercasing. See
785 // http://bugs.icu-project.org/trac/ticket/10582 .
786 // In the meantime, if there's no Greek character in |s|, call this
787 // function again with the root locale (lang="").
788 // ICU's C API for transliteration is nasty and we just use C++ API.
789 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {
790 icu::UnicodeString converted;
791 {
792 DisallowHeapAllocation no_gc;
793 String::FlatContent flat = s->GetFlatContent();
794 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));
Yang 2016/04/29 07:38:47 This seems wrong. In case of two-byte string, we d
jungshik at Google 2016/04/29 18:03:23 UnicodeString's |setTo(src, src_length| does copy
795 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);
796 // Starts with the source string (copied) and will be modified to contain
797 // the converted result. Note that there's no benefit in using
798 // read-aliasing |setTo| (3 argument version) because the buffer is copied
799 // anyway upon transliteration.
800 converted.setTo(src, src_length);
801 ConvertCaseWithTransliterator(&converted, "el-Upper");
802 }
803 Handle<String> result;
804 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
805 isolate, result,
806 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
807 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
808 converted.length())));
809 return *result;
810 }
811
812 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
813
814 int32_t dest_length = src_length;
815 UErrorCode status;
816 Handle<SeqTwoByteString> result;
817 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));
818
819 // This is not a real loop. It'll be executed only once (no overflow) or
820 // twice (overflow).
821 for (int i = 0; i < 2; ++i) {
822 result =
823 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();
824 DisallowHeapAllocation no_gc;
825 String::FlatContent flat = s->GetFlatContent();
826 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);
827 status = U_ZERO_ERROR;
828 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
829 dest_length, src, src_length, lang, &status);
830 if (status != U_BUFFER_OVERFLOW_ERROR) break;
831 }
832
833 // In most cases, the output will fill the destination buffer completely
834 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
835 // Only in rare cases, it'll be shorter than the destination buffer and
836 // |result| has to be truncated.
837 DCHECK(U_SUCCESS(status));
838 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
839 DCHECK(dest_length == result->length());
840 return *result;
841 }
842 if (U_SUCCESS(status)) {
843 DCHECK(dest_length < result->length());
844 return *Handle<SeqTwoByteString>::cast(
845 SeqString::Truncate(result, dest_length));
846 }
847 return *s;
848 }
849
// Returns true iff |ch| is one of the ASCII letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  // Values below 'A' wrap around to large numbers after the subtraction, so a
  // single unsigned comparison covers both ends of the range.
  return static_cast<uint16_t>(ch - 'A') <= static_cast<uint16_t>('Z' - 'A');
}
851
// Lower-case mapping for every Latin-1 code unit, indexed by the code unit.
// Each row holds the 16 entries whose index starts at the value in the
// trailing comment. 'A'..'Z' and 0xC0..0xDE (except 0xD7) map to the value
// 0x20 above them; every other entry maps to itself.
const uint8_t kToLower[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  // 0x00
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,  // 0x10
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,  // 0x20
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,  // 0x30
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x40
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,  // 0x50
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x60
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,  // 0x70
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,  // 0x80
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,  // 0x90
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,  // 0xA0
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,  // 0xB0
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xC0
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,  // 0xD0
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xE0
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,  // 0xF0
};
876
877 inline uint16_t ToLatin1Lower(uint16_t ch) {
878 return static_cast<uint16_t>(kToLower[ch]);
879 }
880
// Upper-cases the ASCII letters 'a'..'z'; any other value is returned as is.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  if (!is_ascii_lower) return ch;
  // Lower- and upper-case ASCII letters are exactly 0x20 apart.
  return static_cast<uint16_t>(ch - ('a' - 'A'));
}
884
// Returns the upper-case form of the Latin-1 code unit |ch|.
// Does not work for U+00DF (sharp-s), U+00B5 (micro sign), U+00FF: their
// upper-case forms are either two characters or outside Latin-1, so callers
// have to handle them separately.
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  // Lower-case letters differ from their upper-case forms only in bit 5
  // (0x20). That holds for 'a'..'z' and for 0xE0..0xFE, with one exception:
  // U+00F7 (division sign) is not a letter and must be left alone, otherwise
  // it would turn into U+00D7 (multiplication sign).
  return ch &
         ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
           << 5);
}
892
893 template <typename Char>
894 bool ToUpperFastASCII(const Vector<const Char>& src,
895 Handle<SeqOneByteString> result) {
896 // Do a faster loop for the case where all the characters are ASCII.
897 uint16_t ored = 0;
898 int32_t index = 0;
899 for (auto it = src.begin(); it != src.end(); ++it) {
900 uint16_t ch = static_cast<uint16_t>(*it);
901 ored |= ch;
902 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
903 }
904 return !(ored & ~0x7F);
905 }
906
// U+00DF, the lower-case sharp-s. It needs special care when upper-casing
// because it converts to the two characters "SS".
const uint16_t sharp_s = 0x00DF;
908
909 template <typename Char>
910 bool ToUpperOneByte(const Vector<const Char>& src,
911 Handle<SeqOneByteString> result, int* sharp_s_count) {
912 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
913
914 // There are two special cases.
915 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
916 // 2. Lower case sharp-S converts to "SS" (two characters)
917 *sharp_s_count = 0;
918 int32_t index = 0;
919 for (auto it = src.begin(); it != src.end(); ++it) {
920 uint16_t ch = static_cast<uint16_t>(*it);
921 if (V8_UNLIKELY(ch == sharp_s)) {
922 ++(*sharp_s_count);
923 continue;
924 }
925 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
926 // Since this upper-cased character does not fit in an 8-bit string, we
927 // need to take the 16-bit path.
928 return false;
929 }
930 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
931 }
932
933 return true;
934 }
935
936 template <typename Char>
937 void ToUpperWithSharpS(const Vector<const Char>& src,
938 Handle<SeqOneByteString> result) {
939 int32_t dest_index = 0;
940 for (auto it = src.begin(); it != src.end(); ++it) {
941 uint16_t ch = static_cast<uint16_t>(*it);
942 if (ch == sharp_s) {
943 result->SeqOneByteStringSet(dest_index++, 'S');
944 result->SeqOneByteStringSet(dest_index++, 'S');
945 } else {
946 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
947 }
948 }
949 }
950
951 } // namespace
952
953 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
954 HandleScope scope(isolate);
955 DCHECK_EQ(args.length(), 1);
956 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
957
958 int length = s->length();
959 s = String::Flatten(s);
960 // First scan the string for uppercase and non-ASCII characters:
961 if (s->HasOnlyOneByteChars()) {
962 unsigned first_index_to_lower = length;
963 for (int index = 0; index < length; ++index) {
964 // Blink specializes this path for one-byte strings, so it
965 // does not need to do a generic get, but can do the equivalent
966 // of SeqOneByteStringGet.
967 uint16_t ch = s->Get(index);
968 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
969 first_index_to_lower = index;
970 break;
971 }
972 }
973
974 // Nothing to do if the string is all ASCII with no uppercase.
975 if (first_index_to_lower == length) return *s;
976
977 // We depend here on the invariant that the length of a Latin1
978 // string is invariant under ToLowerCase, and the result always
979 // fits in the Latin1 range in the *root locale*. It does not hold
980 // for ToUpperCase even in the root locale.
981 Handle<SeqOneByteString> result;
982 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
983 isolate, result, isolate->factory()->NewRawOneByteString(length));
984
985 DisallowHeapAllocation no_gc;
986 String::FlatContent flat = s->GetFlatContent();
987 if (flat.IsOneByte()) {
988 const uint8_t* src = flat.ToOneByteVector().start();
989 CopyChars(result->GetChars(), src, first_index_to_lower);
990 for (int index = first_index_to_lower; index < length; ++index) {
991 uint16_t ch = static_cast<uint16_t>(src[index]);
992 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
993 }
994 } else {
995 const uint16_t* src = flat.ToUC16Vector().start();
996 CopyChars(result->GetChars(), src, first_index_to_lower);
997 for (int index = first_index_to_lower; index < length; ++index) {
998 uint16_t ch = src[index];
999 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1000 }
1001 }
1002
1003 return *result;
1004 }
1005
1006 // Blink had an additional case here for ASCII 2-byte strings, but
1007 // that is subsumed by the above code (assuming there isn't a false
1008 // negative for HasOnlyOneByteChars).
1009
1010 // Do a slower implementation for cases that include non-ASCII characters.
1011 return LocaleConvertCase(s, isolate, false, "");
1012 }
1013
1014 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1015 HandleScope scope(isolate);
1016 DCHECK_EQ(args.length(), 1);
1017 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1018
1019 // This function could be optimized for no-op cases the way lowercase
1020 // counterpart is, but in empirical testing, few actual calls to upper()
1021 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1022
1023 int32_t length = s->length();
1024 s = String::Flatten(s);
1025
1026 if (s->HasOnlyOneByteChars()) {
1027 Handle<SeqOneByteString> result;
1028 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1029 isolate, result, isolate->factory()->NewRawOneByteString(length));
1030
1031 int sharp_s_count;
1032 bool is_result_single_byte;
1033 {
1034 DisallowHeapAllocation no_gc;
1035 String::FlatContent flat = s->GetFlatContent();
1036 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
1037 // could be removed because ToUpperOneByte is pretty fast now (it
1038 // does not call ICU API any more.).
1039 if (flat.IsOneByte()) {
1040 Vector<const uint8_t> src = flat.ToOneByteVector();
1041 if (ToUpperFastASCII(src, result)) return *result;
1042 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1043 } else {
1044 DCHECK(flat.IsTwoByte());
1045 Vector<const uint16_t> src = flat.ToUC16Vector();
1046 if (ToUpperFastASCII(src, result)) return *result;
1047 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1048 }
1049 }
1050
1051 // Go to the full Unicode path if there are characters whose uppercase
1052 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1053 if (V8_UNLIKELY(!is_result_single_byte)) {
1054 return LocaleConvertCase(s, isolate, true, "");
1055 }
1056
1057 if (sharp_s_count == 0) return *result;
1058
1059 // We have sharp_s_count sharp-s characters, but the result is still
1060 // in the Latin-1 range.
1061 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1062 isolate, result,
1063 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1064 DisallowHeapAllocation no_gc;
1065 String::FlatContent flat = s->GetFlatContent();
1066 if (flat.IsOneByte()) {
1067 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1068 } else {
1069 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1070 }
1071
1072 return *result;
1073 }
1074
1075 return LocaleConvertCase(s, isolate, true, "");
1076 }
1077
1078 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {
1079 HandleScope scope(isolate);
1080 DCHECK_EQ(args.length(), 3);
1081 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1082 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);
1083 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);
1084
1085 // All the languages requiring special handling ("az", "el", "lt", "tr")
1086 // have a 2-letter language code.
1087 DCHECK(lang->length() == 2);
1088 uint8_t lang_str[3];
1089 memcpy(lang_str, lang->GetChars(), 2);
1090 lang_str[2] = 0;
1091 s = String::Flatten(s);
1092 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath
1093 // in the root locale needs to be adjusted for az, lt and tr because even case
1094 // mapping of ASCII range characters are different in those locales.
1095 // Greek (el) does not require any adjustment, though.
1096 return LocaleConvertCase(s, isolate, is_upper,
1097 reinterpret_cast<const char*>(lang_str));
1098 }
1099
752 } // namespace internal 1100 } // namespace internal
753 } // namespace v8 1101 } // namespace v8
754 1102
755 #endif // V8_I18N_SUPPORT 1103 #endif // V8_I18N_SUPPORT
OLDNEW

Powered by Google App Engine
This is Rietveld 408576698