Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(449)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: Address Yang's comment Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include "src/api.h" 9 #include "src/api.h"
10 #include "src/api-natives.h" 10 #include "src/api-natives.h"
(...skipping 11 matching lines...) Expand all
22 #include "unicode/dcfmtsym.h" 22 #include "unicode/dcfmtsym.h"
23 #include "unicode/decimfmt.h" 23 #include "unicode/decimfmt.h"
24 #include "unicode/dtfmtsym.h" 24 #include "unicode/dtfmtsym.h"
25 #include "unicode/dtptngen.h" 25 #include "unicode/dtptngen.h"
26 #include "unicode/locid.h" 26 #include "unicode/locid.h"
27 #include "unicode/numfmt.h" 27 #include "unicode/numfmt.h"
28 #include "unicode/numsys.h" 28 #include "unicode/numsys.h"
29 #include "unicode/rbbi.h" 29 #include "unicode/rbbi.h"
30 #include "unicode/smpdtfmt.h" 30 #include "unicode/smpdtfmt.h"
31 #include "unicode/timezone.h" 31 #include "unicode/timezone.h"
32 #include "unicode/translit.h"
32 #include "unicode/uchar.h" 33 #include "unicode/uchar.h"
33 #include "unicode/ucol.h" 34 #include "unicode/ucol.h"
34 #include "unicode/ucurr.h" 35 #include "unicode/ucurr.h"
35 #include "unicode/uloc.h" 36 #include "unicode/uloc.h"
37 #include "unicode/unistr.h"
36 #include "unicode/unum.h" 38 #include "unicode/unum.h"
37 #include "unicode/uversion.h" 39 #include "unicode/uversion.h"
38 40
39 41
40 namespace v8 { 42 namespace v8 {
41 namespace internal { 43 namespace internal {
42 44
43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) { 45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {
44 HandleScope scope(isolate); 46 HandleScope scope(isolate);
45 Factory* factory = isolate->factory(); 47 Factory* factory = isolate->factory();
(...skipping 696 matching lines...) Expand 10 before | Expand all | Expand 10 after
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) { 744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {
743 return *isolate->factory()->NewStringFromStaticChars("letter"); 745 return *isolate->factory()->NewStringFromStaticChars("letter");
744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) { 746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {
745 return *isolate->factory()->NewStringFromStaticChars("kana"); 747 return *isolate->factory()->NewStringFromStaticChars("kana");
746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) { 748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {
747 return *isolate->factory()->NewStringFromStaticChars("ideo"); 749 return *isolate->factory()->NewStringFromStaticChars("ideo");
748 } else { 750 } else {
749 return *isolate->factory()->NewStringFromStaticChars("unknown"); 751 return *isolate->factory()->NewStringFromStaticChars("unknown");
750 } 752 }
751 } 753 }
754
755 namespace {
756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,
757 const char* transliterator_id) {
758 UErrorCode status = U_ZERO_ERROR;
759 base::SmartPointer<icu::Transliterator> translit(
760 icu::Transliterator::createInstance(
761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,
762 status));
763 if (U_FAILURE(status)) return;
764 translit->transliterate(*input);
765 }
766
767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat,
768 base::SmartArrayPointer<uc16>* dest,
769 int32_t length) {
770 DCHECK(flat.IsFlat());
771 if (flat.IsOneByte()) {
772 if (dest->is_empty()) {
773 dest->Reset(NewArray<uc16>(length));
774 CopyChars(dest->get(), flat.ToOneByteVector().start(), length);
775 }
776 return reinterpret_cast<const UChar*>(dest->get());
777 } else {
778 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
779 }
780 }
781
782 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
783 bool is_to_upper, const char* lang) {
784 int32_t src_length = s->length();
785
786 // Greek uppercasing has to be done via transliteration.
787 // TODO(jshin): Drop this special-casing once ICU's regular case conversion
788 // API supports Greek uppercasing. See
789 // http://bugs.icu-project.org/trac/ticket/10582 .
790 // In the meantime, if there's no Greek character in |s|, call this
791 // function again with the root locale (lang="").
792 // ICU's C API for transliteration is nasty and we just use C++ API.
793 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {
794 icu::UnicodeString converted;
795 base::SmartArrayPointer<uc16> sap;
796 {
797 DisallowHeapAllocation no_gc;
798 String::FlatContent flat = s->GetFlatContent();
799 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
800 // Starts with the source string (read-only alias with copy-on-write
801 // semantics) and will be modified to contain the converted result.
802 // Using read-only alias at first saves one copy operation if
803 // transliteration does not change the input, which is rather rare.
804 // Moreover, transliteration takes rather long so that saving one copy
805 // helps only a little bit.
806 converted.setTo(false, src, src_length);
807 ConvertCaseWithTransliterator(&converted, "el-Upper");
Yang 2016/05/11 08:42:30 So... if ConvertCaseWithTransliterator does not ch
jungshik at Google 2016/05/11 18:10:08 Done.
808 }
809 Handle<String> result;
810 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
811 isolate, result,
812 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
813 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
814 converted.length())));
815 return *result;
816 }
817
818 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
819
820 int32_t dest_length = src_length;
821 UErrorCode status;
822 Handle<SeqTwoByteString> result;
823 base::SmartArrayPointer<uc16> sap;
824
825 // This is not a real loop. It'll be executed only once (no overflow) or
826 // twice (overflow).
827 for (int i = 0; i < 2; ++i) {
828 result =
829 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();
830 DisallowHeapAllocation no_gc;
831 String::FlatContent flat = s->GetFlatContent();
832 const UChar* src = GetUCharBufferFromFlat(flat, &sap, src_length);
833 status = U_ZERO_ERROR;
834 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
835 dest_length, src, src_length, lang, &status);
836 if (status != U_BUFFER_OVERFLOW_ERROR) break;
837 }
838
839 // In most cases, the output will fill the destination buffer completely
840 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
841 // Only in rare cases, it'll be shorter than the destination buffer and
842 // |result| has to be truncated.
843 DCHECK(U_SUCCESS(status));
844 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
845 DCHECK(dest_length == result->length());
846 return *result;
847 }
848 if (U_SUCCESS(status)) {
849 DCHECK(dest_length < result->length());
850 return *Handle<SeqTwoByteString>::cast(
851 SeqString::Truncate(result, dest_length));
852 }
853 return *s;
854 }
855
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  return 'A' <= ch && ch <= 'Z';
}
857
// Lower-case mapping for every Latin-1 code point, indexed by the code point.
// 'A'..'Z' and 0xC0..0xDE (except 0xD7) map to the value 0x20 above them;
// every other entry maps to itself. One row per 16 code points.
const uint8_t kToLower[256] = {
    // 0x00 - 0x0F
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
    // 0x10 - 0x1F
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
    // 0x20 - 0x2F
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
    // 0x30 - 0x3F
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,
    // 0x40 - 0x4F: '@', then 'A'..'O' -> 'a'..'o'
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    // 0x50 - 0x5F: 'P'..'Z' -> 'p'..'z', then punctuation
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,
    // 0x60 - 0x6F
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,
    // 0x70 - 0x7F
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,
    // 0x80 - 0x8F
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,
    // 0x90 - 0x9F
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,
    // 0xA0 - 0xAF
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
    // 0xB0 - 0xBF
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
    // 0xC0 - 0xCF -> 0xE0 - 0xEF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    // 0xD0 - 0xDF -> 0xF0 - 0xFE, except 0xD7 and 0xDF which map to themselves
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,
    // 0xE0 - 0xEF
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,
    // 0xF0 - 0xFF
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,
};
882
883 inline uint16_t ToLatin1Lower(uint16_t ch) {
884 return static_cast<uint16_t>(kToLower[ch]);
885 }
886
// Upper-cases |ch| if it is an ASCII letter 'a'..'z' by clearing bit 5
// (0x20); any other value is returned unchanged. Branch-free.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const int is_ascii_lower = (ch >= 'a' && ch <= 'z') ? 1 : 0;
  const int bit_to_clear = is_ascii_lower << 5;
  return ch & ~bit_to_clear;
}
890
// Upper-cases the Latin-1 character |ch| by clearing bit 5 (0x20) when |ch|
// is 'a'..'z' or one of the lower-case letters in 0xE0..0xFE. U+00F7
// (division sign) lies in that range but is not a letter, so it is returned
// unchanged. Any other value is returned unchanged as well.
//
// Does not work for U+00DF (sharp-s), U+00B5 (micro sign) and U+00FF, whose
// upper-case forms are not single Latin-1 characters; callers must handle
// those three before calling.
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  return ch &
         ~(((ch >= 'a' && ch <= 'z') || (((ch & 0xE0) == 0xE0) && ch != 0xF7))
           << 5);
}
898
899 template <typename Char>
900 bool ToUpperFastASCII(const Vector<const Char>& src,
901 Handle<SeqOneByteString> result) {
902 // Do a faster loop for the case where all the characters are ASCII.
903 uint16_t ored = 0;
904 int32_t index = 0;
905 for (auto it = src.begin(); it != src.end(); ++it) {
906 uint16_t ch = static_cast<uint16_t>(*it);
907 ored |= ch;
908 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
909 }
910 return !(ored & ~0x7F);
911 }
912
// U+00DF LATIN SMALL LETTER SHARP S; it upper-cases to the two letters "SS".
const uint16_t sharp_s = 0x00DF;
914
915 template <typename Char>
916 bool ToUpperOneByte(const Vector<const Char>& src,
917 Handle<SeqOneByteString> result, int* sharp_s_count) {
918 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
919
920 // There are two special cases.
921 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
922 // 2. Lower case sharp-S converts to "SS" (two characters)
923 *sharp_s_count = 0;
924 int32_t index = 0;
925 for (auto it = src.begin(); it != src.end(); ++it) {
926 uint16_t ch = static_cast<uint16_t>(*it);
927 if (V8_UNLIKELY(ch == sharp_s)) {
928 ++(*sharp_s_count);
929 continue;
930 }
931 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
932 // Since this upper-cased character does not fit in an 8-bit string, we
933 // need to take the 16-bit path.
934 return false;
935 }
936 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
937 }
938
939 return true;
940 }
941
942 template <typename Char>
943 void ToUpperWithSharpS(const Vector<const Char>& src,
944 Handle<SeqOneByteString> result) {
945 int32_t dest_index = 0;
946 for (auto it = src.begin(); it != src.end(); ++it) {
947 uint16_t ch = static_cast<uint16_t>(*it);
948 if (ch == sharp_s) {
949 result->SeqOneByteStringSet(dest_index++, 'S');
950 result->SeqOneByteStringSet(dest_index++, 'S');
951 } else {
952 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
953 }
954 }
955 }
956
957 } // namespace
958
959 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
960 HandleScope scope(isolate);
961 DCHECK_EQ(args.length(), 1);
962 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
963
964 int length = s->length();
965 s = String::Flatten(s);
966 // First scan the string for uppercase and non-ASCII characters:
967 if (s->HasOnlyOneByteChars()) {
968 unsigned first_index_to_lower = length;
969 for (int index = 0; index < length; ++index) {
970 // Blink specializes this path for one-byte strings, so it
971 // does not need to do a generic get, but can do the equivalent
972 // of SeqOneByteStringGet.
973 uint16_t ch = s->Get(index);
974 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
975 first_index_to_lower = index;
976 break;
977 }
978 }
979
980 // Nothing to do if the string is all ASCII with no uppercase.
981 if (first_index_to_lower == length) return *s;
982
983 // We depend here on the invariant that the length of a Latin1
984 // string is invariant under ToLowerCase, and the result always
985 // fits in the Latin1 range in the *root locale*. It does not hold
986 // for ToUpperCase even in the root locale.
987 Handle<SeqOneByteString> result;
988 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
989 isolate, result, isolate->factory()->NewRawOneByteString(length));
990
991 DisallowHeapAllocation no_gc;
992 String::FlatContent flat = s->GetFlatContent();
993 if (flat.IsOneByte()) {
994 const uint8_t* src = flat.ToOneByteVector().start();
995 CopyChars(result->GetChars(), src, first_index_to_lower);
996 for (int index = first_index_to_lower; index < length; ++index) {
997 uint16_t ch = static_cast<uint16_t>(src[index]);
998 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
999 }
1000 } else {
1001 const uint16_t* src = flat.ToUC16Vector().start();
1002 CopyChars(result->GetChars(), src, first_index_to_lower);
1003 for (int index = first_index_to_lower; index < length; ++index) {
1004 uint16_t ch = src[index];
1005 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1006 }
1007 }
1008
1009 return *result;
1010 }
1011
1012 // Blink had an additional case here for ASCII 2-byte strings, but
1013 // that is subsumed by the above code (assuming there isn't a false
1014 // negative for HasOnlyOneByteChars).
1015
1016 // Do a slower implementation for cases that include non-ASCII characters.
1017 return LocaleConvertCase(s, isolate, false, "");
1018 }
1019
1020 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1021 HandleScope scope(isolate);
1022 DCHECK_EQ(args.length(), 1);
1023 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1024
1025 // This function could be optimized for no-op cases the way lowercase
1026 // counterpart is, but in empirical testing, few actual calls to upper()
1027 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1028
1029 int32_t length = s->length();
1030 s = String::Flatten(s);
1031
1032 if (s->HasOnlyOneByteChars()) {
1033 Handle<SeqOneByteString> result;
1034 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1035 isolate, result, isolate->factory()->NewRawOneByteString(length));
1036
1037 int sharp_s_count;
1038 bool is_result_single_byte;
1039 {
1040 DisallowHeapAllocation no_gc;
1041 String::FlatContent flat = s->GetFlatContent();
1042 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
1043 // could be removed because ToUpperOneByte is pretty fast now (it
1044 // does not call ICU API any more.).
1045 if (flat.IsOneByte()) {
1046 Vector<const uint8_t> src = flat.ToOneByteVector();
1047 if (ToUpperFastASCII(src, result)) return *result;
1048 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1049 } else {
1050 DCHECK(flat.IsTwoByte());
1051 Vector<const uint16_t> src = flat.ToUC16Vector();
1052 if (ToUpperFastASCII(src, result)) return *result;
1053 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1054 }
1055 }
1056
1057 // Go to the full Unicode path if there are characters whose uppercase
1058 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1059 if (V8_UNLIKELY(!is_result_single_byte)) {
1060 return LocaleConvertCase(s, isolate, true, "");
1061 }
1062
1063 if (sharp_s_count == 0) return *result;
1064
1065 // We have sharp_s_count sharp-s characters, but the result is still
1066 // in the Latin-1 range.
1067 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1068 isolate, result,
1069 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1070 DisallowHeapAllocation no_gc;
1071 String::FlatContent flat = s->GetFlatContent();
1072 if (flat.IsOneByte()) {
1073 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1074 } else {
1075 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1076 }
1077
1078 return *result;
1079 }
1080
1081 return LocaleConvertCase(s, isolate, true, "");
1082 }
1083
1084 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {
1085 HandleScope scope(isolate);
1086 DCHECK_EQ(args.length(), 3);
1087 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1088 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);
1089 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);
1090
1091 // All the languages requiring special handling ("az", "el", "lt", "tr")
1092 // have a 2-letter language code.
1093 DCHECK(lang->length() == 2);
1094 uint8_t lang_str[3];
1095 memcpy(lang_str, lang->GetChars(), 2);
1096 lang_str[2] = 0;
1097 s = String::Flatten(s);
1098 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath
1099 // in the root locale needs to be adjusted for az, lt and tr because even case
1100 // mapping of ASCII range characters are different in those locales.
1101 // Greek (el) does not require any adjustment, though.
1102 return LocaleConvertCase(s, isolate, is_upper,
1103 reinterpret_cast<const char*>(lang_str));
1104 }
1105
752 } // namespace internal 1106 } // namespace internal
753 } // namespace v8 1107 } // namespace v8
754 1108
755 #endif // V8_I18N_SUPPORT 1109 #endif // V8_I18N_SUPPORT
OLDNEW
« no previous file with comments | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698