Chromium Code Reviews
chromiumcodereview-hr@appspot.gserviceaccount.com (chromiumcodereview-hr) | Please choose your nickname with Settings | Help | Chromium Project | Gerrit Changes | Sign out
(480)

Side by Side Diff: src/runtime/runtime-i18n.cc

Issue 1812673005: Use ICU case conversion/transliterator for case conversion behind a flag (Closed) Base URL: https://chromium.googlesource.com/v8/v8.git@master
Patch Set: windows compile fix: use reinterpret_cast for uc16* => UChar* Created 4 years, 7 months ago
Use n/p to move between diff chunks; N/P to move between comments. Draft comments are only viewable by you.
Jump to:
View unified diff | Download patch
« no previous file with comments | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »
Toggle Intra-line Diffs ('i') | Expand Comments ('e') | Collapse Comments ('c') | Show Comments Hide Comments ('s')
OLDNEW
1 // Copyright 2014 the V8 project authors. All rights reserved. 1 // Copyright 2014 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be 2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file. 3 // found in the LICENSE file.
4 4
5 5
6 #ifdef V8_I18N_SUPPORT 6 #ifdef V8_I18N_SUPPORT
7 #include "src/runtime/runtime-utils.h" 7 #include "src/runtime/runtime-utils.h"
8 8
9 #include "src/api.h" 9 #include "src/api.h"
10 #include "src/api-natives.h" 10 #include "src/api-natives.h"
(...skipping 11 matching lines...) Expand all
22 #include "unicode/dcfmtsym.h" 22 #include "unicode/dcfmtsym.h"
23 #include "unicode/decimfmt.h" 23 #include "unicode/decimfmt.h"
24 #include "unicode/dtfmtsym.h" 24 #include "unicode/dtfmtsym.h"
25 #include "unicode/dtptngen.h" 25 #include "unicode/dtptngen.h"
26 #include "unicode/locid.h" 26 #include "unicode/locid.h"
27 #include "unicode/numfmt.h" 27 #include "unicode/numfmt.h"
28 #include "unicode/numsys.h" 28 #include "unicode/numsys.h"
29 #include "unicode/rbbi.h" 29 #include "unicode/rbbi.h"
30 #include "unicode/smpdtfmt.h" 30 #include "unicode/smpdtfmt.h"
31 #include "unicode/timezone.h" 31 #include "unicode/timezone.h"
32 #include "unicode/translit.h"
32 #include "unicode/uchar.h" 33 #include "unicode/uchar.h"
33 #include "unicode/ucol.h" 34 #include "unicode/ucol.h"
34 #include "unicode/ucurr.h" 35 #include "unicode/ucurr.h"
35 #include "unicode/uloc.h" 36 #include "unicode/uloc.h"
37 #include "unicode/unistr.h"
36 #include "unicode/unum.h" 38 #include "unicode/unum.h"
37 #include "unicode/uversion.h" 39 #include "unicode/uversion.h"
38 40
39 41
40 namespace v8 { 42 namespace v8 {
41 namespace internal { 43 namespace internal {
42 44
43 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) { 45 RUNTIME_FUNCTION(Runtime_CanonicalizeLanguageTag) {
44 HandleScope scope(isolate); 46 HandleScope scope(isolate);
45 Factory* factory = isolate->factory(); 47 Factory* factory = isolate->factory();
(...skipping 696 matching lines...) Expand 10 before | Expand all | Expand 10 after
742 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) { 744 } else if (status >= UBRK_WORD_LETTER && status < UBRK_WORD_LETTER_LIMIT) {
743 return *isolate->factory()->NewStringFromStaticChars("letter"); 745 return *isolate->factory()->NewStringFromStaticChars("letter");
744 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) { 746 } else if (status >= UBRK_WORD_KANA && status < UBRK_WORD_KANA_LIMIT) {
745 return *isolate->factory()->NewStringFromStaticChars("kana"); 747 return *isolate->factory()->NewStringFromStaticChars("kana");
746 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) { 748 } else if (status >= UBRK_WORD_IDEO && status < UBRK_WORD_IDEO_LIMIT) {
747 return *isolate->factory()->NewStringFromStaticChars("ideo"); 749 return *isolate->factory()->NewStringFromStaticChars("ideo");
748 } else { 750 } else {
749 return *isolate->factory()->NewStringFromStaticChars("unknown"); 751 return *isolate->factory()->NewStringFromStaticChars("unknown");
750 } 752 }
751 } 753 }
754
755 namespace {
756 void ConvertCaseWithTransliterator(icu::UnicodeString* input,
757 const char* transliterator_id) {
758 UErrorCode status = U_ZERO_ERROR;
759 base::SmartPointer<icu::Transliterator> translit(
760 icu::Transliterator::createInstance(
761 icu::UnicodeString(transliterator_id, -1, US_INV), UTRANS_FORWARD,
762 status));
763 if (U_FAILURE(status)) return;
764 translit->transliterate(*input);
765 }
766
767 const UChar* GetUCharBufferFromFlat(const String::FlatContent& flat, uc16* dest,
768 int32_t length) {
769 DCHECK(flat.IsFlat());
770 if (flat.IsOneByte()) {
771 CopyChars(dest, flat.ToOneByteVector().start(), length);
772 return reinterpret_cast<const UChar*>(dest);
773 } else {
774 return reinterpret_cast<const UChar*>(flat.ToUC16Vector().start());
775 }
776 }
777
778 MUST_USE_RESULT Object* LocaleConvertCase(Handle<String> s, Isolate* isolate,
779 bool is_to_upper, const char* lang) {
780 int32_t src_length = s->length();
781
782 // Greek uppercasing has to be done via transliteration.
783 // TODO(jshin): Drop this special-casing once ICU's regular case conversion
784 // API supports Greek uppercasing. See
785 // http://bugs.icu-project.org/trac/ticket/10582 .
786 // In the meantime, if there's no Greek character in |s|, call this
787 // function again with the root locale (lang="").
788 // ICU's C API for transliteration is nasty and we just use C++ API.
789 if (V8_UNLIKELY(is_to_upper && lang[0] == 'e' && lang[1] == 'l')) {
790 icu::UnicodeString converted;
791 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));
792 {
793 DisallowHeapAllocation no_gc;
794 String::FlatContent flat = s->GetFlatContent();
795 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);
796 // Starts with the source string (read-only alias with copy-on-write
797 // semantics) and will be modified to contain the converted result.
798 // Using read-only alias at first saves one copy operation if
799 // transliteration does not change the input, which is rather rare.
800 // Moreover, transliteration takes rather long so that saving one copy
801 // helps only a little bit.
802 converted.setTo(false, src, src_length);
jungshik at Google 2016/04/29 23:41:43 Yang, https://docs.google.com/spreadsheets/d/19F
803 ConvertCaseWithTransliterator(&converted, "el-Upper");
804 }
805 Handle<String> result;
806 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
807 isolate, result,
808 isolate->factory()->NewStringFromTwoByte(Vector<const uint16_t>(
809 reinterpret_cast<const uint16_t*>(converted.getBuffer()),
810 converted.length())));
811 return *result;
812 }
813
814 auto case_converter = is_to_upper ? u_strToUpper : u_strToLower;
815
816 int32_t dest_length = src_length;
817 UErrorCode status;
818 Handle<SeqTwoByteString> result;
819 base::SmartArrayPointer<uc16> sap(NewArray<uc16>(src_length));
820
821 // This is not a real loop. It'll be executed only once (no overflow) or
822 // twice (overflow).
823 for (int i = 0; i < 2; ++i) {
824 result =
825 isolate->factory()->NewRawTwoByteString(dest_length).ToHandleChecked();
826 DisallowHeapAllocation no_gc;
827 String::FlatContent flat = s->GetFlatContent();
828 const UChar* src = GetUCharBufferFromFlat(flat, sap.get(), src_length);
829 status = U_ZERO_ERROR;
830 dest_length = case_converter(reinterpret_cast<UChar*>(result->GetChars()),
831 dest_length, src, src_length, lang, &status);
832 if (status != U_BUFFER_OVERFLOW_ERROR) break;
833 }
834
835 // In most cases, the output will fill the destination buffer completely
836 // leading to an unterminated string (U_STRING_NOT_TERMINATED_WARNING).
837 // Only in rare cases, it'll be shorter than the destination buffer and
838 // |result| has to be truncated.
839 DCHECK(U_SUCCESS(status));
840 if (V8_LIKELY(status == U_STRING_NOT_TERMINATED_WARNING)) {
841 DCHECK(dest_length == result->length());
842 return *result;
843 }
844 if (U_SUCCESS(status)) {
845 DCHECK(dest_length < result->length());
846 return *Handle<SeqTwoByteString>::cast(
847 SeqString::Truncate(result, dest_length));
848 }
849 return *s;
850 }
851
// Returns true iff |ch| is one of the ASCII capital letters 'A'..'Z'.
inline bool IsASCIIUpper(uint16_t ch) {
  if (ch < 'A') return false;
  return ch <= 'Z';
}
853
// Lower-casing table for Latin-1: kToLower[c] is the lower-case form of the
// code unit c (0x00..0xFF). 'A'..'Z' and U+00C0..U+00DE map to the value
// 0x20 higher, except U+00D7 which maps to itself; every other entry maps to
// itself. Each row below holds eight consecutive entries.
const uint8_t kToLower[256] = {
    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,  // 0x00
    0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,  // 0x08
    0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,  // 0x10
    0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,  // 0x18
    0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27,  // 0x20
    0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,  // 0x28
    0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,  // 0x30
    0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, 0x3F,  // 0x38
    0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  // 0x40
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x48
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,  // 0x50
    0x78, 0x79, 0x7A, 0x5B, 0x5C, 0x5D, 0x5E, 0x5F,  // 0x58
    0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,  // 0x60
    0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F,  // 0x68
    0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,  // 0x70
    0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, 0x7F,  // 0x78
    0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,  // 0x80
    0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x8D, 0x8E, 0x8F,  // 0x88
    0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,  // 0x90
    0x98, 0x99, 0x9A, 0x9B, 0x9C, 0x9D, 0x9E, 0x9F,  // 0x98
    0xA0, 0xA1, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,  // 0xA0
    0xA8, 0xA9, 0xAA, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,  // 0xA8
    0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7,  // 0xB0
    0xB8, 0xB9, 0xBA, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,  // 0xB8
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,  // 0xC0
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xC8
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xD7,  // 0xD0
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xDF,  // 0xD8
    0xE0, 0xE1, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,  // 0xE0
    0xE8, 0xE9, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF,  // 0xE8
    0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7,  // 0xF0
    0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF,  // 0xF8
};

// Lower-cases the Latin-1 code unit |ch| by table lookup. |ch| indexes the
// 256-entry kToLower table directly, so it must not exceed 0xFF.
inline uint16_t ToLatin1Lower(uint16_t ch) {
  const uint8_t lowered = kToLower[ch];
  return lowered;
}
882
// Upper-cases |ch| if it is an ASCII lower-case letter ('a'..'z') by clearing
// bit 5 (0x20); any other value is returned unchanged.
inline uint16_t ToASCIIUpper(uint16_t ch) {
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  return is_ascii_lower ? (ch & ~0x20) : ch;
}
886
// Upper-cases the Latin-1 code unit |ch| by clearing bit 5 (0x20) when |ch|
// is an ASCII lower-case letter ('a'..'z') or lies in U+00E0..U+00FE.
// U+00F7 (division sign) is excluded because it is not a letter and must map
// to itself; in particular U+00E7 (c with cedilla) maps to U+00C7.
// Does not work for U+00DF (sharp-s), U+00B5 (micron), U+00FF: callers must
// handle those three before calling this function (see the DCHECK).
inline uint16_t ToLatin1Upper(uint16_t ch) {
  DCHECK(ch != 0xDF && ch != 0xB5 && ch != 0xFF);
  const bool is_ascii_lower = ch >= 'a' && ch <= 'z';
  const bool is_latin1_lower = ((ch & 0xE0) == 0xE0) && ch != 0xF7;
  return ch & ~((is_ascii_lower || is_latin1_lower) << 5);
}
894
895 template <typename Char>
896 bool ToUpperFastASCII(const Vector<const Char>& src,
897 Handle<SeqOneByteString> result) {
898 // Do a faster loop for the case where all the characters are ASCII.
899 uint16_t ored = 0;
900 int32_t index = 0;
901 for (auto it = src.begin(); it != src.end(); ++it) {
902 uint16_t ch = static_cast<uint16_t>(*it);
903 ored |= ch;
904 result->SeqOneByteStringSet(index++, ToASCIIUpper(ch));
905 }
906 return !(ored & ~0x7F);
907 }
908
909 const uint16_t sharp_s = 0xDF;
910
911 template <typename Char>
912 bool ToUpperOneByte(const Vector<const Char>& src,
913 Handle<SeqOneByteString> result, int* sharp_s_count) {
914 // Still pretty-fast path for the input with non-ASCII Latin-1 characters.
915
916 // There are two special cases.
917 // 1. U+00B5 and U+00FF are mapped to a character beyond U+00FF.
918 // 2. Lower case sharp-S converts to "SS" (two characters)
919 *sharp_s_count = 0;
920 int32_t index = 0;
921 for (auto it = src.begin(); it != src.end(); ++it) {
922 uint16_t ch = static_cast<uint16_t>(*it);
923 if (V8_UNLIKELY(ch == sharp_s)) {
924 ++(*sharp_s_count);
925 continue;
926 }
927 if (V8_UNLIKELY(ch == 0xB5 || ch == 0xFF)) {
928 // Since this upper-cased character does not fit in an 8-bit string, we
929 // need to take the 16-bit path.
930 return false;
931 }
932 result->SeqOneByteStringSet(index++, ToLatin1Upper(ch));
933 }
934
935 return true;
936 }
937
938 template <typename Char>
939 void ToUpperWithSharpS(const Vector<const Char>& src,
940 Handle<SeqOneByteString> result) {
941 int32_t dest_index = 0;
942 for (auto it = src.begin(); it != src.end(); ++it) {
943 uint16_t ch = static_cast<uint16_t>(*it);
944 if (ch == sharp_s) {
945 result->SeqOneByteStringSet(dest_index++, 'S');
946 result->SeqOneByteStringSet(dest_index++, 'S');
947 } else {
948 result->SeqOneByteStringSet(dest_index++, ToLatin1Upper(ch));
949 }
950 }
951 }
952
953 } // namespace
954
955 RUNTIME_FUNCTION(Runtime_StringToLowerCaseI18N) {
956 HandleScope scope(isolate);
957 DCHECK_EQ(args.length(), 1);
958 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
959
960 int length = s->length();
961 s = String::Flatten(s);
962 // First scan the string for uppercase and non-ASCII characters:
963 if (s->HasOnlyOneByteChars()) {
964 unsigned first_index_to_lower = length;
965 for (int index = 0; index < length; ++index) {
966 // Blink specializes this path for one-byte strings, so it
967 // does not need to do a generic get, but can do the equivalent
968 // of SeqOneByteStringGet.
969 uint16_t ch = s->Get(index);
970 if (V8_UNLIKELY(IsASCIIUpper(ch) || ch & ~0x7F)) {
971 first_index_to_lower = index;
972 break;
973 }
974 }
975
976 // Nothing to do if the string is all ASCII with no uppercase.
977 if (first_index_to_lower == length) return *s;
978
979 // We depend here on the invariant that the length of a Latin1
980 // string is invariant under ToLowerCase, and the result always
981 // fits in the Latin1 range in the *root locale*. It does not hold
982 // for ToUpperCase even in the root locale.
983 Handle<SeqOneByteString> result;
984 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
985 isolate, result, isolate->factory()->NewRawOneByteString(length));
986
987 DisallowHeapAllocation no_gc;
988 String::FlatContent flat = s->GetFlatContent();
989 if (flat.IsOneByte()) {
990 const uint8_t* src = flat.ToOneByteVector().start();
991 CopyChars(result->GetChars(), src, first_index_to_lower);
992 for (int index = first_index_to_lower; index < length; ++index) {
993 uint16_t ch = static_cast<uint16_t>(src[index]);
994 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
995 }
996 } else {
997 const uint16_t* src = flat.ToUC16Vector().start();
998 CopyChars(result->GetChars(), src, first_index_to_lower);
999 for (int index = first_index_to_lower; index < length; ++index) {
1000 uint16_t ch = src[index];
1001 result->SeqOneByteStringSet(index, ToLatin1Lower(ch));
1002 }
1003 }
1004
1005 return *result;
1006 }
1007
1008 // Blink had an additional case here for ASCII 2-byte strings, but
1009 // that is subsumed by the above code (assuming there isn't a false
1010 // negative for HasOnlyOneByteChars).
1011
1012 // Do a slower implementation for cases that include non-ASCII characters.
1013 return LocaleConvertCase(s, isolate, false, "");
1014 }
1015
1016 RUNTIME_FUNCTION(Runtime_StringToUpperCaseI18N) {
1017 HandleScope scope(isolate);
1018 DCHECK_EQ(args.length(), 1);
1019 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1020
1021 // This function could be optimized for no-op cases the way lowercase
1022 // counterpart is, but in empirical testing, few actual calls to upper()
1023 // are no-ops. So, it wouldn't be worth the extra time for pre-scanning.
1024
1025 int32_t length = s->length();
1026 s = String::Flatten(s);
1027
1028 if (s->HasOnlyOneByteChars()) {
1029 Handle<SeqOneByteString> result;
1030 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1031 isolate, result, isolate->factory()->NewRawOneByteString(length));
1032
1033 int sharp_s_count;
1034 bool is_result_single_byte;
1035 {
1036 DisallowHeapAllocation no_gc;
1037 String::FlatContent flat = s->GetFlatContent();
1038 // If it was ok to slow down ASCII-only input slightly, ToUpperFastASCII
1039 // could be removed because ToUpperOneByte is pretty fast now (it
1040 // does not call ICU API any more.).
1041 if (flat.IsOneByte()) {
1042 Vector<const uint8_t> src = flat.ToOneByteVector();
1043 if (ToUpperFastASCII(src, result)) return *result;
1044 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1045 } else {
1046 DCHECK(flat.IsTwoByte());
1047 Vector<const uint16_t> src = flat.ToUC16Vector();
1048 if (ToUpperFastASCII(src, result)) return *result;
1049 is_result_single_byte = ToUpperOneByte(src, result, &sharp_s_count);
1050 }
1051 }
1052
1053 // Go to the full Unicode path if there are characters whose uppercase
1054 // is beyond the Latin-1 range (cannot be represented in OneByteString).
1055 if (V8_UNLIKELY(!is_result_single_byte)) {
1056 return LocaleConvertCase(s, isolate, true, "");
1057 }
1058
1059 if (sharp_s_count == 0) return *result;
1060
1061 // We have sharp_s_count sharp-s characters, but the result is still
1062 // in the Latin-1 range.
1063 ASSIGN_RETURN_FAILURE_ON_EXCEPTION(
1064 isolate, result,
1065 isolate->factory()->NewRawOneByteString(length + sharp_s_count));
1066 DisallowHeapAllocation no_gc;
1067 String::FlatContent flat = s->GetFlatContent();
1068 if (flat.IsOneByte()) {
1069 ToUpperWithSharpS(flat.ToOneByteVector(), result);
1070 } else {
1071 ToUpperWithSharpS(flat.ToUC16Vector(), result);
1072 }
1073
1074 return *result;
1075 }
1076
1077 return LocaleConvertCase(s, isolate, true, "");
1078 }
1079
1080 RUNTIME_FUNCTION(Runtime_StringLocaleConvertCase) {
1081 HandleScope scope(isolate);
1082 DCHECK_EQ(args.length(), 3);
1083 CONVERT_ARG_HANDLE_CHECKED(String, s, 0);
1084 CONVERT_BOOLEAN_ARG_CHECKED(is_upper, 1);
1085 CONVERT_ARG_HANDLE_CHECKED(SeqOneByteString, lang, 2);
1086
1087 // All the languages requiring special handling ("az", "el", "lt", "tr")
1088 // have a 2-letter language code.
1089 DCHECK(lang->length() == 2);
1090 uint8_t lang_str[3];
1091 memcpy(lang_str, lang->GetChars(), 2);
1092 lang_str[2] = 0;
1093 s = String::Flatten(s);
1094 // TODO(jshin): Consider adding a fast path for ASCII or Latin-1. The fastpath
1095 // in the root locale needs to be adjusted for az, lt and tr because even case
1096 // mapping of ASCII range characters are different in those locales.
1097 // Greek (el) does not require any adjustment, though.
1098 return LocaleConvertCase(s, isolate, is_upper,
1099 reinterpret_cast<const char*>(lang_str));
1100 }
1101
752 } // namespace internal 1102 } // namespace internal
753 } // namespace v8 1103 } // namespace v8
754 1104
755 #endif // V8_I18N_SUPPORT 1105 #endif // V8_I18N_SUPPORT
OLDNEW
« no previous file with comments | « src/runtime/runtime.h ('k') | src/runtime/runtime-strings.cc » ('j') | no next file with comments »

Powered by Google App Engine
This is Rietveld 408576698